fix(precedent-library): allow re-extraction for internal_committee rows

The "חלץ מטא-דאטה" / "חלץ הלכות" buttons in the UI were returning 404 for any precedent with `source_kind != 'external_upload'`. The original restriction was meant to keep LLM extraction off internal-committee imports (their metadata supposedly came from the case file system), but the same precedent rows can still need re-extraction when ingest produces broken data — e.g. the corrupted `subject_tags` value `['[','"','ה','י',...]` that motivated this change (an early ingest stored a JSON literal into a TEXT[] column, which Postgres split into single characters).

Two changes here:

1. db.request_metadata_extraction / db.request_halacha_extraction: drop the `AND source_kind='external_upload'` filter. The extractor already preserves user values (it only fills empty fields), so this is safe.
2. precedent_metadata_extractor.extract_and_apply: detect the character-by-character corruption described above and treat it as empty, so that the freshly extracted tags actually replace the broken ones. Heuristic: 3+ elements where every element is at most 2 characters long (legitimate tags are multi-character Hebrew words).

In addition, the diff adds `case_number` to the set of fields that db.update_case_law accepts.

A Coolify deploy is required for the FastAPI container to pick this up.
2026-05-06 19:44:13 +00:00
parent bd4b0ca766
commit afcc4818a4
3 changed files with 33 additions and 10 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1915,7 +1915,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
    precedent_level, is_binding.
    """
    allowed = {
-        "case_name", "court", "date", "practice_area", "appeal_subtype",
+        "case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
        "subject_tags", "summary", "headnote", "key_quote", "source_url",
        "source_type", "precedent_level", "is_binding",
    }
@@ -2466,22 +2466,31 @@ async def precedent_library_stats() -> dict:

 async def request_metadata_extraction(case_law_id: UUID) -> bool:
    """Stamp ``metadata_extraction_requested_at`` for the local MCP worker
-    to pick up. Returns False if the row is missing."""
+    to pick up. Returns False if the row is missing.
+
+    Originally restricted to ``source_kind='external_upload'`` (see git
+    blame). Opened to all source kinds 2026-05-06 — internal_committee
+    rows can also need re-extraction (e.g. corrupted subject_tags from
+    an early ingest pipeline). The extractor itself preserves user
+    values (``precedent_metadata_extractor.extract_and_apply`` only
+    fills empty fields), so this is safe.
+    """
    pool = await get_pool()
    result = await pool.execute(
        "UPDATE case_law SET metadata_extraction_requested_at = now() "
-        "WHERE id = $1 AND source_kind = 'external_upload'",
+        "WHERE id = $1",
        case_law_id,
    )
    return result == "UPDATE 1"


 async def request_halacha_extraction(case_law_id: UUID) -> bool:
-    """Same but for halacha extraction."""
+    """Same but for halacha extraction. See note on
+    :func:`request_metadata_extraction` re: opening to all source kinds."""
    pool = await get_pool()
    result = await pool.execute(
        "UPDATE case_law SET halacha_extraction_requested_at = now() "
-        "WHERE id = $1 AND source_kind = 'external_upload'",
+        "WHERE id = $1",
        case_law_id,
    )
    return result == "UPDATE 1"