diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py
index 8e23da4..ab69227 100644
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1915,7 +1915,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
     precedent_level, is_binding.
     """
     allowed = {
-        "case_name", "court", "date", "practice_area", "appeal_subtype",
+        "case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
         "subject_tags", "summary", "headnote", "key_quote", "source_url",
         "source_type", "precedent_level", "is_binding",
     }
@@ -2466,22 +2466,31 @@ async def precedent_library_stats() -> dict:
 
 async def request_metadata_extraction(case_law_id: UUID) -> bool:
     """Stamp ``metadata_extraction_requested_at`` for the local MCP worker
-    to pick up. Returns False if the row is missing."""
+    to pick up. Returns False if the row is missing.
+
+    Originally restricted to ``source_kind='external_upload'`` (see git
+    blame). Opened to all source kinds 2026-05-06 — internal_committee
+    rows can also need re-extraction (e.g. corrupted subject_tags from
+    an early ingest pipeline). This is safe: the extractor preserves user
+    values (``precedent_metadata_extractor.extract_and_apply`` only fills
+    empty fields; ``apply_to_record`` also replaces corrupt ``subject_tags``).
+    """
     pool = await get_pool()
     result = await pool.execute(
         "UPDATE case_law SET metadata_extraction_requested_at = now() "
-        "WHERE id = $1 AND source_kind = 'external_upload'",
+        "WHERE id = $1",
         case_law_id,
     )
     return result == "UPDATE 1"
 
 
 async def request_halacha_extraction(case_law_id: UUID) -> bool:
-    """Same but for halacha extraction."""
+    """Same but for halacha extraction. See note on
+    :func:`request_metadata_extraction` re: opening to all source kinds."""
     pool = await get_pool()
     result = await pool.execute(
         "UPDATE case_law SET halacha_extraction_requested_at = now() "
-        "WHERE id = $1 AND source_kind = 'external_upload'",
+        "WHERE id = $1",
         case_law_id,
     )
     return result == "UPDATE 1"
diff --git a/mcp-server/src/legal_mcp/services/precedent_library.py b/mcp-server/src/legal_mcp/services/precedent_library.py
index de4a1c2..b419a21 100644
--- a/mcp-server/src/legal_mcp/services/precedent_library.py
+++ b/mcp-server/src/legal_mcp/services/precedent_library.py
@@ -257,8 +257,11 @@ async def reextract_halachot(
         case_law_id = UUID(case_law_id)
 
     record = await db.get_case_law(case_law_id)
-    if not record or record.get("source_kind") != "external_upload":
-        raise ValueError("precedent not found or not chair-uploaded")
+    if not record:
+        raise ValueError("precedent not found")
+    # Was restricted to source_kind='external_upload'; opened 2026-05-06 so
+    # internal_committee rows can also be re-extracted when ingest produced
+    # bad data. See note in db.request_metadata_extraction.
 
     await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
     result = await halacha_extractor.extract(case_law_id)
@@ -402,8 +405,9 @@ async def reextract_metadata(
         case_law_id = UUID(case_law_id)
 
     record = await db.get_case_law(case_law_id)
-    if not record or record.get("source_kind") != "external_upload":
-        raise ValueError("precedent not found or not chair-uploaded")
+    if not record:
+        raise ValueError("precedent not found")
+    # See note in db.request_metadata_extraction — opened to all source kinds.
 
     await progress("extracting_metadata", 40, "מחלץ מטא-דאטה (תקציר, תגיות)")
     result = await precedent_metadata_extractor.extract_and_apply(case_law_id)
diff --git a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
index 2d844c6..89757de 100644
--- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
+++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py
@@ -223,7 +223,17 @@ async def apply_to_record(
             fields_to_update["key_quote"] = s
 
     cur_tags = record.get("subject_tags") or []
-    if not cur_tags:
+    # Treat character-by-character corruption as empty. Early ingest
+    # pipelines stored a JSON string (`'["היטל השבחה"]'`) into a TEXT[]
+    # column, which Postgres split into individual chars:
+    # `['[', '"', 'ה', 'י', 'ט', 'ל', ' ', 'ה', 'ש', ...]`. Detection:
+    # 3+ elements where every element is at most 2 chars (legitimate
+    # tags are multi-character Hebrew words like `היטל_השבחה`).
+    is_corrupt = (
+        len(cur_tags) >= 3
+        and all(isinstance(t, str) and len(t) <= 2 for t in cur_tags)
+    )
+    if not cur_tags or is_corrupt:
         sug_tags = suggested.get("subject_tags") or []
         if sug_tags:
             fields_to_update["subject_tags"] = sug_tags