fix(precedent-library): allow re-extraction for internal_committee rows
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 3m13s
All checks were successful
Build & Deploy / build-and-deploy (push) Successful in 3m13s
The "חלץ מטא-דאטה" / "חלץ הלכות" buttons in the UI were returning 404 for any precedent with `source_kind != 'external_upload'`.

The original restriction was meant to keep LLM extraction off internal-committee imports (their metadata supposedly came from the case file system), but the same precedent rows can still need re-extraction when ingest produces broken data — e.g. the corrupted `subject_tags` value `['[','"','ה','י',...]` that motivated this change (an early ingest stored a JSON literal into a TEXT[] column, which Postgres split into single chars).

Two changes here:

1. db.request_metadata_extraction / request_halacha_extraction: drop the `AND source_kind='external_upload'` filter. The extractor already preserves user values (only fills empty fields), so this is safe.

2. precedent_metadata_extractor.extract_and_apply: detect the character-by-character corruption above and treat it as empty so the freshly-extracted tags actually replace the broken ones (in the diff below, the check itself sits in the `apply_to_record` hunk). Heuristic: 3+ elements where every element is at most 2 chars (legitimate tags are multi-character Hebrew words).

The diff also drops the matching `source_kind` guard from the `reextract_halachot` / `reextract_metadata` handlers (their error message becomes "precedent not found") and adds `case_number` to the `allowed` field set in `update_case_law`.

Coolify deploy required for the FastAPI container to pick this up.
This commit is contained in:
@@ -1915,7 +1915,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
|
|||||||
precedent_level, is_binding.
|
precedent_level, is_binding.
|
||||||
"""
|
"""
|
||||||
allowed = {
|
allowed = {
|
||||||
"case_name", "court", "date", "practice_area", "appeal_subtype",
|
"case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
|
||||||
"subject_tags", "summary", "headnote", "key_quote", "source_url",
|
"subject_tags", "summary", "headnote", "key_quote", "source_url",
|
||||||
"source_type", "precedent_level", "is_binding",
|
"source_type", "precedent_level", "is_binding",
|
||||||
}
|
}
|
||||||
@@ -2466,22 +2466,31 @@ async def precedent_library_stats() -> dict:
|
|||||||
|
|
||||||
async def request_metadata_extraction(case_law_id: UUID) -> bool:
|
async def request_metadata_extraction(case_law_id: UUID) -> bool:
|
||||||
"""Stamp ``metadata_extraction_requested_at`` for the local MCP worker
|
"""Stamp ``metadata_extraction_requested_at`` for the local MCP worker
|
||||||
to pick up. Returns False if the row is missing."""
|
to pick up. Returns False if the row is missing.
|
||||||
|
|
||||||
|
Originally restricted to ``source_kind='external_upload'`` (see git
|
||||||
|
blame). Opened to all source kinds 2026-05-06 — internal_committee
|
||||||
|
rows can also need re-extraction (e.g. corrupted subject_tags from
|
||||||
|
an early ingest pipeline). The extractor itself preserves user
|
||||||
|
values (``precedent_metadata_extractor.extract_and_apply`` only
|
||||||
|
fills empty fields), so this is safe.
|
||||||
|
"""
|
||||||
pool = await get_pool()
|
pool = await get_pool()
|
||||||
result = await pool.execute(
|
result = await pool.execute(
|
||||||
"UPDATE case_law SET metadata_extraction_requested_at = now() "
|
"UPDATE case_law SET metadata_extraction_requested_at = now() "
|
||||||
"WHERE id = $1 AND source_kind = 'external_upload'",
|
"WHERE id = $1",
|
||||||
case_law_id,
|
case_law_id,
|
||||||
)
|
)
|
||||||
return result == "UPDATE 1"
|
return result == "UPDATE 1"
|
||||||
|
|
||||||
|
|
||||||
async def request_halacha_extraction(case_law_id: UUID) -> bool:
|
async def request_halacha_extraction(case_law_id: UUID) -> bool:
|
||||||
"""Same but for halacha extraction."""
|
"""Same but for halacha extraction. See note on
|
||||||
|
:func:`request_metadata_extraction` re: opening to all source kinds."""
|
||||||
pool = await get_pool()
|
pool = await get_pool()
|
||||||
result = await pool.execute(
|
result = await pool.execute(
|
||||||
"UPDATE case_law SET halacha_extraction_requested_at = now() "
|
"UPDATE case_law SET halacha_extraction_requested_at = now() "
|
||||||
"WHERE id = $1 AND source_kind = 'external_upload'",
|
"WHERE id = $1",
|
||||||
case_law_id,
|
case_law_id,
|
||||||
)
|
)
|
||||||
return result == "UPDATE 1"
|
return result == "UPDATE 1"
|
||||||
|
|||||||
@@ -257,8 +257,11 @@ async def reextract_halachot(
|
|||||||
case_law_id = UUID(case_law_id)
|
case_law_id = UUID(case_law_id)
|
||||||
|
|
||||||
record = await db.get_case_law(case_law_id)
|
record = await db.get_case_law(case_law_id)
|
||||||
if not record or record.get("source_kind") != "external_upload":
|
if not record:
|
||||||
raise ValueError("precedent not found or not chair-uploaded")
|
raise ValueError("precedent not found")
|
||||||
|
# Was restricted to source_kind='external_upload'; opened 2026-05-06 so
|
||||||
|
# internal_committee rows can also be re-extracted when ingest produced
|
||||||
|
# bad data. See note in db.request_metadata_extraction.
|
||||||
|
|
||||||
await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
|
await progress("extracting_halachot", 50, "מחלץ הלכות מחדש")
|
||||||
result = await halacha_extractor.extract(case_law_id)
|
result = await halacha_extractor.extract(case_law_id)
|
||||||
@@ -402,8 +405,9 @@ async def reextract_metadata(
|
|||||||
case_law_id = UUID(case_law_id)
|
case_law_id = UUID(case_law_id)
|
||||||
|
|
||||||
record = await db.get_case_law(case_law_id)
|
record = await db.get_case_law(case_law_id)
|
||||||
if not record or record.get("source_kind") != "external_upload":
|
if not record:
|
||||||
raise ValueError("precedent not found or not chair-uploaded")
|
raise ValueError("precedent not found")
|
||||||
|
# See note in db.request_metadata_extraction — opened to all source kinds.
|
||||||
|
|
||||||
await progress("extracting_metadata", 40, "מחלץ מטא-דאטה (תקציר, תגיות)")
|
await progress("extracting_metadata", 40, "מחלץ מטא-דאטה (תקציר, תגיות)")
|
||||||
result = await precedent_metadata_extractor.extract_and_apply(case_law_id)
|
result = await precedent_metadata_extractor.extract_and_apply(case_law_id)
|
||||||
|
|||||||
@@ -223,7 +223,17 @@ async def apply_to_record(
|
|||||||
fields_to_update["key_quote"] = s
|
fields_to_update["key_quote"] = s
|
||||||
|
|
||||||
cur_tags = record.get("subject_tags") or []
|
cur_tags = record.get("subject_tags") or []
|
||||||
if not cur_tags:
|
# Treat character-by-character corruption as empty. Early ingest
|
||||||
|
# pipelines stored a JSON string (`'["היטל השבחה"]'`) into a TEXT[]
|
||||||
|
# column, which Postgres split into individual chars:
|
||||||
|
# `['[', '"', 'ה', 'י', 'ט', 'ל', ' ', 'ה', 'ש', ...]`. Detection:
|
||||||
|
# 3+ elements where every element is at most 2 chars (legitimate
|
||||||
|
# tags are multi-character Hebrew words like `היטל_השבחה`).
|
||||||
|
is_corrupt = (
|
||||||
|
len(cur_tags) >= 3
|
||||||
|
and all(isinstance(t, str) and len(t) <= 2 for t in cur_tags)
|
||||||
|
)
|
||||||
|
if not cur_tags or is_corrupt:
|
||||||
sug_tags = suggested.get("subject_tags") or []
|
sug_tags = suggested.get("subject_tags") or []
|
||||||
if sug_tags:
|
if sug_tags:
|
||||||
fields_to_update["subject_tags"] = sug_tags
|
fields_to_update["subject_tags"] = sug_tags
|
||||||
|
|||||||
Reference in New Issue
Block a user