fix(precedents): נרמול case_number עמיד-להתנגשות — מדלג ומתעד, לא קורס (#145) #266

Merged
chaim merged 1 commit from worktree-backfill-citations-run into main 2026-06-15 04:17:39 +00:00
3 changed files with 38 additions and 4 deletions

View File

@@ -4112,6 +4112,20 @@ async def get_case_law_by_citation(case_number: str) -> dict | None:
return _row_to_case_law(row) if row else None
async def case_number_collides(case_number: str, exclude_id: UUID) -> bool:
    """Report whether a different non-internal row already holds ``case_number``.

    The lookup mirrors the partial unique index ``uq_case_law_external_number``
    (``case_number`` WHERE source_kind <> 'internal_committee'): it searches
    ``case_law`` for a row with the same ``case_number``, an ``id`` other than
    ``exclude_id``, and a ``source_kind`` that is not ``'internal_committee'``.

    Callers can use the answer to SKIP an identity normalization (leaving a
    duplicate to be deduplicated later) rather than letting the write fail on
    the unique violation and abort the whole operation.

    Args:
        case_number: The docket value that is about to be assigned.
        exclude_id: Id of the row being updated; it is excluded from the search
            so a row never collides with itself.

    Returns:
        True when the query yields a truthy value (a conflicting row was
        found), False otherwise.
    """
    collision_query = (
        "SELECT 1 FROM case_law WHERE case_number = $1 AND id <> $2 "
        "AND source_kind <> 'internal_committee' LIMIT 1"
    )
    db_pool = await get_pool()
    hit = await db_pool.fetchval(collision_query, case_number, exclude_id)
    # Collapse the fetched scalar to a strict boolean for the caller.
    return True if hit else False
async def create_external_case_law(
case_number: str,
case_name: str,

View File

@@ -364,7 +364,21 @@ async def apply_to_record(
and cn_clean != cur_cn
and (overwrite_case_number or citation_shaped)
):
fields_to_update["case_number"] = cn_clean
# Skip (don't crash) when the clean docket already belongs to ANOTHER
# non-internal row — a duplicate to dedupe later, not this run's concern.
# Writing it would hit uq_case_law_external_number and abort the whole merge
# (including the citation). No-silent-swallow: log the skip.
if (
record.get("source_kind") != "internal_committee"
and await db.case_number_collides(cn_clean, case_law_id)
):
logger.warning(
"metadata_extractor: case_number normalization %r%r skipped — docket "
"already owned by another non-internal row (likely duplicate)",
cur_cn, cn_clean,
)
else:
fields_to_update["case_number"] = cn_clean
# parties — store the extracted "עורר נ' משיב" line (the re-derivable basis for
# the deterministic citation). Only fill when empty; chair edits are preserved.

View File

@@ -67,7 +67,7 @@ async def main() -> None:
rows = await _empty_citation_rows(args.limit)
print(f"רשומות עם citation_formatted ריק: {len(rows)}\n")
n_pass1 = n_pass2 = n_abstain = 0
n_pass1 = n_pass2 = n_abstain = n_errors = 0
for r in rows:
cid = r["id"]
# Pass 1 — deterministic from the stored row (no LLM).
@@ -92,7 +92,13 @@ async def main() -> None:
print(f" ? [llm?] {r['case_number']} — would run extractor (dry-run)")
continue
res = await precedent_metadata_extractor.extract_and_apply(cid)
# One bad row must never abort the batch — log and move on.
try:
res = await precedent_metadata_extractor.extract_and_apply(cid)
except Exception as e: # noqa: BLE001 — best-effort backfill, reported per-row
n_errors += 1
print(f" ✗ [error] {r['case_number']}: {type(e).__name__}: {e}")
continue
record2 = await db.get_case_law(cid)
new_cit = (record2.get("citation_formatted") or "").strip()
if new_cit:
@@ -109,7 +115,7 @@ async def main() -> None:
print(
f"\nסיכום: דטרמיניסטי={n_pass1} · LLM={n_pass2} · "
f"נמנע (חסר רכיב)={n_abstain}"
f"נמנע (חסר רכיב)={n_abstain} · שגיאות={n_errors}"
+ ("" if args.apply else " (dry-run — לא נכתב)")
)