From a02b929b5c8f6ce0d1ab69f11608caf40295e0b3 Mon Sep 17 00:00:00 2001
From: Chaim
Date: Mon, 15 Jun 2026 04:17:07 +0000
Subject: [PATCH] =?UTF-8?q?fix(precedents):=20=D7=A0=D7=A8=D7=9E=D7=95?=
 =?UTF-8?q?=D7=9C=20case=5Fnumber=20=D7=A2=D7=9E=D7=99=D7=93-=D7=9C=D7=94?=
 =?UTF-8?q?=D7=AA=D7=A0=D7=92=D7=A9=D7=95=D7=AA=20=E2=80=94=20=D7=9E=D7=93?=
 =?UTF-8?q?=D7=9C=D7=92=20=D7=95=D7=9E=D7=AA=D7=A2=D7=93,=20=D7=9C=D7=90?=
 =?UTF-8?q?=20=D7=A7=D7=95=D7=A8=D7=A1=20(#145)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ה-backfill של citation_formatted חשף קריסה ב-apply_to_record: כשפסק-דין חיצוני
מכיל docket שכבר שייך לרשומה כפולה אחרת, נרמול case_number → docket-נקי נתקל
ב-uq_case_law_external_number ומפיל את כל המיזוג (כולל הציטוט).
דוגמה: 'ע"א 3213/97' → '3213/97' שכבר קיים (כפילות נקר).

- db.case_number_collides(case_number, exclude_id) — בודק אם docket כבר שייך
  לרשומה לא-internal אחרת (האינדקס החלקי).
- apply_to_record — מדלג על נרמול ה-case_number כשיש התנגשות (כפילות לדדופ
  בהמשך, לא ענייננו כאן) וממשיך לכתוב את הציטוט. no-silent-swallow: מתעד warning.
- scripts/backfill_precedent_citations.py — try/except per-row + מונה שגיאות,
  כך ששורה אחת לא מפילה את האצווה.

אומת: ריצה-מחדש מלאה ללא קריסה (0 שגיאות); ההתנגשות תועדה ודולגה כצפוי;
פסיקת בית-משפט: 224/228 מולאו, 4 נמנעו (חסר צדדים/תאריך — abstention, INV-AH).
test_fu2b_reconcile ✓.

Invariants: INV-AH (abstention) · G1 (נרמול-בכתיבה נשמר, רק לא קורס) ·
חוקה §6 (אין בליעה שקטה — דילוג מתועד).
Co-Authored-By: Claude Opus 4.8 (1M context) --- mcp-server/src/legal_mcp/services/db.py | 14 ++++++++++++++ .../services/precedent_metadata_extractor.py | 16 +++++++++++++++- scripts/backfill_precedent_citations.py | 12 +++++++++--- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 74dcda5..6084a4d 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -4112,6 +4112,20 @@ async def get_case_law_by_citation(case_number: str) -> dict | None: return _row_to_case_law(row) if row else None +async def case_number_collides(case_number: str, exclude_id: UUID) -> bool: + """True if assigning ``case_number`` to a NON-internal row would violate the + partial unique index ``uq_case_law_external_number`` (``case_number`` WHERE + source_kind <> 'internal_committee') — i.e. another non-internal row already owns + that docket. Lets a caller SKIP the identity normalization (a duplicate to dedupe + later) instead of crashing the whole operation on the unique violation.""" + pool = await get_pool() + return bool(await pool.fetchval( + "SELECT 1 FROM case_law WHERE case_number = $1 AND id <> $2 " + "AND source_kind <> 'internal_committee' LIMIT 1", + case_number, exclude_id, + )) + + async def create_external_case_law( case_number: str, case_name: str, diff --git a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py index 9a4c350..21bfc36 100644 --- a/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py +++ b/mcp-server/src/legal_mcp/services/precedent_metadata_extractor.py @@ -364,7 +364,21 @@ async def apply_to_record( and cn_clean != cur_cn and (overwrite_case_number or citation_shaped) ): - fields_to_update["case_number"] = cn_clean + # Skip (don't crash) when the clean docket already belongs to ANOTHER + # non-internal row — a duplicate to 
dedupe later, not this run's concern. + # Writing it would hit uq_case_law_external_number and abort the whole merge + # (including the citation). No-silent-swallow: log the skip. + if ( + record.get("source_kind") != "internal_committee" + and await db.case_number_collides(cn_clean, case_law_id) + ): + logger.warning( + "metadata_extractor: case_number normalization %r→%r skipped — docket " + "already owned by another non-internal row (likely duplicate)", + cur_cn, cn_clean, + ) + else: + fields_to_update["case_number"] = cn_clean # parties — store the extracted "עורר נ' משיב" line (the re-derivable basis for # the deterministic citation). Only fill when empty; chair edits are preserved. diff --git a/scripts/backfill_precedent_citations.py b/scripts/backfill_precedent_citations.py index 3e7b00f..50673d2 100644 --- a/scripts/backfill_precedent_citations.py +++ b/scripts/backfill_precedent_citations.py @@ -67,7 +67,7 @@ async def main() -> None: rows = await _empty_citation_rows(args.limit) print(f"רשומות עם citation_formatted ריק: {len(rows)}\n") - n_pass1 = n_pass2 = n_abstain = 0 + n_pass1 = n_pass2 = n_abstain = n_errors = 0 for r in rows: cid = r["id"] # Pass 1 — deterministic from the stored row (no LLM). @@ -92,7 +92,13 @@ async def main() -> None: print(f" ? [llm?] {r['case_number']} — would run extractor (dry-run)") continue - res = await precedent_metadata_extractor.extract_and_apply(cid) + # One bad row must never abort the batch — log and move on. 
+ try: + res = await precedent_metadata_extractor.extract_and_apply(cid) + except Exception as e: # noqa: BLE001 — best-effort backfill, reported per-row + n_errors += 1 + print(f" ✗ [error] {r['case_number']}: {type(e).__name__}: {e}") + continue record2 = await db.get_case_law(cid) new_cit = (record2.get("citation_formatted") or "").strip() if new_cit: @@ -109,7 +115,7 @@ async def main() -> None: print( f"\nסיכום: דטרמיניסטי={n_pass1} · LLM={n_pass2} · " - f"נמנע (חסר רכיב)={n_abstain}" + f"נמנע (חסר רכיב)={n_abstain} · שגיאות={n_errors}" + ("" if args.apply else " (dry-run — לא נכתב)") ) -- 2.49.1