ה-backfill של citation_formatted חשף קריסה ב-apply_to_record: כשפסק-דין חיצוני מכיל docket שכבר שייך לרשומה כפולה אחרת, נרמול case_number → docket-נקי נתקל ב-uq_case_law_external_number ומפיל את כל המיזוג (כולל הציטוט). דוגמה: 'ע"א 3213/97' → '3213/97' שכבר קיים (כפילות נקר). - db.case_number_collides(case_number, exclude_id) — בודק אם docket כבר שייך לרשומה לא-internal אחרת (האינדקס החלקי). - apply_to_record — מדלג על נרמול ה-case_number כשיש התנגשות (כפילות לדדופ בהמשך, לא ענייננו כאן) וממשיך לכתוב את הציטוט. no-silent-swallow: מתעד warning. - scripts/backfill_precedent_citations.py — try/except per-row + מונה שגיאות, כך ששורה אחת לא מפילה את האצווה. אומת: ריצה-מחדש מלאה ללא קריסה (0 שגיאות); ההתנגשות תועדה ודולגה כצפוי; פסיקת בית-משפט: 224/228 מולאו, 4 נמנעו (חסר צדדים/תאריך — abstention, INV-AH). test_fu2b_reconcile ✓. Invariants: INV-AH (abstention) · G1 (נרמול-בכתיבה נשמר, רק לא קורס) · חוקה §6 (אין בליעה שקטה — דילוג מתועד). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
125 lines
5.1 KiB
Python
125 lines
5.1 KiB
Python
"""Backfill citation_formatted (מראה מקום) on case_law rows that lack it.

Why this exists: a Flash model was asked to *format* the full citation and dropped
the field outright on every run (#145). citation_formatted is now a DERIVED display
field assembled deterministically (db.format_precedent_citation, X1 §3 / INV-ID2) from
structured components. This script applies that derivation to the existing corpus.

Two-pass per row (cheapest first, INV-AH abstention throughout — never invents):

  1. NO-LLM: try db.format_precedent_citation on the STORED row. Fills committee rows
     that already have parties + docket + date (e.g. once parties were captured). No
     API cost.
  2. LLM: if pass 1 abstains and the row has full_text, run the metadata extractor
     (extract_and_apply) — it extracts the COMPONENTS (parties, citation_prefix) and
     assembles the citation. This is what fills the 171 court rulings whose captions
     carry the parties+prefix.

Rows where even the LLM can't recover a component (no rubric → no parties, e.g. our own
caption-stripped internal decisions) are left empty and LOGGED — not back-filled with a
guess (חוקה §6 — אין בליעה שקטה [constitution §6 — no silent swallowing]; the chair
fills those by hand in /precedents/[id]).

Idempotent (G3): only ever fills an EMPTY citation_formatted; re-running skips rows that
already have one.

Run (dry-run, default — reports what each pass WOULD do, writes nothing):
  HOME=/home/chaim mcp-server/.venv/bin/python scripts/backfill_precedent_citations.py
Apply:
  HOME=/home/chaim mcp-server/.venv/bin/python scripts/backfill_precedent_citations.py --apply
Options:
  --limit N   process at most N empty-citation rows
  --no-llm    pass-1 only (deterministic from stored fields; zero API cost)
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src"))
|
|
|
|
from legal_mcp.services import db, precedent_metadata_extractor # noqa: E402
|
|
|
|
|
|
async def _empty_citation_rows(limit: int | None) -> list[dict]:
    """Fetch the case_law rows whose citation_formatted is NULL or empty.

    Args:
        limit: Maximum number of rows to return. ``None`` means no cap;
            ``0`` means "no rows" (it is NOT treated as "unlimited", so
            ``--limit 0`` can never fan out to the whole corpus).

    Returns:
        One plain dict per row, ordered by ``created_at``, with the keys
        ``id``, ``case_number``, ``source_kind``, ``source_type``,
        ``precedent_level`` and ``has_text`` (true when ``full_text`` is
        non-NULL and ``length(full_text) > 200``).
    """
    pool = await db.get_pool()
    sql = (
        "SELECT id, case_number, source_kind, source_type, precedent_level, "
        " (full_text IS NOT NULL AND length(full_text) > 200) AS has_text "
        "FROM case_law WHERE COALESCE(citation_formatted, '') = '' "
        "ORDER BY created_at"
    )
    # Compare against None rather than truthiness: a limit of 0 is a real cap.
    if limit is not None:
        # int() ensures only an integer literal is ever spliced into the SQL.
        sql += f" LIMIT {int(limit)}"
    rows = await pool.fetch(sql)
    return [dict(r) for r in rows]
|
|
|
|
|
|
async def _backfill_row(row: dict, *, apply: bool, use_llm: bool) -> str:
    """Run the two-pass backfill for a single row and print its progress line.

    Args:
        row: A dict with at least ``id``, ``case_number``, ``precedent_level``
            and ``has_text`` (as produced by ``_empty_citation_rows``).
        apply: When false (dry-run) nothing is written and the extractor is
            never invoked.
        use_llm: When false, pass 2 (the metadata extractor) is skipped.

    Returns:
        The outcome tag for the row: ``"det"`` (pass 1 produced a citation),
        ``"llm"`` (the extractor run left a citation on the row),
        ``"abstain"`` (no citation could be produced / pass 2 skipped), or
        ``"pending"`` (dry-run only: the extractor WOULD have been run).

    Raises:
        Whatever the db / extractor calls raise; the caller is responsible
        for isolating failures per row.
    """
    cid = row["id"]
    label = row["case_number"]

    # Pass 1 — deterministic from the stored row (no LLM).
    stored = await db.get_case_law(cid)
    citation = db.format_precedent_citation(stored)
    if citation:
        if apply:
            await db.update_case_law(cid, citation_formatted=citation)
            await db.recompute_searchable(cid)
        # Report success only after the writes (if any) went through, so a
        # failed write is shown as an error and not as a filled citation.
        print(f" ✓ [det] {label}: {citation}")
        return "det"

    # Pass 2 — extract components via the LLM, then assemble.
    if not use_llm or not row["has_text"]:
        why = "no full_text" if not row["has_text"] else "no-llm"
        print(f" · [skip:{why}] {label} ({row['precedent_level'] or '—'})")
        return "abstain"

    if not apply:
        print(f" ? [llm?] {label} — would run extractor (dry-run)")
        return "pending"

    res = await precedent_metadata_extractor.extract_and_apply(cid)
    # Re-read the row: the extractor applies its result to the stored record.
    refreshed = await db.get_case_law(cid)
    new_cit = (refreshed.get("citation_formatted") or "").strip()
    if new_cit:
        print(f" ✓ [llm] {label}: {new_cit}")
        return "llm"

    parties = (refreshed.get("parties") or "").strip()
    print(
        f" · [abstain] {label} ({row['precedent_level'] or '—'}) — "
        f"{'no parties in text' if not parties else 'missing component'} "
        f"[extractor:{res.get('status')}]"
    )
    return "abstain"


async def main() -> None:
    """Command-line entry point: backfill citation_formatted on empty rows.

    Parses ``--apply`` / ``--limit`` / ``--no-llm``, processes every row that
    lacks a citation, prints one line per row and a final summary. Without
    ``--apply`` this is a dry-run that writes nothing.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--apply", action="store_true", help="write changes (default: dry-run)")
    ap.add_argument("--limit", type=int, default=None)
    ap.add_argument("--no-llm", action="store_true", help="deterministic pass only (no API)")
    args = ap.parse_args()

    rows = await _empty_citation_rows(args.limit)
    print(f"רשומות עם citation_formatted ריק: {len(rows)}\n")

    tally = {"det": 0, "llm": 0, "abstain": 0, "pending": 0, "error": 0}
    for row in rows:
        # One bad row must never abort the batch — log and move on. The guard
        # covers the WHOLE row (pass-1 reads/writes as well as the extractor),
        # not only the LLM call.
        try:
            outcome = await _backfill_row(
                row, apply=args.apply, use_llm=not args.no_llm
            )
        except Exception as e:  # noqa: BLE001 — best-effort backfill, reported per-row
            tally["error"] += 1
            print(f" ✗ [error] {row['case_number']}: {type(e).__name__}: {e}")
        else:
            tally[outcome] += 1

    # In dry-run the rows that would go to the extractor are reported too, so
    # the summary accounts for every row that was listed.
    dry_run_note = (
        "" if args.apply else f" · llm?={tally['pending']} (dry-run — לא נכתב)"
    )
    print(
        f"\nסיכום: דטרמיניסטי={tally['det']} · LLM={tally['llm']} · "
        f"נמנע (חסר רכיב)={tally['abstain']} · שגיאות={tally['error']}"
        + dry_run_note
    )
|
|
|
|
|
|
if __name__ == "__main__":
    # Executed as a script (not imported): drive the async entry point to
    # completion via asyncio.run.
    backfill = main()
    asyncio.run(backfill)
|