"""Backfill citation_formatted (מראה מקום) on case_law rows that lack it. Why this exists: a Flash model was asked to *format* the full citation and dropped the field outright on every run (#145). citation_formatted is now a DERIVED display field assembled deterministically (db.format_precedent_citation, X1 §3 / INV-ID2) from structured components. This script applies that derivation to the existing corpus. Two-pass per row (cheapest first, INV-AH abstention throughout — never invents): 1. NO-LLM: try db.format_precedent_citation on the STORED row. Fills committee rows that already have parties + docket + date (e.g. once parties were captured). No API cost. 2. LLM: if pass 1 abstains and the row has full_text, run the metadata extractor (extract_and_apply) — it extracts the COMPONENTS (parties, citation_prefix) and assembles the citation. This is what fills the 171 court rulings whose captions carry the parties+prefix. Rows where even the LLM can't recover a component (no rubric → no parties, e.g. our own caption-stripped internal decisions) are left empty and LOGGED — not back-filled with a guess (חוקה §6 — אין בליעה שקטה; the chair fills those by hand in /precedents/[id]). Idempotent (G3): only ever fills an EMPTY citation_formatted; re-running skips rows that already have one. Run (dry-run, default — reports what each pass WOULD do, writes nothing): HOME=/home/chaim mcp-server/.venv/bin/python scripts/backfill_precedent_citations.py Apply: HOME=/home/chaim mcp-server/.venv/bin/python scripts/backfill_precedent_citations.py --apply Options: --limit N process at most N empty-citation rows --no-llm pass-1 only (deterministic from stored fields; zero API cost) """ from __future__ import annotations import argparse import asyncio import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) from legal_mcp.services import db, precedent_metadata_extractor # noqa: E402 async def _empty_citation_rows(limit: int | None) -> list[dict]: pool = await db.get_pool() sql = ( "SELECT id, case_number, source_kind, source_type, precedent_level, " " (full_text IS NOT NULL AND length(full_text) > 200) AS has_text " "FROM case_law WHERE COALESCE(citation_formatted, '') = '' " "ORDER BY created_at" ) if limit: sql += f" LIMIT {int(limit)}" rows = await pool.fetch(sql) return [dict(r) for r in rows] async def main() -> None: ap = argparse.ArgumentParser() ap.add_argument("--apply", action="store_true", help="write changes (default: dry-run)") ap.add_argument("--limit", type=int, default=None) ap.add_argument("--no-llm", action="store_true", help="deterministic pass only (no API)") args = ap.parse_args() rows = await _empty_citation_rows(args.limit) print(f"רשומות עם citation_formatted ריק: {len(rows)}\n") n_pass1 = n_pass2 = n_abstain = n_errors = 0 for r in rows: cid = r["id"] # Pass 1 — deterministic from the stored row (no LLM). record = await db.get_case_law(cid) cit = db.format_precedent_citation(record) if cit: n_pass1 += 1 print(f" ✓ [det] {r['case_number']}: {cit}") if args.apply: await db.update_case_law(cid, citation_formatted=cit) await db.recompute_searchable(cid) continue # Pass 2 — extract components via the LLM, then assemble. if args.no_llm or not r["has_text"]: n_abstain += 1 why = "no full_text" if not r["has_text"] else "no-llm" print(f" · [skip:{why}] {r['case_number']} ({r['precedent_level'] or '—'})") continue if not args.apply: print(f" ? [llm?] {r['case_number']} — would run extractor (dry-run)") continue # One bad row must never abort the batch — log and move on. try: res = await precedent_metadata_extractor.extract_and_apply(cid) except Exception as e: # noqa: BLE001 — best-effort backfill, reported per-row n_errors += 1 print(f" ✗ [error] {r['case_number']}: {type(e).__name__}: {e}") continue record2 = await db.get_case_law(cid) new_cit = (record2.get("citation_formatted") or "").strip() if new_cit: n_pass2 += 1 print(f" ✓ [llm] {r['case_number']}: {new_cit}") else: n_abstain += 1 parties = (record2.get("parties") or "").strip() print( f" · [abstain] {r['case_number']} ({r['precedent_level'] or '—'}) — " f"{'no parties in text' if not parties else 'missing component'} " f"[extractor:{res.get('status')}]" ) print( f"\nסיכום: דטרמיניסטי={n_pass1} · LLM={n_pass2} · " f"נמנע (חסר רכיב)={n_abstain} · שגיאות={n_errors}" + ("" if args.apply else " (dry-run — לא נכתב)") ) if __name__ == "__main__": asyncio.run(main())