"""Backfill missing_precedents / court-fetch from unlinked digests (#136). The digest pipeline used to silently drop an underlying citation it couldn't autolink unless it was a fetchable court verdict — so ערר/בל"מ rulings mentioned in the daily yomon never surfaced as gaps. After the fix, ``try_autolink`` opens a missing_precedent for non-fetchable gaps (and a court-fetch job for fetchable). This re-runs that canonical path over every already-ingested digest that has an ``underlying_citation`` but no ``linked_case_law_id`` — so the historical backlog surfaces too. Reuses ``digest_library.try_autolink`` (one code path, G2): each digest is re-attempted (it may now link to a precedent added since) and, failing that, a deduped gap is opened. Idempotent (dedup designator-aware via citation_norm). Dry-run by default — classifies and counts without writing; ``--apply`` runs the autolink. Host-only. Run: HOME=/home/chaim mcp-server/.venv/bin/python scripts/backfill_digest_missing_precedents.py [--apply] """ from __future__ import annotations import asyncio import os import sys from collections import Counter sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) from legal_mcp.services import court_citation, db, digest_library async def main(apply: bool) -> int: pool = await db.get_pool() rows = await pool.fetch( "SELECT id, underlying_citation, yomon_number FROM digests " "WHERE COALESCE(underlying_citation, '') <> '' " "AND linked_case_law_id IS NULL" ) print(f"unlinked digests with a citation: {len(rows)}") tiers = Counter() for r in rows: tiers[court_citation.classify(r["underlying_citation"]).tier] += 1 print("by tier:", dict(tiers), "→ fetchable(supreme+admin)=%d, gap(skip+unknown)=%d" % (tiers["supreme"] + tiers["admin"], tiers["skip"] + tiers["unknown"])) if not apply: print("\n(dry-run — pass --apply to run autolink: links what it can, opens " "deduped missing_precedents for ערר/unknown, court-fetch for verdicts)") return 0 linked = gaps = 0 for r in rows: before = await db.find_missing_precedent_by_citation(r["underlying_citation"]) result = await digest_library.try_autolink(r["id"], r["underlying_citation"]) if result: linked += 1 elif before is None: # a new gap (court-fetch job or missing_precedent) was opened gaps += 1 print(f"\nlinked now: {linked} new gaps opened: {gaps} " f"(already-present deduped: {len(rows) - linked - gaps})") return 0 if __name__ == "__main__": sys.exit(asyncio.run(main("--apply" in sys.argv)))