"""Derive missing_precedents 'open' gaps from cited_only stubs (#143, G2). Two parallel systems described the same concept — "a cited precedent whose text isn't in the corpus": the ``missing_precedents`` queue (the chair's acquisition list) and ``case_law`` rows with ``source_kind='cited_only'`` (citation-only stubs seeded by the X11 / corpus-graph). Overlap was ~0, so the 31 cited_only stubs never surfaced on /missing-precedents. This makes ``missing_precedents`` the single source-of-truth FOR THE QUEUE and ``cited_only`` a DERIVED discovery source (like digests feed the radar): 1. Backfill ``citation_norm`` (designator-aware dedup key) for every existing missing_precedent — required before the dedup below can match. 2. For each cited_only stub, derive an 'open' missing_precedent (deduped on citation_norm), with ``discovery_source='cited_only'``, ``linked_case_law_id`` = the stub (its canonical identity is known; status stays 'open' until the text is uploaded → promote-in-place), and notes listing the precedents that cite it. Idempotent / re-runnable. Dry-run by default; ``--apply`` to write. Host-only. Run: HOME=/home/chaim mcp-server/.venv/bin/python scripts/derive_missing_from_cited_only.py [--apply] """ from __future__ import annotations import asyncio import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) from legal_mcp.services import court_citation, db async def _backfill_citation_norm(pool, apply: bool) -> int: rows = await pool.fetch( "SELECT id, citation FROM missing_precedents " "WHERE COALESCE(citation_norm, '') = ''" ) n = 0 for r in rows: norm = court_citation.citation_dedup_key(r["citation"] or "") if not norm: continue if apply: await pool.execute( "UPDATE missing_precedents SET citation_norm = $2 WHERE id = $1", r["id"], norm, ) n += 1 return n async def _citing_precedents_note(pool, stub_id) -> str: rows = await pool.fetch( """SELECT DISTINCT cl.case_number FROM precedent_internal_citations p JOIN case_law cl ON cl.id = p.source_case_law_id WHERE p.cited_case_law_id = $1 AND COALESCE(cl.case_number,'') <> '' ORDER BY cl.case_number LIMIT 8""", stub_id, ) citers = [r["case_number"] for r in rows] base = "נגזר מ-cited_only (גרף-הציטוטים)" if citers: return f"{base}; מצוטט ע\"י: {', '.join(citers)}" return base async def main(apply: bool) -> int: pool = await db.get_pool() backfilled = await _backfill_citation_norm(pool, apply) print(f"citation_norm backfill (existing rows){'' if apply else ' [dry]'}: {backfilled}") stubs = await pool.fetch( "SELECT id, case_number, case_name FROM case_law " "WHERE source_kind = 'cited_only' ORDER BY case_number" ) print(f"cited_only stubs: {len(stubs)}") created = 0 skipped = 0 for s in stubs: citation = (s["case_number"] or "").strip() if not citation: print(f" SKIP (no case_number) id={s['id']}") continue existing = await db.find_missing_precedent_by_citation(citation) if existing: skipped += 1 continue norm = court_citation.citation_dedup_key(citation) print(f" + {citation:<22} norm={norm!r} name={(s['case_name'] or '')[:24]!r}") if apply: note = await _citing_precedents_note(pool, s["id"]) await db.create_missing_precedent( citation=citation, case_name=s["case_name"] or None, discovery_source="cited_only", linked_case_law_id=s["id"], notes=note, ) created += 1 print(f"\n{'created' if apply else 'would create'}: {created} already-present (deduped): {skipped}") if not apply: print("(dry-run — pass --apply to write)") return 0 if __name__ == "__main__": sys.exit(asyncio.run(main("--apply" in sys.argv)))