#!/usr/bin/env python3 """Backfill practice_area for external precedents (#153, step 4 — ingest contract). 87% of external court rulings (209/239) lack practice_area, so area-scoped retrieval misses them. The classifier infrastructure already exists (precedent_metadata_extractor.extract_and_apply → practice_area + metadata); it just never ran on these rows. This runs it on the unclassified, throttled by usage_limits. Deterministic shortcut first (derive_domain_practice_area from our case-number scheme, free); only rows it can't resolve go to the LLM classifier. cd ~/legal-ai/mcp-server HOME=/home/chaim .venv/bin/python ../scripts/backfill_practice_area.py --dry-run HOME=/home/chaim .venv/bin/python ../scripts/backfill_practice_area.py --apply """ from __future__ import annotations import argparse import asyncio import os import sys sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src")) from legal_mcp.services import db, precedent_metadata_extractor # noqa: E402 from legal_mcp.services.practice_area import derive_domain_practice_area # noqa: E402 try: from legal_mcp.services import usage_limits except Exception: # pragma: no cover usage_limits = None def _over_ceiling() -> tuple[bool, str]: if usage_limits is None: return False, "" u = usage_limits.subscription_usage() if u is None: return False, "" over, _r, detail = usage_limits.ceiling_status(u) return over, detail async def _run(apply: bool, limit: int | None, throttle: bool) -> int: pool = await db.get_pool() rows = await pool.fetch( "SELECT id, case_number FROM case_law " "WHERE source_kind='external_upload' AND COALESCE(practice_area,'')='' " " AND COALESCE(full_text,'')<>'' ORDER BY created_at") if limit: rows = rows[:limit] print(f"[{'APPLY' if apply else 'DRY-RUN'}] {len(rows)} unclassified external precedents\n", flush=True) det = llm = stopped = 0 by_area: dict[str, int] = {} for n, r in enumerate(rows, 1): # 1) deterministic from our case-number scheme (free) area = derive_domain_practice_area(r["case_number"] or "") if area: det += 1 by_area[area] = by_area.get(area, 0) + 1 if apply: await pool.execute("UPDATE case_law SET practice_area=$2 WHERE id=$1", r["id"], area) continue # 2) LLM classifier (throttled) if throttle: over, detail = _over_ceiling() if over: print(f"\n⏸ usage ceiling ({detail}) — stopping at {n-1}. Re-run to resume.", flush=True) stopped = 1 break if apply: res = await precedent_metadata_extractor.extract_and_apply(r["id"]) pa = (res or {}).get("practice_area") or "" if pa: llm += 1 by_area[pa] = by_area.get(pa, 0) + 1 else: llm += 1 if n % 20 == 0: print(f" …{n}/{len(rows)}", flush=True) print(f"\n── summary ── deterministic: {det} · LLM: {llm} · by_area: {by_area}" f"{' (stopped early)' if stopped else ''}") if not apply: print("dry-run — nothing written. Re-run with --apply.") return 0 def main() -> int: p = argparse.ArgumentParser(description="Backfill practice_area for external precedents (#153)") p.add_argument("--apply", action="store_true") p.add_argument("--limit", type=int, default=None) p.add_argument("--no-throttle", action="store_true") a = p.parse_args() return asyncio.run(_run(a.apply, a.limit, not a.no_throttle)) if __name__ == "__main__": raise SystemExit(main())