Implements chaim's 2026-06-20 directive (5 steps; step 6 deferred): 1. No review queue — HALACHA_NO_REVIEW_QUEUE=true (auto-approve all → background); migration cleared 2,416 pending_review → approved. 2. Verified layer — halachot.verified/cite_count from chair citations (db.refresh_verified_layer + scripts/build_verified_layer.py runs citator on ALL committee decisions). 2,775 verified / 137 precedents. 3. Retrieval ranks verified ≫ background — HALACHA_VERIFIED_BOOST in both semantic + lexical halacha queries; filter now includes background (<> rejected). 5. Disabled destructive panel cap/novelty — HALACHA_PANEL_REGIME_ENABLED=false (8508/1049/1200 proved it lost 22-30 genuine principles incl. Lustrenik). 4. Ingest contract — going-forward already queues metadata; backfill_practice_area.py + 206 re-queued to the metadata drain. Source of truth: docs/precedent-corpus-redesign/00-final-synthesis.md. Quality flags are 97% false-positive (nli-audit) → no longer gate. UI queue removal → Claude Design gate. 429 tests green (no regressions). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
99 lines
3.7 KiB
Python
99 lines
3.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Backfill practice_area for external precedents (#153, step 4 — ingest contract).
|
|
|
|
87% of external court rulings (209/239) lack practice_area, so area-scoped retrieval
|
|
misses them. The classifier infrastructure already exists
|
|
(precedent_metadata_extractor.extract_and_apply → practice_area + metadata); it just
|
|
never ran on these rows. This script runs it on the unclassified rows, throttled by usage_limits.
|
|
|
|
Deterministic shortcut first (derive_domain_practice_area from our case-number scheme,
|
|
free); only rows it can't resolve go to the LLM classifier.
|
|
|
|
cd ~/legal-ai/mcp-server
|
|
HOME=/home/chaim .venv/bin/python ../scripts/backfill_practice_area.py --dry-run
|
|
HOME=/home/chaim .venv/bin/python ../scripts/backfill_practice_area.py --apply
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import asyncio
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "mcp-server", "src"))
|
|
|
|
from legal_mcp.services import db, precedent_metadata_extractor # noqa: E402
|
|
from legal_mcp.services.practice_area import derive_domain_practice_area # noqa: E402
|
|
|
|
try:
|
|
from legal_mcp.services import usage_limits
|
|
except Exception: # pragma: no cover
|
|
usage_limits = None
|
|
|
|
|
|
def _over_ceiling() -> tuple[bool, str]:
    """Report whether the usage ceiling has been reached.

    Returns:
        ``(over, detail)`` taken from ``usage_limits.ceiling_status``.
        ``(False, "")`` when ``usage_limits`` is ``None`` (the module could
        not be imported) or when ``subscription_usage()`` returns ``None``.
    """
    # No module or no usage reading -> treat as "not over" so the caller proceeds.
    usage = usage_limits.subscription_usage() if usage_limits is not None else None
    if usage is None:
        return False, ""
    # ceiling_status yields a 3-tuple; only the first and last items are used here.
    is_over, _unused, detail = usage_limits.ceiling_status(usage)
    return is_over, detail
|
|
|
|
|
|
async def _run(apply: bool, limit: int | None, throttle: bool) -> int:
    """Backfill ``case_law.practice_area`` for unclassified external uploads.

    Selects ``case_law`` rows with ``source_kind='external_upload'``, an empty
    or NULL ``practice_area`` and non-empty ``full_text``, ordered by
    ``created_at``. Each row's case number is first passed to
    ``derive_domain_practice_area``; only rows for which that returns nothing
    are handed to ``precedent_metadata_extractor.extract_and_apply``.

    Args:
        apply: When false (dry-run) nothing is written, the extractor is never
            called and the usage ceiling is not consulted; rows are only
            counted.
        limit: Maximum number of rows to process. ``None`` means all rows;
            ``0`` or a negative value means no rows.
        throttle: When true (and ``apply`` is true), ``_over_ceiling()`` is
            checked before every extractor call and the run stops early once
            it reports the ceiling as reached.

    Returns:
        Always ``0``.
    """
    pool = await db.get_pool()
    rows = await pool.fetch(
        "SELECT id, case_number FROM case_law "
        "WHERE source_kind='external_upload' AND COALESCE(practice_area,'')='' "
        " AND COALESCE(full_text,'')<>'' ORDER BY created_at")
    # `is not None` so that an explicit `--limit 0` means "no rows" instead of
    # silently falling through to "all rows"; negatives are clamped to 0 rather
    # than dropping rows from the tail of the list.
    if limit is not None:
        rows = rows[:max(limit, 0)]
    total = len(rows)
    print(f"[{'APPLY' if apply else 'DRY-RUN'}] {total} unclassified external precedents\n", flush=True)

    det = llm = 0
    stopped = False
    by_area: dict[str, int] = {}
    for n, r in enumerate(rows, 1):
        # 1) deterministic from our case-number scheme (free)
        area = derive_domain_practice_area(r["case_number"] or "")
        if area:
            det += 1
            by_area[area] = by_area.get(area, 0) + 1
            if apply:
                await pool.execute("UPDATE case_law SET practice_area=$2 WHERE id=$1", r["id"], area)
        elif apply:
            # 2) LLM classifier (throttled). The ceiling only matters when a
            # classifier call is about to be made, so dry-runs never check it.
            if throttle:
                over, detail = _over_ceiling()
                if over:
                    print(f"\n⏸ usage ceiling ({detail}) — stopping at {n-1}. Re-run to resume.", flush=True)
                    stopped = True
                    break
            res = await precedent_metadata_extractor.extract_and_apply(r["id"])
            pa = (res or {}).get("practice_area") or ""
            # Only rows for which the extractor reported an area are counted.
            if pa:
                llm += 1
                by_area[pa] = by_area.get(pa, 0) + 1
        else:
            # Dry-run: count the row as one that would go to the classifier.
            llm += 1
        # Progress is reported for every 20th row regardless of which path
        # handled it (no `continue` above that could skip this).
        if n % 20 == 0:
            print(f" …{n}/{total}", flush=True)

    print(f"\n── summary ── deterministic: {det} · LLM: {llm} · by_area: {by_area}"
          f"{' (stopped early)' if stopped else ''}")
    if not apply:
        print("dry-run — nothing written. Re-run with --apply.")
    return 0
|
|
|
|
|
|
def main() -> int:
    """Parse the command line and run the backfill.

    Flags: ``--apply`` (store-true), ``--limit`` (int, default ``None``) and
    ``--no-throttle`` (store-true; inverted before being passed on as
    ``throttle``).

    Returns:
        The value returned by ``_run``.
    """
    parser = argparse.ArgumentParser(
        description="Backfill practice_area for external precedents (#153)",
    )
    parser.add_argument("--apply", action="store_true")
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--no-throttle", action="store_true")
    opts = parser.parse_args()

    # The CLI exposes the negative flag; _run takes the positive one.
    throttle = not opts.no_throttle
    return asyncio.run(_run(opts.apply, opts.limit, throttle))
|
|
|
|
|
|
# Script entry point: exit the process with main()'s return value as the status.
if __name__ == "__main__":
    sys.exit(main())
|