#!/usr/bin/env python3
"""#86.3 — benchmark halacha-extraction quality against Nevo's מיני-רציו gold-set.

Nevo's editorial מיני-רציו is a free, professionally-written list of a ruling's
holdings. By comparing the halachot WE extracted against it we get an honest,
zero-cost measurement of extraction quality per ruling:

  * recall      — fraction of Nevo's holdings that our halachot cover
  * precision   — fraction of our halachot that map to a Nevo holding
  * granularity — our_count / nevo_holding_count (over-decomposition signal,
                  the #81.5 concern: e.g. 14 ours vs 4 Nevo = 3.5x)

The gold-truth ratio is read from ``case_law.nevo_ratio`` (populated by
``backfill_nevo_preamble.py`` / ingest). For rulings not yet backfilled it falls
back to computing the ratio on-the-fly from the stored ``full_text``, so the
harness works before and after the migration.

An LLM-as-judge (local ``claude_session``, zero API cost) does the semantic
mapping — string overlap can't tell "same holding, different words" from a
genuinely new holding. The judge is asked to count, not to rewrite.

Run with the MCP server venv (needs the local ``claude`` CLI):

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/nevo_ratio_benchmark.py --case 'בג"ץ 1764/05'
    .venv/bin/python ../scripts/nevo_ratio_benchmark.py --all --limit 5
    .venv/bin/python ../scripts/nevo_ratio_benchmark.py --all        # full corpus
"""
from __future__ import annotations

import argparse
import asyncio
import csv
import json
import sys
from datetime import datetime, timezone
from pathlib import Path

from legal_mcp.services import claude_session, db
from legal_mcp.services.extractor import extract_nevo_ratio

REPO_ROOT = Path(__file__).resolve().parent.parent
AUDIT_DIR = REPO_ROOT / "data" / "audit"

# Review statuses whose halachot are included in the comparison.
_COUNTED_STATUSES = frozenset({"approved", "published", "pending_review"})

_JUDGE_SYSTEM = (
    "אתה בוחן-איכות משפטי. נתונים לך (א) רשימת ההלכות (מיני-רציו) שכתב עורך נבו "
    "עבור פסק-דין — אמת-המידה; (ב) רשימת ההלכות שמערכת אוטומטית חילצה מאותו "
    "פסק-דין. משימתך: למפות סמנטית בין השתיים (אותו עיקרון משפטי בניסוח שונה = "
    "התאמה), ולספור. החזר JSON בלבד, ללא טקסט נוסף."
)


def _judge_prompt(ratio: str, ours: list[str]) -> str:
    """Build the user prompt handed to the judge.

    Args:
        ratio: The Nevo mini-ratio text used as the gold standard.
        ours: Rule statements extracted by our pipeline; they are numbered
            from 1 in the prompt, or replaced by a placeholder when empty.

    Returns:
        The prompt text, ending with the JSON schema the judge must fill in.
    """
    ours_block = "\n".join(f"{i}. {s}" for i, s in enumerate(ours, 1)) or "(אין)"
    return (
        f"מיני-רציו של נבו (אמת-מידה):\n{ratio}\n\n"
        f"ההלכות שחולצו על-ידי המערכת ({len(ours)}):\n{ours_block}\n\n"
        "החזר JSON עם המפתחות:\n"
        '{"nevo_holdings": <מספר העקרונות הנפרדים במיני-רציו>,\n'
        ' "covered": <כמה מעקרונות נבו מכוסים ע"י לפחות הלכה אחת שלנו>,\n'
        ' "ours_total": <מספר ההלכות שלנו>,\n'
        ' "ours_mapped": <כמה מההלכות שלנו ממופות לעיקרון נבו כלשהו>,\n'
        ' "notes": "<עד 2 משפטים: מה הוחמץ / מה עודף>"}'
    )


def _as_count(value: object) -> int:
    """Coerce a judge-supplied count to a non-negative int.

    Missing or empty values count as 0, and negative values are clamped to 0.

    Raises:
        TypeError, ValueError, OverflowError: if the value cannot be read as
            an integer (e.g. ``"four"``, a list, or an infinite float).
    """
    if not value:
        return 0
    return max(int(value), 0)


def _mean(results: list[dict], key: str) -> float | None:
    """Return the mean of ``key`` over ``results``, skipping ``None`` values.

    Returns ``None`` when no result carries a value for ``key``.
    """
    values = [r[key] for r in results if r[key] is not None]
    return round(sum(values) / len(values), 3) if values else None


async def _bench_one(row: dict, model: str | None) -> dict:
    """Score one ruling's extracted halachot against its Nevo mini-ratio.

    Args:
        row: A ``case_law`` record with ``id``, ``case_number``, ``nevo_ratio``
            and ``full_text``.
        model: Judge model name, or ``None`` to pass no explicit model.

    Returns:
        A flat dict (one CSV row) with the counts, the derived ``recall`` /
        ``precision`` / ``granularity`` (``None`` when not computable),
        the judge's ``notes`` and an ``error`` string that is non-empty when
        the ruling was skipped.
    """
    cn = row["case_number"]
    # Prefer the stored ratio; otherwise derive it from the full text.
    ratio = (row.get("nevo_ratio") or "").strip() or extract_nevo_ratio(
        row.get("full_text") or ""
    )
    result = {
        "case_number": cn,
        "nevo_holdings": 0,
        "covered": 0,
        "ours_total": 0,
        "ours_mapped": 0,
        "recall": None,
        "precision": None,
        "granularity": None,
        "notes": "",
        "error": "",
    }
    if not ratio:
        result["error"] = "no mini-ratio"
        return result

    halachot = await db.list_halachot(case_law_id=row["id"], limit=500)
    ours = [
        h["rule_statement"]
        for h in halachot
        if h.get("review_status") in _COUNTED_STATUSES
        and (h.get("rule_statement") or "").strip()
    ]
    result["ours_total"] = len(ours)
    if not ours:
        result["error"] = "no extracted halachot"
        return result

    try:
        verdict = await claude_session.query_json(
            _judge_prompt(ratio, ours),
            system=_JUDGE_SYSTEM,
            model=model,
            effort="low",
        )
    except Exception as e:  # noqa: BLE001
        # Best-effort: one failed judge call must not abort the whole run.
        result["error"] = f"judge failed: {e}"
        return result
    if not isinstance(verdict, dict):
        result["error"] = "judge returned non-dict"
        return result

    try:
        nh = _as_count(verdict.get("nevo_holdings"))
        cov = _as_count(verdict.get("covered"))
        om = _as_count(verdict.get("ours_mapped"))
    except (TypeError, ValueError, OverflowError) as e:
        # The judge is an LLM; non-numeric counts are reported, not fatal.
        result["error"] = f"judge returned malformed counts: {e}"
        return result

    # We know exactly how many halachot were sent, so the judge's own
    # "ours_total" is not trusted as the precision/granularity denominator.
    ot = len(ours)
    # Keep the numerators within their denominators so ratios stay in [0, 1].
    cov = min(cov, nh)
    om = min(om, ot)

    result.update({
        "nevo_holdings": nh,
        "covered": cov,
        "ours_total": ot,
        "ours_mapped": om,
        "recall": round(cov / nh, 3) if nh else None,
        "precision": round(om / ot, 3) if ot else None,
        "granularity": round(ot / nh, 2) if nh else None,
        "notes": str(verdict.get("notes") or "")[:300],
    })
    return result


async def main(args: argparse.Namespace) -> int:
    """Benchmark the selected rulings, print progress and write a CSV report.

    Args:
        args: Parsed CLI arguments (``case``, ``all``, ``limit``, ``model``,
            ``out``).

    Returns:
        Process exit code (always 0; per-ruling failures are recorded in the
        report's ``error`` column).
    """
    pool = await db.get_pool()
    # The connection is only needed for the selection query.
    async with pool.acquire() as conn:
        if args.case:
            rows = await conn.fetch(
                "SELECT id, case_number, nevo_ratio, full_text FROM case_law "
                "WHERE case_number = $1",
                args.case,
            )
        else:
            # rulings that have (or can derive) a ratio
            rows = await conn.fetch(
                "SELECT id, case_number, nevo_ratio, full_text FROM case_law "
                "WHERE nevo_ratio <> '' OR full_text LIKE '%מיני-רציו:%' "
                "ORDER BY case_number"
            )
    rows = [dict(r) for r in rows]
    if args.limit:
        rows = rows[: args.limit]
    if not rows:
        print("no rulings with a mini-ratio found", flush=True)
        return 0

    print(f"benchmarking {len(rows)} ruling(s)...", flush=True)
    results = []
    for i, row in enumerate(rows, 1):
        res = await _bench_one(row, args.model)
        results.append(res)
        if res["error"]:
            print(f"[{i}/{len(rows)}] {res['case_number']}: SKIP ({res['error']})",
                  flush=True)
        else:
            print(f"[{i}/{len(rows)}] {res['case_number']}: "
                  f"recall={res['recall']} precision={res['precision']} "
                  f"granularity={res['granularity']}x "
                  f"(nevo={res['nevo_holdings']}, ours={res['ours_total']})",
                  flush=True)

    # Only rulings where the judge found at least one Nevo holding are scored.
    scored = [r for r in results if r["recall"] is not None]
    if scored:
        print(f"\n=== {len(scored)} scored — mean recall={_mean(scored, 'recall')} "
              f"precision={_mean(scored, 'precision')} "
              f"granularity={_mean(scored, 'granularity')}x ===",
              flush=True)

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out = Path(args.out) if args.out else AUDIT_DIR / f"nevo-ratio-benchmark-{ts}.csv"
    # Create the directory actually written to (also covers a custom --out).
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(results[0].keys()))
        w.writeheader()
        w.writerows(results)
    print(f"report: {out}", flush=True)
    return 0


if __name__ == "__main__":
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    g = ap.add_mutually_exclusive_group(required=True)
    g.add_argument("--case", help="benchmark a single case_number")
    g.add_argument("--all", action="store_true", help="benchmark all rulings with a mini-ratio")
    ap.add_argument("--limit", type=int, default=None, help="cap the number of rulings")
    ap.add_argument("--model", default=None, help="judge model (default: CLI session default)")
    ap.add_argument("--out", default=None, help="output CSV path (default: data/audit/)")
    args = ap.parse_args()
    if args.limit is not None and args.limit < 0:
        # A negative cap would silently slice rulings off the end of the list.
        ap.error("--limit must be non-negative")
    sys.exit(asyncio.run(main(args)))