feat(nevo): backfill leaked preamble + ratio gold-set benchmark (#86)
#86.2 backfill + #86.3 benchmark, plus a #86.1 over-strip fix found en route.
extractor.py
- extract_nevo_ratio(): capture Nevo's מיני-רציו block (editorial holdings
summary) before it is stripped — a free professional gold-set (#86.3).
- _DECISION_START hardening (#86.2): the merged #86.1 regex over-stripped.
(a) פסק-דין headers are markdown-wrapped (**פסק דין**); the old anchor
required the keyword as the first line char with one separator, so it
missed the header and matched a citation 32K deep (עמ"נ 50567-07-21,
losing 45% of the body). Now tolerates leading markdown + 0-3 seps,
and the final-nun form (דין ן vs דינו נ).
(b) bare השופט/הנשיא matched CITATIONS ("השופט מ' חשין, פסקה 23"). The
authoring-judge line ends with a colon; we now require it.
ingest.py
- capture the ratio before stripping and store it on the row (best-effort,
non-fatal); also strip the text-upload path (was file-only).
db.py
- add case_law.nevo_ratio column (additive); allow it in update_case_law.
scripts/backfill_nevo_preamble.py (#86.2) — dry-run-by-default data migration:
finds historically-leaked rulings, captures ratio→nevo_ratio, rewrites
full_text (+content_hash), reindexes, and FLAGS (never deletes) halachot whose
quote lives in the removed preamble (review_status=pending_review +
nevo_preamble_leak flag). Safety guard: rows whose keep% is below --min-keep (60)
are excluded from --apply as suspected over-strip. --apply writes backup+manifest
to data/audit/ first. Chair-gated — NOT applied here.
scripts/nevo_ratio_benchmark.py (#86.3) — LLM-as-judge (local claude_session,
zero cost) measures recall/precision/granularity of our halachot vs the Nevo
ratio. Works pre- and post-backfill (reads nevo_ratio, falls back to full_text).
Verified:
- pytest tests/test_nevo_preamble.py — 12 passed (incl. citation/markdown
over-strip regressions).
- backfill dry-run: 19 leaked rulings, 27 contaminated halachot, all ≥75%
keep (the 32K over-strip is gone).
- benchmark on בג"ץ 1764/05: recall=0.875 precision=1.0 granularity=1.75x.
Invariants: G1 (normalize at source — strip/capture at ingest, not at read);
no silent swallow (contaminated halachot flagged + reported, not dropped);
data-migration is dry-run-default with backup+manifest, chair-gated.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
173
scripts/nevo_ratio_benchmark.py
Normal file
173
scripts/nevo_ratio_benchmark.py
Normal file
@@ -0,0 +1,173 @@
|
||||
#!/usr/bin/env python3
|
||||
"""#86.3 — benchmark halacha-extraction quality against Nevo's מיני-רציו gold-set.
|
||||
|
||||
Nevo's editorial מיני-רציו is a free, professionally-written list of a ruling's
|
||||
holdings. By comparing the halachot WE extracted against it we get an honest,
|
||||
zero-cost measurement of extraction quality per ruling:
|
||||
|
||||
* recall — fraction of Nevo's holdings that our halachot cover
|
||||
* precision — fraction of our halachot that map to a Nevo holding
|
||||
* granularity — our_count / nevo_holding_count (over-decomposition signal,
|
||||
the #81.5 concern: e.g. 14 ours vs 4 Nevo = 3.5x)
|
||||
|
||||
The gold-truth ratio is read from ``case_law.nevo_ratio`` (populated by
|
||||
``backfill_nevo_preamble.py`` / ingest). For rulings not yet backfilled it
|
||||
falls back to computing the ratio on-the-fly from the stored ``full_text``,
|
||||
so the harness works before and after the migration.
|
||||
|
||||
An LLM-as-judge (local ``claude_session``, zero API cost) does the semantic
|
||||
mapping — string overlap can't tell "same holding, different words" from a
|
||||
genuinely new holding. The judge is asked to count, not to rewrite.
|
||||
|
||||
Run with the MCP server venv (needs the local ``claude`` CLI):
|
||||
|
||||
cd ~/legal-ai/mcp-server
|
||||
.venv/bin/python ../scripts/nevo_ratio_benchmark.py --case 'בג"ץ 1764/05'
|
||||
.venv/bin/python ../scripts/nevo_ratio_benchmark.py --all --limit 5
|
||||
.venv/bin/python ../scripts/nevo_ratio_benchmark.py --all # full corpus
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from legal_mcp.services import claude_session, db
|
||||
from legal_mcp.services.extractor import extract_nevo_ratio
|
||||
|
||||
# Parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Default destination for benchmark CSV reports (see ``--out``).
AUDIT_DIR = REPO_ROOT.joinpath("data", "audit")

# System prompt (Hebrew) for the LLM judge. It frames the judge as a legal
# quality examiner who receives Nevo's mini-ratio as the yardstick plus the
# automatically extracted halachot, maps them semantically, counts, and
# answers with JSON only.
_JUDGE_SYSTEM = " ".join((
    "אתה בוחן-איכות משפטי. נתונים לך (א) רשימת ההלכות (מיני-רציו) שכתב עורך נבו",
    "עבור פסק-דין — אמת-המידה; (ב) רשימת ההלכות שמערכת אוטומטית חילצה מאותו",
    "פסק-דין. משימתך: למפות סמנטית בין השתיים (אותו עיקרון משפטי בניסוח שונה =",
    "התאמה), ולספור. החזר JSON בלבד, ללא טקסט נוסף.",
))
|
||||
|
||||
|
||||
def _judge_prompt(ratio: str, ours: list[str]) -> str:
    """Build the user prompt handed to the LLM judge.

    The prompt has three sections separated by blank lines: the Nevo
    mini-ratio (the yardstick), a 1-based numbered list of our extracted
    halachot (or a Hebrew "(none)" placeholder when the list is empty), and
    the JSON schema the judge must answer with.

    Args:
        ratio: text of the Nevo mini-ratio.
        ours: rule statements extracted by the system.

    Returns:
        The complete prompt string.
    """
    numbered = [f"{idx}. {text}" for idx, text in enumerate(ours, start=1)]
    listing = "\n".join(numbered) if numbered else "(אין)"

    gold_section = f"מיני-רציו של נבו (אמת-מידה):\n{ratio}"
    ours_section = f"ההלכות שחולצו על-ידי המערכת ({len(ours)}):\n{listing}"
    schema_section = "\n".join([
        "החזר JSON עם המפתחות:",
        '{"nevo_holdings": <מספר העקרונות הנפרדים במיני-רציו>,',
        ' "covered": <כמה מעקרונות נבו מכוסים ע"י לפחות הלכה אחת שלנו>,',
        ' "ours_total": <מספר ההלכות שלנו>,',
        ' "ours_mapped": <כמה מההלכות שלנו ממופות לעיקרון נבו כלשהו>,',
        ' "notes": "<עד 2 משפטים: מה הוחמץ / מה עודף>"}',
    ])
    return "\n\n".join([gold_section, ours_section, schema_section])
|
||||
|
||||
|
||||
def _as_count(value) -> int:
    """Coerce a judge-supplied count to a non-negative ``int``.

    ``None`` and other falsy values become ``0``. Raises ``TypeError`` or
    ``ValueError`` when the value cannot be converted (e.g. free text).
    """
    return max(0, int(value or 0))


async def _bench_one(row: dict, model: str | None) -> dict:
    """Benchmark one ruling's extracted halachot against its Nevo mini-ratio.

    Args:
        row: a ``case_law`` record with at least ``id`` and ``case_number``;
            ``nevo_ratio`` and ``full_text`` are read when present.
        model: judge model name forwarded to ``claude_session.query_json``
            (``None`` is passed through unchanged).

    Returns:
        A flat dict with the keys ``case_number``, ``nevo_holdings``,
        ``covered``, ``ours_total``, ``ours_mapped``, ``recall``,
        ``precision``, ``granularity``, ``notes`` and ``error``. When the
        ruling cannot be scored, ``error`` is non-empty and the metric fields
        keep their defaults (``None``). This function reports judge failures
        through ``error`` instead of raising, so one bad ruling does not abort
        a multi-ruling run.
    """
    cn = row["case_number"]
    # Prefer the stored ratio; otherwise derive it from the stored full text.
    ratio = (row.get("nevo_ratio") or "").strip() or extract_nevo_ratio(row.get("full_text") or "")
    result = {"case_number": cn, "nevo_holdings": 0, "covered": 0,
              "ours_total": 0, "ours_mapped": 0, "recall": None,
              "precision": None, "granularity": None, "notes": "", "error": ""}
    if not ratio:
        result["error"] = "no mini-ratio"
        return result

    halachot = await db.list_halachot(case_law_id=row["id"], limit=500)
    ours = [h["rule_statement"] for h in halachot
            if h.get("review_status") in ("approved", "published", "pending_review")
            and (h.get("rule_statement") or "").strip()]
    result["ours_total"] = len(ours)
    if not ours:
        result["error"] = "no extracted halachot"
        return result

    try:
        verdict = await claude_session.query_json(
            _judge_prompt(ratio, ours), system=_JUDGE_SYSTEM, model=model, effort="low",
        )
    except Exception as e:  # noqa: BLE001 - best-effort: record and move on
        result["error"] = f"judge failed: {e}"
        return result
    if not isinstance(verdict, dict):
        result["error"] = "judge returned non-dict"
        return result

    # The judge is an LLM: its "numbers" may come back as free text. Treat
    # that as a per-ruling failure rather than crashing the whole benchmark.
    try:
        nh = _as_count(verdict.get("nevo_holdings"))
        cov = _as_count(verdict.get("covered"))
        om = _as_count(verdict.get("ours_mapped"))
    except (TypeError, ValueError) as e:
        result["error"] = f"judge returned non-numeric counts: {e}"
        return result

    # We know exactly how many halachot were sent; do not let a judge
    # miscount skew precision/granularity.
    ot = len(ours)
    # A subset cannot be larger than its set; bound the judge's counts so
    # recall and precision stay within [0, 1].
    cov = min(cov, nh)
    om = min(om, ot)

    result.update({
        "nevo_holdings": nh, "covered": cov, "ours_total": ot, "ours_mapped": om,
        "recall": round(cov / nh, 3) if nh else None,
        "precision": round(om / ot, 3) if ot else None,
        "granularity": round(ot / nh, 2) if nh else None,
        "notes": str(verdict.get("notes") or "")[:300],
    })
    return result
|
||||
|
||||
|
||||
async def main(args: argparse.Namespace) -> int:
    """Benchmark the selected rulings, print a summary and write a CSV report.

    Args:
        args: parsed CLI namespace; reads ``case``, ``limit``, ``model`` and
            ``out``.

    Returns:
        Process exit code. Always ``0``: per-ruling failures are reported in
        the ``error`` column of the report rather than aborting the run.
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        if args.case:
            rows = await conn.fetch(
                "SELECT id, case_number, nevo_ratio, full_text FROM case_law "
                "WHERE case_number = $1", args.case,
            )
        else:
            # rulings that have (or can derive) a ratio
            rows = await conn.fetch(
                "SELECT id, case_number, nevo_ratio, full_text FROM case_law "
                "WHERE nevo_ratio <> '' OR full_text LIKE '%מיני-רציו:%' "
                "ORDER BY case_number"
            )
    rows = [dict(r) for r in rows]
    if args.limit:
        rows = rows[: args.limit]
    if not rows:
        if args.case:
            # The single-case query does not filter on the ratio, so an empty
            # result means the case number itself did not match.
            print(f"case not found: {args.case}", flush=True)
        else:
            print("no rulings with a mini-ratio found", flush=True)
        return 0

    print(f"benchmarking {len(rows)} ruling(s)...", flush=True)
    results = []
    for i, row in enumerate(rows, 1):
        res = await _bench_one(row, args.model)
        results.append(res)
        if res["error"]:
            print(f"[{i}/{len(rows)}] {res['case_number']}: SKIP ({res['error']})", flush=True)
        else:
            print(f"[{i}/{len(rows)}] {res['case_number']}: "
                  f"recall={res['recall']} precision={res['precision']} "
                  f"granularity={res['granularity']}x "
                  f"(nevo={res['nevo_holdings']}, ours={res['ours_total']})", flush=True)

    scored = [r for r in results if r["recall"] is not None]

    def _mean(key: str) -> float | None:
        """Mean of ``key`` over scored rows, ignoring rows where it is None."""
        values = [r[key] for r in scored if r[key] is not None]
        return round(sum(values) / len(values), 3) if values else None

    if scored:
        print(f"\n=== {len(scored)} scored — mean recall={_mean('recall')} "
              f"precision={_mean('precision')} granularity={_mean('granularity')}x ===", flush=True)

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out = Path(args.out) if args.out else AUDIT_DIR / f"nevo-ratio-benchmark-{ts}.csv"
    # Create the directory the report is actually written to. A custom --out
    # in a not-yet-existing directory would otherwise fail here, after every
    # judge call has already been made.
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(results[0]))
        w.writeheader()
        w.writerows(results)
    print(f"report: {out}", flush=True)
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: exactly one of --case / --all selects the rulings;
    # the remaining options tune the run. The exit code is main()'s result.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    selection = parser.add_mutually_exclusive_group(required=True)
    selection.add_argument("--case", help="benchmark a single case_number")
    selection.add_argument(
        "--all",
        action="store_true",
        help="benchmark all rulings with a mini-ratio",
    )
    parser.add_argument("--limit", type=int, default=None, help="cap the number of rulings")
    parser.add_argument("--model", default=None, help="judge model (default: CLI session default)")
    parser.add_argument("--out", default=None, help="output CSV path (default: data/audit/)")
    cli_args = parser.parse_args()
    exit_code = asyncio.run(main(cli_args))
    sys.exit(exit_code)
|
||||
Reference in New Issue
Block a user