feat(nevo): backfill leaked preamble + ratio gold-set benchmark (#86)
#86.2 backfill + #86.3 benchmark, plus a #86.1 over-strip fix found en route.
extractor.py
- extract_nevo_ratio(): capture Nevo's מיני-רציו block (editorial holdings
summary) before it is stripped — a free professional gold-set (#86.3).
- _DECISION_START hardening (#86.2): the merged #86.1 regex over-stripped.
(a) פסק-דין headers are markdown-wrapped (**פסק דין**); the old anchor
required the keyword as the first line char with one separator, so it
missed the header and matched a citation 32K deep (עמ"נ 50567-07-21,
losing 45% of the body). Now tolerates leading markdown + 0-3 separators,
and accepts both spellings: דין (final nun ן) and דינו (medial nun נ).
(b) bare השופט/הנשיא matched CITATIONS ("השופט מ' חשין, פסקה 23"). The
authoring-judge line ends with a colon; we now require it.
ingest.py
- capture the ratio before stripping and store it on the row (best-effort,
non-fatal); also strip the text-upload path (was file-only).
db.py
- add case_law.nevo_ratio column (additive); allow it in update_case_law.
scripts/backfill_nevo_preamble.py (#86.2) — dry-run-by-default data migration:
finds historically-leaked rulings, captures ratio→nevo_ratio, rewrites
full_text (+content_hash), reindexes, and FLAGS (never deletes) halachot whose
quote lives in the removed preamble (review_status=pending_review +
nevo_preamble_leak flag). Safety guard: rows whose keep% falls below
--min-keep (default 60) are excluded from --apply as suspected over-strip.
--apply writes backup+manifest
to data/audit/ first. Chair-gated — NOT applied here.
scripts/nevo_ratio_benchmark.py (#86.3) — LLM-as-judge (local claude_session,
zero cost) measures recall/precision/granularity of our halachot vs the Nevo
ratio. Works pre- and post-backfill (reads nevo_ratio, falls back to full_text).
Verified:
- pytest tests/test_nevo_preamble.py — 12 passed (incl. citation/markdown
over-strip regressions).
- backfill dry-run: 19 leaked rulings, 27 contaminated halachot, all ≥75%
keep (the 32K over-strip is gone).
- benchmark on בג"ץ 1764/05: recall=0.875 precision=1.0 granularity=1.75x.
Invariants: G1 (normalize at source — strip/capture at ingest, not at read);
no silent swallow (contaminated halachot flagged + reported, not dropped);
data-migration is dry-run-default with backup+manifest, chair-gated.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -619,6 +619,12 @@ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT '';
|
|||||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
||||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT '';
|
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT '';
|
||||||
-- chair-editable abstract shown in search results.
|
-- chair-editable abstract shown in search results.
|
||||||
|
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS nevo_ratio TEXT DEFAULT '';
|
||||||
|
-- The Nevo editorial מיני-רציו block, captured at ingest *before* it is
|
||||||
|
-- stripped from the body (#86.3). Kept separate from `headnote` (which is
|
||||||
|
-- our own abstract) so it can serve as a free professional gold-set for
|
||||||
|
-- benchmarking halacha-extraction recall/precision. Empty when the source
|
||||||
|
-- is not a Nevo export or carries no mini-ratio.
|
||||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT '';
|
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT '';
|
||||||
-- 'court_ruling' | 'appeals_committee'
|
-- 'court_ruling' | 'appeals_committee'
|
||||||
|
|
||||||
@@ -3263,7 +3269,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
|
|||||||
"""
|
"""
|
||||||
allowed = {
|
allowed = {
|
||||||
"case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
|
"case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
|
||||||
"subject_tags", "summary", "headnote", "key_quote", "source_url",
|
"subject_tags", "summary", "headnote", "nevo_ratio", "key_quote", "source_url",
|
||||||
"source_type", "precedent_level", "is_binding", "district", "chair_name",
|
"source_type", "precedent_level", "is_binding", "district", "chair_name",
|
||||||
"proceeding_type", "citation_formatted",
|
"proceeding_type", "citation_formatted",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -362,12 +362,24 @@ _NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו
|
|||||||
# preamble: bibliography + מיני-רציו). Two families:
|
# preamble: bibliography + מיני-רציו). Two families:
|
||||||
# - ועדת ערר / district openings (בפנינו / הערר שבנדון / ...)
|
# - ועדת ערר / district openings (בפנינו / הערר שבנדון / ...)
|
||||||
# - COURT-RULING openings (#86.1): a פסק-דין header or the authoring judge's
|
# - COURT-RULING openings (#86.1): a פסק-דין header or the authoring judge's
|
||||||
# line ("השופט/ת X:", "כב' השופט", "הנשיא"). Without these, Nevo court
|
# line. Without these, Nevo court judgments — exactly the ones carrying a
|
||||||
# judgments — exactly the ones carrying a מיני-רציו — slipped through unstripped
|
# מיני-רציו — slipped through unstripped (e.g. בג"ץ 1764/05).
|
||||||
# (e.g. בג"ץ 1764/05), risking that the extractor reads Nevo's answer key.
|
#
|
||||||
|
# #86.2 hardening — two over-strip bugs found while backfilling:
|
||||||
|
# 1. ``פסק-דין`` headers are often markdown-wrapped (``**פסק דין**``); the old
|
||||||
|
# ``^פסק[- ]דין`` required the keyword to be the very first char of the line
|
||||||
|
# and allowed only one separator, so it missed the header and fell through
|
||||||
|
# to a citation 32K deep (עמ"נ 50567-07-21). We now tolerate leading
|
||||||
|
# markdown/whitespace and 0-3 separators.
|
||||||
|
# 2. Bare ``השופט``/``הנשיא`` matched *citations* ("השופט מ' חשין, פסקה 23"),
|
||||||
|
# stripping real decision body. The authoring-judge line ends with a COLON
|
||||||
|
# ("השופט י' עמית:"); citations use a comma. We now require the colon.
|
||||||
_DECISION_START = re.compile(
|
_DECISION_START = re.compile(
|
||||||
r"^(בפנינו|לפנינו|לפניי|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן|"
|
r"^[ \t>*_#]{0,6}(?:"
|
||||||
r"פסק[- ]דין|פסק[- ]דינו|כב(?:וד)?['׳]?\s*השופט|המשנה לנשיא|הנשיא|השופט)",
|
r"בפנינו|לפנינו|לפניי|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן|"
|
||||||
|
r"פסק[ \t\-]{0,3}די(?:ן|נו)|" # פסק-דין / פסק דין / **פסק דין** header (final-nun ן vs דינו)
|
||||||
|
r"(?:כב(?:וד)?['׳\"]?\s*)?(?:ה?שופט[ת]?|ה?נשיא[ה]?|המשנה לנשיא)\s+[^\n,]{1,40}:" # author line → colon
|
||||||
|
r")",
|
||||||
re.MULTILINE,
|
re.MULTILINE,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -388,3 +400,41 @@ def strip_nevo_preamble(text: str) -> str:
|
|||||||
logger.debug("Stripped %d chars of Nevo preamble", m.start())
|
logger.debug("Stripped %d chars of Nevo preamble", m.start())
|
||||||
return stripped
|
return stripped
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
||||||
|
_RATIO_MARKER = "מיני-רציו:"
|
||||||
|
|
||||||
|
|
||||||
|
def extract_nevo_ratio(text: str) -> str:
    """Return the Nevo מיני-רציו block (editorial holdings summary), or ''.

    The mini-ratio is Nevo's own headnote. It is captured before
    :func:`strip_nevo_preamble` discards it so it can be used as a gold-set
    for benchmarking halacha extraction (#86.3).

    The returned block starts right after the ``מיני-רציו:`` marker and ends
    at the earliest of:

    * the first ``_DECISION_START`` match after the marker,
    * the first occurrence of any other ``_NEVO_MARKERS`` entry after it,
    * the end of ``text``.

    Surrounding whitespace is trimmed. An empty/None ``text`` or a text
    without the marker yields ''.
    """
    if not text:
        return ""

    marker_at = text.find(_RATIO_MARKER)
    if marker_at == -1:
        return ""

    # Everything after the marker; all cut points below are offsets into this
    # tail, so the result can never extend past the first boundary found.
    tail = text[marker_at + len(_RATIO_MARKER):]

    cut_points = [len(tail)]

    opening = _DECISION_START.search(tail)
    if opening:
        cut_points.append(opening.start())

    # Any other preamble marker that follows the ratio also terminates it.
    cut_points.extend(
        hit
        for hit in (tail.find(m) for m in _NEVO_MARKERS if m != _RATIO_MARKER)
        if hit != -1
    )

    return tail[:min(cut_points)].strip()
|
||||||
|
|||||||
@@ -158,9 +158,14 @@ async def ingest_document(
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
|
await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
|
||||||
raise
|
raise
|
||||||
raw_text = extractor.strip_nevo_preamble((raw_text or "")).strip()
|
raw_text = (raw_text or "")
|
||||||
else:
|
else:
|
||||||
raw_text = (text or "").strip()
|
raw_text = (text or "")
|
||||||
|
# Capture the Nevo מיני-רציו (editorial holdings summary) BEFORE stripping
|
||||||
|
# it out — it is a free professional gold-set for benchmarking halacha
|
||||||
|
# extraction (#86.3). Stored on the case_law row below once we have its id.
|
||||||
|
nevo_ratio = extractor.extract_nevo_ratio(raw_text)
|
||||||
|
raw_text = extractor.strip_nevo_preamble(raw_text).strip()
|
||||||
if not raw_text:
|
if not raw_text:
|
||||||
await progress("failed", 100, "לא נמצא טקסט בקובץ")
|
await progress("failed", 100, "לא נמצא טקסט בקובץ")
|
||||||
raise ValueError("no extractable text in file")
|
raise ValueError("no extractable text in file")
|
||||||
@@ -180,6 +185,13 @@ async def ingest_document(
|
|||||||
)
|
)
|
||||||
case_law_id = UUID(str(record["id"]))
|
case_law_id = UUID(str(record["id"]))
|
||||||
|
|
||||||
|
# Persist the captured mini-ratio (best-effort; never block ingest on it).
|
||||||
|
if nevo_ratio:
|
||||||
|
try:
|
||||||
|
await db.update_case_law(case_law_id, nevo_ratio=nevo_ratio)
|
||||||
|
except Exception as e: # noqa: BLE001 — additive metadata, non-fatal
|
||||||
|
logger.warning("could not store nevo_ratio for %s: %s", case_law_id, e)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
stored_chunks = await _chunk_embed_store(case_law_id, raw_text, page_offsets, page_count, progress)
|
stored_chunks = await _chunk_embed_store(case_law_id, raw_text, page_offsets, page_count, progress)
|
||||||
await db.mark_indexed(case_law_id)
|
await db.mark_indexed(case_law_id)
|
||||||
|
|||||||
@@ -55,3 +55,64 @@ def test_markers_past_400_chars_still_detected():
|
|||||||
text = header + _PREAMBLE + "השופטת ע' ארבל:\n\nגוף ההחלטה..."
|
text = header + _PREAMBLE + "השופטת ע' ארבל:\n\nגוף ההחלטה..."
|
||||||
out = ex.strip_nevo_preamble(text)
|
out = ex.strip_nevo_preamble(text)
|
||||||
assert out.startswith("השופטת ע' ארבל:")
|
assert out.startswith("השופטת ע' ארבל:")
|
||||||
|
|
||||||
|
|
||||||
|
# ── extract_nevo_ratio (#86.3 gold-set capture) ──
|
||||||
|
|
||||||
|
def test_extract_ratio_returns_block_before_body():
    """The captured ratio holds the preamble's holdings but none of the judgment body."""
    document = _PREAMBLE + "השופט ס' ג'ובראן:\n\nגוף ההחלטה..."
    captured = ex.extract_nevo_ratio(document)

    for holding in ("העותרים לא הוכיחו טעם מיוחד", "המחוקק הגביל את הזמן"):
        assert holding in captured

    # Neither the author line nor the body text may bleed into the ratio.
    for body_text in ("גוף ההחלטה", "השופט ס' ג'ובראן"):
        assert body_text not in captured
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_ratio_stops_at_following_marker():
    """A preamble marker that follows the ratio terminates the captured block."""
    # Layout: ratio first, then a cited-rulings marker, then the decision header.
    parts = [
        "מיני-רציו:\n* עיקרון אחד בלבד.\n\n",
        "פסקי דין שאוזכרו:\nבג\"ץ 1/00\n\n",
        "פסק-דין\nגוף...",
    ]
    captured = ex.extract_nevo_ratio("".join(parts))

    assert "עיקרון אחד בלבד" in captured
    for leaked in ("פסקי דין שאוזכרו", "בג\"ץ 1/00"):
        assert leaked not in captured
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_ratio_empty_when_no_marker():
    """Without a mini-ratio marker (or with empty input) the result is ''."""
    for document in ("פסק דין\nהשופט כהן: ...", ""):
        assert ex.extract_nevo_ratio(document) == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ── #86.2 over-strip regressions ──
|
||||||
|
|
||||||
|
def test_citation_judge_line_is_not_a_decision_start():
    """A judge name used as a citation (comma, no colon) must not anchor the strip.

    "השופט מ' חשין, פסקה 23" cites a judge rather than opening the decision;
    treating it as the decision start would discard real body text.
    """
    decision = "".join([
        "**פסק דין**\n\n",
        "שני ערעורים לפניי. כפי שנפסק מפי כבוד \n\n",
        "השופט מ' חשין, פסקה 23 (להלן עניין קהתי), יש לבחון...\n",
    ])
    result = ex.strip_nevo_preamble(_PREAMBLE + decision)

    assert result.startswith("**פסק דין**")
    # The citation survives as part of the body.
    assert "השופט מ' חשין, פסקה" in result
    assert "מיני-רציו" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_markdown_wrapped_pdin_header_is_stripped():
    """A markdown-bold ``**פסק דין**`` header is recognised as the decision start."""
    document = _PREAMBLE + "**פסק דין**\n\nשני ערעוריה הנדונים..."
    result = ex.strip_nevo_preamble(document)

    assert result.startswith("**פסק דין**")
    assert "מיני-רציו" not in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_author_line_with_colon_still_strips():
    """An authoring-judge line ending in a colon still anchors the preamble strip."""
    author_line = "כב' השופטת ד' ברק-ארז:"
    result = ex.strip_nevo_preamble(_PREAMBLE + "כב' השופטת ד' ברק-ארז:\n\nגוף ההחלטה...")

    assert result.startswith(author_line)
    assert "מיני-רציו" not in result
|
||||||
|
|||||||
@@ -36,6 +36,8 @@
|
|||||||
| `multimodal_backfill.py` | python | Backfill voyage-multimodal-3 page embeddings על מסמכי תיקים קיימים. idempotent (skips by default), forces `MULTIMODAL_ENABLED=true` ל-run, רץ מהקונטיינר. שלב C — ראה `docs/voyage-upgrades-plan.md` | ידני per-case (`python multimodal_backfill.py 8174-24 8137-24`) |
|
| `multimodal_backfill.py` | python | Backfill voyage-multimodal-3 page embeddings על מסמכי תיקים קיימים. idempotent (skips by default), forces `MULTIMODAL_ENABLED=true` ל-run, רץ מהקונטיינר. שלב C — ראה `docs/voyage-upgrades-plan.md` | ידני per-case (`python multimodal_backfill.py 8174-24 8137-24`) |
|
||||||
| `backfill_chunk_pages.py` | python | Backfill `page_number` ב-`document_chunks` קיימים. legacy chunker לא tracked עמודים → `page_number=NULL` חוסם boost של multimodal hybrid (text+image join על אותו עמוד). re-extracts כל PDF (re-OCR אם צריך, ~$0.0015/page), מחשב page_offsets, ומעדכן chunks. idempotent | ידני per-case (`python backfill_chunk_pages.py 8174-24 8137-24`) |
|
| `backfill_chunk_pages.py` | python | Backfill `page_number` ב-`document_chunks` קיימים. legacy chunker לא tracked עמודים → `page_number=NULL` חוסם boost של multimodal hybrid (text+image join על אותו עמוד). re-extracts כל PDF (re-OCR אם צריך, ~$0.0015/page), מחשב page_offsets, ומעדכן chunks. idempotent | ידני per-case (`python backfill_chunk_pages.py 8174-24 8137-24`) |
|
||||||
| `rechunk_legacy_precedents.py` | python | **#57** — re-chunk + re-embed פסיקה שהוטמעה לפני תיקון ה-chunker (#55). בוחר כל `case_law` עם chunk זעיר (`length(trim(content))<50` — טביעת-האצבע של ה-chunker הישן) ומריץ `ingest.reindex_case_law` (re-chunk+re-embed מ-`full_text` שמור בלבד — ללא re-OCR/LLM, feedback_no_reocr_retrofit; idempotent DELETE-then-INSERT). idempotent ברמת-הבאטץ' (שואב מחדש את הסט המושפע בכל ריצה). דגל `--limit N`. רץ עם venv של mcp-server (`cd mcp-server && .venv/bin/python ../scripts/rechunk_legacy_precedents.py`) | חד-פעמי — מיגרציית-נתונים של פסיקה legacy (תוקן 2026-06-03) |
|
| `rechunk_legacy_precedents.py` | python | **#57** — re-chunk + re-embed פסיקה שהוטמעה לפני תיקון ה-chunker (#55). בוחר כל `case_law` עם chunk זעיר (`length(trim(content))<50` — טביעת-האצבע של ה-chunker הישן) ומריץ `ingest.reindex_case_law` (re-chunk+re-embed מ-`full_text` שמור בלבד — ללא re-OCR/LLM, feedback_no_reocr_retrofit; idempotent DELETE-then-INSERT). idempotent ברמת-הבאטץ' (שואב מחדש את הסט המושפע בכל ריצה). דגל `--limit N`. רץ עם venv של mcp-server (`cd mcp-server && .venv/bin/python ../scripts/rechunk_legacy_precedents.py`) | חד-פעמי — מיגרציית-נתונים של פסיקה legacy (תוקן 2026-06-03) |
|
||||||
|
| `backfill_nevo_preamble.py` | python | **#86.2** — מיגרציית-נתונים: חיתוך preamble/רציו של נבו שדלף לפסיקה שהוטמעה לפני תיקון #86.1. מאתר כל `case_law` ש-`strip_nevo_preamble(full_text)` עדיין מקצר (דליפה היסטורית), ומבצע: (1) לכידת ה-מיני-רציו ל-`case_law.nevo_ratio` (gold-set ל-#86.3); (2) שכתוב `full_text` החתוך + חישוב-מחדש של `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ללא re-OCR/LLM); (4) **סימון (לא מחיקה)** הלכות ש-`supporting_quote` שלהן בתוך ה-preamble שהוסר → `pending_review` + quality_flag `nevo_preamble_leak`. **שומר-בטיחות:** שורות עם keep%<`--min-keep` (ברירת-מחדל 60) מוחרגות מ-`--apply` כחשד over-strip (אלא אם `--include-suspicious`). **dry-run כברירת-מחדל**; `--apply` כותב backup JSON + manifest CSV ל-`data/audit/` תחילה. idempotent. רץ עם venv של mcp-server. **chair-gated** (לאמת manifest לפני apply) | מיגרציית-נתונים — dry-run בוצע (19 פסקים, 27 הלכות מזוהמות); apply ממתין לאישור |
|
||||||
|
| `nevo_ratio_benchmark.py` | python | **#86.3** — מדידת איכות חילוץ-הלכות מול ה-מיני-רציו של נבו (gold-set מקצועי חינמי). לכל פסק עם `nevo_ratio` (או נגזר מ-`full_text` אם טרם בוצע backfill): LLM-judge מקומי (`claude_session`, אפס עלות) ממפה סמנטית את הלכות-המערכת מול הלכות-נבו ומפיק **recall** (כיסוי הלכות-נבו), **precision** (אחוז הלכותינו הממופות), **granularity** (יחס פירוק — איתות over-extraction ל-#81.5). `--case <num>` / `--all [--limit N]` / `--model` / `--out`. כותב CSV ל-`data/audit/`. רץ עם venv של mcp-server (דורש Claude CLI מקומי). אומת על בג"ץ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ידני — מדידת-איכות (CI/ad-hoc) |
|
||||||
| `audit_corpus_integrity.py` | python | בדיקה תקופתית של עקביות הקורפוס — 3 בדיקות SQL read-only על `case_law` ו-`cases`: (A) `external_upload` עם prefix פנימי `ערר`/`בל"מ`; (B) `internal_committee` חסר `chair_name`/`district`; (C) `cases.practice_area` מחוץ ל-{`rishuy_uvniya`, `betterment_levy`, `compensation_197`, `''`}. כותב log מצטבר ל-`data/logs/corpus_integrity_audit.log` ובמצב הפרות שולח wakeup ל-CEO ב-Paperclip (best-effort, רק אם `PAPERCLIP_API_URL`+`PAPERCLIP_API_KEY` מוגדרים). דגל: `--no-notify`. Idempotent, יוצא 0. **Cron יומי 07:00**: `0 7 * * * /home/chaim/legal-ai/mcp-server/.venv/bin/python /home/chaim/legal-ai/scripts/audit_corpus_integrity.py` | `0 7 * * *` (cron) |
|
| `audit_corpus_integrity.py` | python | בדיקה תקופתית של עקביות הקורפוס — 3 בדיקות SQL read-only על `case_law` ו-`cases`: (A) `external_upload` עם prefix פנימי `ערר`/`בל"מ`; (B) `internal_committee` חסר `chair_name`/`district`; (C) `cases.practice_area` מחוץ ל-{`rishuy_uvniya`, `betterment_levy`, `compensation_197`, `''`}. כותב log מצטבר ל-`data/logs/corpus_integrity_audit.log` ובמצב הפרות שולח wakeup ל-CEO ב-Paperclip (best-effort, רק אם `PAPERCLIP_API_URL`+`PAPERCLIP_API_KEY` מוגדרים). דגל: `--no-notify`. Idempotent, יוצא 0. **Cron יומי 07:00**: `0 7 * * * /home/chaim/legal-ai/mcp-server/.venv/bin/python /home/chaim/legal-ai/scripts/audit_corpus_integrity.py` | `0 7 * * *` (cron) |
|
||||||
| `backfill_legal_arguments.py` | python | Backfill `legal_arguments` לתיקים עם `claims` קיימים (TaskMaster #36). מקבץ פרופוזיציות גולמיות לטיעונים משפטיים מובחנים (~6-12 לכל צד) דרך `argument_aggregator.aggregate_claims_to_arguments` (Claude CLI). תומך `--dry-run`/`--apply`/`--force`/`--case <num>...`. **חייב לרוץ מהמכונה המקומית** (לא קונטיינר) — `claude_session` דורש Claude CLI | ידני per-case (`python scripts/backfill_legal_arguments.py --apply --case 1017-03-26`) |
|
| `backfill_legal_arguments.py` | python | Backfill `legal_arguments` לתיקים עם `claims` קיימים (TaskMaster #36). מקבץ פרופוזיציות גולמיות לטיעונים משפטיים מובחנים (~6-12 לכל צד) דרך `argument_aggregator.aggregate_claims_to_arguments` (Claude CLI). תומך `--dry-run`/`--apply`/`--force`/`--case <num>...`. **חייב לרוץ מהמכונה המקומית** (לא קונטיינר) — `claude_session` דורש Claude CLI | ידני per-case (`python scripts/backfill_legal_arguments.py --apply --case 1017-03-26`) |
|
||||||
| `upload_blam_decisions.py` | python | חד-פעמי (2026-05-26) — העלאת 2 החלטות בל"מ ל-`case_law` (8126/24 סופר נוח, 8047/23 הרנון) דרך `ingest_internal_decision` ישיר, עוקף MCP server שטרם נטען מחדש אחרי הוספת `proceeding_type`. **לא להריץ שוב** | חד-פעמי — להעביר ל-`.archive/` בהזדמנות |
|
| `upload_blam_decisions.py` | python | חד-פעמי (2026-05-26) — העלאת 2 החלטות בל"מ ל-`case_law` (8126/24 סופר נוח, 8047/23 הרנון) דרך `ingest_internal_decision` ישיר, עוקף MCP server שטרם נטען מחדש אחרי הוספת `proceeding_type`. **לא להריץ שוב** | חד-פעמי — להעביר ל-`.archive/` בהזדמנות |
|
||||||
|
|||||||
240
scripts/backfill_nevo_preamble.py
Normal file
240
scripts/backfill_nevo_preamble.py
Normal file
@@ -0,0 +1,240 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""#86.2 — backfill: strip leaked Nevo preamble/ratio from already-ingested rulings.
|
||||||
|
|
||||||
|
Court rulings ingested BEFORE the #86.1 fix kept their Nevo preamble
|
||||||
|
(bibliography + מיני-רציו) because the old ``_DECISION_START`` regex only
|
||||||
|
matched ועדת-ערר openings, not ``פסק-דין``/judge openings. For those rows the
|
||||||
|
preamble is baked into the stored ``full_text`` AND into the chunks — and the
|
||||||
|
מיני-רציו (Nevo's editorial answer-key) may have leaked into extracted
|
||||||
|
halachot, contaminating the corpus.
|
||||||
|
|
||||||
|
This script finds every case_law row whose stored ``full_text`` would still be
|
||||||
|
shortened by the CURRENT ``strip_nevo_preamble`` (i.e. a pre-fix leak), and:
|
||||||
|
|
||||||
|
1. captures the מיני-רציו into ``case_law.nevo_ratio`` (gold-set for #86.3),
|
||||||
|
unless that column is already populated;
|
||||||
|
2. rewrites ``full_text`` to the stripped body + recomputes ``content_hash``;
|
||||||
|
3. re-chunks + re-embeds via ``ingest.reindex_case_law`` (no re-OCR, no LLM);
|
||||||
|
4. flags — never deletes — halachot whose supporting_quote lives entirely in
|
||||||
|
the removed preamble region: review_status -> 'pending_review' plus a
|
||||||
|
'nevo_preamble_leak' quality_flag, so the chair can re-judge them (#84).
|
||||||
|
|
||||||
|
DRY-RUN BY DEFAULT. ``--apply`` performs the migration and first writes a JSON
|
||||||
|
backup + CSV manifest to ``data/audit/`` (per the code-protocol data-migration
|
||||||
|
rule). Idempotent: a re-run finds nothing because stripped rows no longer match.
|
||||||
|
|
||||||
|
Run with the MCP server venv (config loads ~/.env / Infisical for POSTGRES +
|
||||||
|
VOYAGE, same as the live MCP tools):
|
||||||
|
|
||||||
|
cd ~/legal-ai/mcp-server
|
||||||
|
.venv/bin/python ../scripts/backfill_nevo_preamble.py # dry-run
|
||||||
|
.venv/bin/python ../scripts/backfill_nevo_preamble.py --apply # migrate
|
||||||
|
.venv/bin/python ../scripts/backfill_nevo_preamble.py --limit 3 # smoke
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from legal_mcp.services import db, ingest
|
||||||
|
from legal_mcp.services.extractor import extract_nevo_ratio, strip_nevo_preamble
|
||||||
|
from legal_mcp.services.halacha_quality import normalize_text
|
||||||
|
|
||||||
|
REPO_ROOT = Path(__file__).resolve().parent.parent
|
||||||
|
AUDIT_DIR = REPO_ROOT / "data" / "audit"
|
||||||
|
|
||||||
|
# Safety: a clean strip removes only the Nevo preamble (a small head). If the
|
||||||
|
# strip would discard more than this fraction of the document, treat it as a
|
||||||
|
# suspected over-strip (a citation/heading false-match) and DO NOT auto-apply
|
||||||
|
# — surface it for manual review instead. Destroying real decision body is
|
||||||
|
# far worse than leaving a preamble in place.
|
||||||
|
DEFAULT_MIN_KEEP_PCT = 60
|
||||||
|
|
||||||
|
|
||||||
|
async def _scan(conn, limit: int | None) -> list[dict]:
    """Collect case_law rows that the current ``strip_nevo_preamble`` still shortens.

    Every row with a non-empty ``full_text`` is fetched (ordered by
    ``case_number``) and passed through ``strip_nevo_preamble``. Rows the strip
    leaves unchanged are skipped. For each remaining row a dict is built with:

    * ``id``, ``case_number``, ``full_text`` — as stored;
    * ``stripped`` — the strip result;
    * ``removed`` — the leading ``len(full_text) - len(stripped)`` characters
      of ``full_text`` (assumes the strip only drops a prefix — TODO confirm
      against ``strip_nevo_preamble``);
    * ``ratio`` — ``extract_nevo_ratio(full_text)``;
    * ``keep_pct`` — rounded percentage of characters that survive the strip;
    * ``had_ratio_stored`` — whether ``nevo_ratio`` is already non-blank.

    Args:
        conn: database connection exposing an awaitable ``fetch``.
        limit: stop after this many leaked rows; a falsy value means no cap.

    Returns:
        The list of leak descriptors, in ``case_number`` order.
    """
    records = await conn.fetch(
        "SELECT id, case_number, full_text, nevo_ratio "
        "FROM case_law WHERE full_text <> '' ORDER BY case_number"
    )

    leaked: list[dict] = []
    for rec in records:
        original = rec["full_text"] or ""
        body = strip_nevo_preamble(original)
        if body == original:
            # Nothing to strip: already clean, or never carried a preamble.
            continue

        head_len = len(original) - len(body)
        leaked.append({
            "id": rec["id"],
            "case_number": rec["case_number"],
            "full_text": original,
            "stripped": body,
            "removed": original[:head_len],
            "ratio": extract_nevo_ratio(original),
            "keep_pct": round(100 * len(body) / len(original)) if original else 0,
            "had_ratio_stored": bool((rec["nevo_ratio"] or "").strip()),
        })

        if limit and len(leaked) >= limit:
            break

    return leaked
|
||||||
|
|
||||||
|
|
||||||
|
async def _contaminated_halachot(conn, case_law_id, removed: str) -> list[dict]:
    """Return the ruling's halachot whose supporting quote lies inside ``removed``.

    Both the removed preamble and each ``supporting_quote`` are passed through
    ``normalize_text`` before comparison. A halacha is reported when its
    normalized quote is at least 20 characters long and is a substring of the
    normalized preamble. If the normalized preamble is empty, no query is run
    and an empty list is returned.

    Args:
        conn: database connection exposing an awaitable ``fetch``.
        case_law_id: id of the ruling whose halachot are inspected.
        removed: the preamble text that the strip would discard.

    Returns:
        Matching halachot rows converted to plain dicts.
    """
    haystack = normalize_text(removed)
    if not haystack:
        return []

    candidates = await conn.fetch(
        "SELECT id, halacha_index, supporting_quote, review_status, quality_flags "
        "FROM halachot WHERE case_law_id = $1",
        case_law_id,
    )

    def _inside_preamble(rec) -> bool:
        # The 20-char floor keeps very short quotes from matching by accident.
        quote = normalize_text(rec["supporting_quote"] or "")
        return len(quote) >= 20 and quote in haystack

    return [dict(rec) for rec in candidates if _inside_preamble(rec)]
|
||||||
|
|
||||||
|
|
||||||
|
async def _apply_row(pool, h: dict) -> None:
    """Atomically rewrite one ruling's text, store its ratio and flag its halachot.

    All statements run in a single transaction on one pooled connection, so a
    failure leaves the row exactly as it was. Re-indexing is deliberately NOT
    done here (it manages its own writes) — the caller runs it afterwards.
    """
    cid = h["id"]
    async with pool.acquire() as conn:
        async with conn.transaction():
            # Steps 1+2: rewrite full_text + content_hash; store ratio if absent.
            await conn.execute(
                "UPDATE case_law SET full_text = $2, content_hash = $3 WHERE id = $1",
                cid, h["stripped"], db._content_hash(h["stripped"]),
            )
            if h["ratio"] and not h["had_ratio_stored"]:
                await conn.execute(
                    "UPDATE case_law SET nevo_ratio = $2 WHERE id = $1",
                    cid, h["ratio"],
                )
            # Step 4: flag (never delete) contaminated halachot.
            for c in h["contaminated"]:
                flags = list(c["quality_flags"] or [])
                if "nevo_preamble_leak" not in flags:
                    flags.append("nevo_preamble_leak")
                await conn.execute(
                    "UPDATE halachot SET review_status = 'pending_review', "
                    "quality_flags = $2 WHERE id = $1",
                    c["id"], flags,
                )


async def main(args: argparse.Namespace) -> int:
    """Scan for leaked Nevo preambles, report them, and optionally migrate.

    Flow:
      1. Scan the corpus and attach the contaminated halachot to every hit.
      2. Mark hits whose ``keep_pct`` is below ``args.min_keep`` as suspicious.
      3. Print a per-ruling report and write a CSV manifest (dry-run included).
      4. Without ``args.apply``: stop. With it: write a JSON backup of the
         BEFORE state, then migrate each ruling in the apply set (suspicious
         rows only when ``args.include_suspicious`` is set).

    Per ruling, the DB rewrite is committed first and the re-index runs
    afterwards, outside the transaction. These two steps fail differently:

    * rewrite fails  -> transaction rolled back, row untouched, a re-run will
      pick it up again;
    * reindex fails  -> the stripped ``full_text`` is ALREADY committed, so a
      re-run's scan no longer matches the row while its chunks still contain
      the preamble. Such rows are reported separately with their id so they
      can be re-indexed by hand instead of being silently lost.

    Args:
        args: parsed CLI namespace (``apply``, ``limit``, ``min_keep``,
            ``include_suspicious``).

    Returns:
        0 when nothing failed (or nothing was to be done), 1 otherwise.
    """
    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        hits = await _scan(conn, args.limit)
        for h in hits:
            h["contaminated"] = await _contaminated_halachot(conn, h["id"], h["removed"])

    # Partition into safe (auto-appliable) vs suspicious (manual review).
    for h in hits:
        h["suspicious"] = h["keep_pct"] < args.min_keep
    safe = [h for h in hits if not h["suspicious"]]
    suspicious = [h for h in hits if h["suspicious"]]

    n = len(hits)
    total_contam = sum(len(h["contaminated"]) for h in hits)
    print(f"leaked rulings found: {n} (contaminated halachot: {total_contam}; "
          f"safe: {len(safe)}, suspicious<{args.min_keep}%: {len(suspicious)})", flush=True)
    for h in hits:
        print(
            f" {'⚠ ' if h['suspicious'] else ' '}{h['case_number']}: "
            f"keep {h['keep_pct']}%, -{len(h['removed']):,} preamble chars, "
            f"ratio={len(h['ratio'])} chars, "
            f"{len(h['contaminated'])} contaminated halachot"
            + ("" if h["ratio"] else " [no mini-ratio]")
            + (" [ratio already stored]" if h["had_ratio_stored"] else ""),
            flush=True,
        )
    if suspicious:
        print(f"\n⚠ {len(suspicious)} ruling(s) below {args.min_keep}% keep — "
              "EXCLUDED from --apply (suspected over-strip). Review manually or "
              "pass --include-suspicious to force.", flush=True)

    if not hits:
        print("nothing to backfill — corpus clean ✓", flush=True)
        return 0

    apply_set = hits if args.include_suspicious else safe

    # Always write a manifest (dry-run included) for the audit trail.
    # NOTE: the "applied" column records the INTENT to apply, decided before
    # any row is touched; per-row outcomes are only printed below.
    AUDIT_DIR.mkdir(parents=True, exist_ok=True)
    manifest = AUDIT_DIR / f"nevo-backfill-manifest-{ts}.csv"
    with manifest.open("w", encoding="utf-8", newline="") as f:
        w = csv.writer(f)
        w.writerow(["case_law_id", "case_number", "keep_pct", "preamble_chars",
                    "ratio_chars", "contaminated_halachot", "suspicious", "applied"])
        for h in hits:
            will_apply = args.apply and (not h["suspicious"] or args.include_suspicious)
            w.writerow([h["id"], h["case_number"], h["keep_pct"], len(h["removed"]),
                        len(h["ratio"]), len(h["contaminated"]), h["suspicious"], will_apply])
    print(f"manifest: {manifest}", flush=True)

    if not args.apply:
        print("\nDRY-RUN — no changes written. Re-run with --apply to migrate.", flush=True)
        return 0

    # Backup the BEFORE state before mutating anything.
    backup = AUDIT_DIR / f"nevo-backfill-backup-{ts}.json"
    with backup.open("w", encoding="utf-8") as f:
        json.dump([
            {
                "id": str(h["id"]),
                "case_number": h["case_number"],
                "full_text": h["full_text"],
                "ratio": h["ratio"],
                "contaminated": [
                    {"id": str(c["id"]), "halacha_index": c["halacha_index"],
                     "review_status": c["review_status"],
                     "quality_flags": list(c["quality_flags"] or [])}
                    for c in h["contaminated"]
                ],
            }
            for h in apply_set
        ], f, ensure_ascii=False, indent=2)
    print(f"backup: {backup}", flush=True)

    n_apply = len(apply_set)
    ok = 0
    failed: list[tuple] = []          # every row that did not fully migrate
    needs_reindex: list[tuple] = []   # subset: text committed, chunks stale
    for i, h in enumerate(apply_set, 1):
        cid, cn = h["id"], h["case_number"]

        # Phase A: transactional rewrite. On failure the row is untouched.
        try:
            await _apply_row(pool, h)
        except Exception as e:  # noqa: BLE001 — per-row, keep going
            failed.append((cn, str(e)))
            print(f"[{i}/{n_apply}] FAIL {cn}: {e}", flush=True)
            continue

        # Phase B: reindex outside the txn (its own DELETE-then-INSERT +
        # embeddings). A failure here leaves committed text with stale chunks
        # that a re-run will NOT rediscover, so it is tracked separately.
        try:
            res = await ingest.reindex_case_law(cid)
        except Exception as e:  # noqa: BLE001 — per-row, keep going
            failed.append((cn, f"text rewritten but reindex failed: {e}"))
            needs_reindex.append((cn, cid))
            print(f"[{i}/{n_apply}] FAIL {cn}: text rewritten but reindex failed: {e}",
                  flush=True)
            continue

        ok += 1
        print(f"[{i}/{n_apply}] OK {cn}: -> {res['chunks']} chunks, "
              f"{len(h['contaminated'])} halachot flagged", flush=True)

    print(f"\nDONE — {ok}/{n_apply} migrated, {len(failed)} failed"
          + (f", {len(suspicious)} suspicious skipped" if suspicious and not args.include_suspicious else ""),
          flush=True)
    for cn, e in failed:
        print(f" FAILED {cn}: {e}", flush=True)
    if needs_reindex:
        print(f"\n⚠ {len(needs_reindex)} ruling(s) have stripped text but STALE chunks — "
              "a re-run will not detect them; reindex manually:", flush=True)
        for cn, cid in needs_reindex:
            print(f" REINDEX {cn}: ingest.reindex_case_law({cid!s})", flush=True)
    return 0 if not failed else 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: build the parser, run the async migration, and exit
    # with the status code that main() returns.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--apply",
        action="store_true",
        help="perform the migration (default: dry-run)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=None,
        help="process only the first N leaked rulings",
    )
    parser.add_argument(
        "--min-keep",
        type=int,
        default=DEFAULT_MIN_KEEP_PCT,
        help=f"min%% of doc that must remain after strip to auto-apply "
             f"(default {DEFAULT_MIN_KEEP_PCT}); lower = suspected over-strip",
    )
    parser.add_argument(
        "--include-suspicious",
        action="store_true",
        help="force --apply on rows below --min-keep (use with care)",
    )
    args = parser.parse_args()
    exit_code = asyncio.run(main(args))
    sys.exit(exit_code)
|
||||||
173
scripts/nevo_ratio_benchmark.py
Normal file
173
scripts/nevo_ratio_benchmark.py
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""#86.3 — benchmark halacha-extraction quality against Nevo's מיני-רציו gold-set.
|
||||||
|
|
||||||
|
Nevo's editorial מיני-רציו is a free, professionally-written list of a ruling's
|
||||||
|
holdings. By comparing the halachot WE extracted against it we get an honest,
|
||||||
|
zero-cost measurement of extraction quality per ruling:
|
||||||
|
|
||||||
|
* recall — fraction of Nevo's holdings that our halachot cover
|
||||||
|
* precision — fraction of our halachot that map to a Nevo holding
|
||||||
|
* granularity — our_count / nevo_holding_count (over-decomposition signal,
|
||||||
|
the #81.5 concern: e.g. 14 ours vs 4 Nevo = 3.5x)
|
||||||
|
|
||||||
|
The gold-truth ratio is read from ``case_law.nevo_ratio`` (populated by
|
||||||
|
``backfill_nevo_preamble.py`` / ingest). For rulings not yet backfilled it
|
||||||
|
falls back to computing the ratio on-the-fly from the stored ``full_text``,
|
||||||
|
so the harness works before and after the migration.
|
||||||
|
|
||||||
|
An LLM-as-judge (local ``claude_session``, zero API cost) does the semantic
|
||||||
|
mapping — string overlap can't tell "same holding, different words" from a
|
||||||
|
genuinely new holding. The judge is asked to count, not to rewrite.
|
||||||
|
|
||||||
|
Run with the MCP server venv (needs the local ``claude`` CLI):
|
||||||
|
|
||||||
|
cd ~/legal-ai/mcp-server
|
||||||
|
.venv/bin/python ../scripts/nevo_ratio_benchmark.py --case 'בג"ץ 1764/05'
|
||||||
|
.venv/bin/python ../scripts/nevo_ratio_benchmark.py --all --limit 5
|
||||||
|
.venv/bin/python ../scripts/nevo_ratio_benchmark.py --all # full corpus
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import asyncio
|
||||||
|
import csv
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from legal_mcp.services import claude_session, db
|
||||||
|
from legal_mcp.services.extractor import extract_nevo_ratio
|
||||||
|
|
||||||
|
# Two levels up from this file: the script sits in scripts/ under the repo root.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Directory that receives the benchmark CSV reports by default.
AUDIT_DIR = REPO_ROOT.joinpath("data", "audit")

# System prompt for the LLM judge (Hebrew): map our extracted halachot onto the
# Nevo mini-ratio semantically, count, and answer with JSON only.
_JUDGE_SYSTEM = "".join((
    "אתה בוחן-איכות משפטי. נתונים לך (א) רשימת ההלכות (מיני-רציו) שכתב עורך נבו ",
    "עבור פסק-דין — אמת-המידה; (ב) רשימת ההלכות שמערכת אוטומטית חילצה מאותו ",
    "פסק-דין. משימתך: למפות סמנטית בין השתיים (אותו עיקרון משפטי בניסוח שונה = ",
    "התאמה), ולספור. החזר JSON בלבד, ללא טקסט נוסף.",
))
|
||||||
|
|
||||||
|
|
||||||
|
def _judge_prompt(ratio: str, ours: list[str]) -> str:
    """Build the user prompt handed to the LLM judge.

    The prompt contains, in order: the Nevo mini-ratio text, our extracted
    halachot as a 1-based numbered list (a placeholder when the list is
    empty), and the JSON schema the judge is asked to fill in.

    Args:
        ratio: The mini-ratio text used as the yardstick.
        ours: Rule statements extracted by the system for the same ruling.

    Returns:
        The assembled prompt string.
    """
    numbered = [f"{pos}. {statement}" for pos, statement in enumerate(ours, 1)]
    listing = "\n".join(numbered) if numbered else "(אין)"
    sections = (
        f"מיני-רציו של נבו (אמת-מידה):\n{ratio}\n\n",
        f"ההלכות שחולצו על-ידי המערכת ({len(ours)}):\n{listing}\n\n",
        "החזר JSON עם המפתחות:\n",
        '{"nevo_holdings": <מספר העקרונות הנפרדים במיני-רציו>,\n',
        ' "covered": <כמה מעקרונות נבו מכוסים ע"י לפחות הלכה אחת שלנו>,\n',
        ' "ours_total": <מספר ההלכות שלנו>,\n',
        ' "ours_mapped": <כמה מההלכות שלנו ממופות לעיקרון נבו כלשהו>,\n',
        ' "notes": "<עד 2 משפטים: מה הוחמץ / מה עודף>"}',
    )
    return "".join(sections)
|
||||||
|
|
||||||
|
|
||||||
|
def _as_count(value: object) -> int:
    """Coerce a judge-supplied count to a non-negative int (falsy -> 0).

    Raises:
        TypeError, ValueError, OverflowError: if the value is not numeric.
    """
    if not value:
        return 0
    return max(int(float(value)), 0)


async def _bench_one(row: dict, model: str | None) -> dict:
    """Score one ruling's extracted halachot against its Nevo mini-ratio.

    Args:
        row: A ``case_law`` row as a dict. ``id`` and ``case_number`` are
            required; ``nevo_ratio`` and ``full_text`` are read if present.
        model: Judge model name forwarded to ``claude_session.query_json``
            (``None`` is passed through unchanged).

    Returns:
        A flat dict (one CSV row) with the keys ``case_number``,
        ``nevo_holdings``, ``covered``, ``ours_total``, ``ours_mapped``,
        ``recall``, ``precision``, ``granularity``, ``notes`` and ``error``.
        On any failure ``error`` is non-empty and the three metrics stay
        ``None``; this function reports problems instead of raising so a
        single bad ruling cannot abort a corpus-wide run.
    """
    cn = row["case_number"]
    # Prefer the stored ratio; otherwise derive it from the stored full text.
    ratio = (row.get("nevo_ratio") or "").strip() or extract_nevo_ratio(row.get("full_text") or "")
    result = {"case_number": cn, "nevo_holdings": 0, "covered": 0,
              "ours_total": 0, "ours_mapped": 0, "recall": None,
              "precision": None, "granularity": None, "notes": "", "error": ""}
    if not ratio:
        result["error"] = "no mini-ratio"
        return result

    halachot = await db.list_halachot(case_law_id=row["id"], limit=500)
    ours = [h["rule_statement"] for h in halachot
            if h.get("review_status") in ("approved", "published", "pending_review")
            and (h.get("rule_statement") or "").strip()]
    result["ours_total"] = len(ours)
    if not ours:
        result["error"] = "no extracted halachot"
        return result

    try:
        verdict = await claude_session.query_json(
            _judge_prompt(ratio, ours), system=_JUDGE_SYSTEM, model=model, effort="low",
        )
    except Exception as e:  # noqa: BLE001
        result["error"] = f"judge failed: {e}"
        return result
    if not isinstance(verdict, dict):
        result["error"] = "judge returned non-dict"
        return result

    # The judge is an LLM: its "numbers" may come back as text. Treat a
    # non-numeric count as a per-ruling error rather than crashing the run.
    try:
        nh = _as_count(verdict.get("nevo_holdings"))
        cov = _as_count(verdict.get("covered"))
        om = _as_count(verdict.get("ours_mapped"))
    except (TypeError, ValueError, OverflowError) as e:
        result["error"] = f"judge returned non-numeric counts: {e}"
        return result

    # We know exactly how many halachot were sent; never let a judge miscount
    # skew the precision/granularity denominators.
    ot = len(ours)
    # A subset cannot exceed its superset; clamp so recall/precision stay <= 1.
    cov = min(cov, nh)
    om = min(om, ot)

    result.update({
        "nevo_holdings": nh, "covered": cov, "ours_total": ot, "ours_mapped": om,
        "recall": round(cov / nh, 3) if nh else None,
        "precision": round(om / ot, 3) if ot else None,
        "granularity": round(ot / nh, 2) if nh else None,
        "notes": str(verdict.get("notes") or "")[:300],
    })
    return result
|
||||||
|
|
||||||
|
|
||||||
|
async def main(args: argparse.Namespace) -> int:
    """Benchmark the selected rulings and write a CSV report.

    Args:
        args: Parsed CLI namespace; reads ``case``, ``limit``, ``model`` and
            ``out``.

    Returns:
        ``0`` in every path visible here (including "nothing to benchmark").
    """
    pool = await db.get_pool()
    async with pool.acquire() as conn:
        if args.case:
            rows = await conn.fetch(
                "SELECT id, case_number, nevo_ratio, full_text FROM case_law "
                "WHERE case_number = $1", args.case,
            )
        else:
            # rulings that have (or can derive) a ratio
            rows = await conn.fetch(
                "SELECT id, case_number, nevo_ratio, full_text FROM case_law "
                "WHERE nevo_ratio <> '' OR full_text LIKE '%מיני-רציו:%' "
                "ORDER BY case_number"
            )
    rows = [dict(r) for r in rows]
    if args.limit:
        rows = rows[: args.limit]
    if not rows:
        print("no rulings with a mini-ratio found", flush=True)
        return 0

    print(f"benchmarking {len(rows)} ruling(s)...", flush=True)
    results = []
    for i, row in enumerate(rows, 1):
        res = await _bench_one(row, args.model)
        results.append(res)
        if res["error"]:
            print(f"[{i}/{len(rows)}] {res['case_number']}: SKIP ({res['error']})", flush=True)
        else:
            print(f"[{i}/{len(rows)}] {res['case_number']}: "
                  f"recall={res['recall']} precision={res['precision']} "
                  f"granularity={res['granularity']}x "
                  f"(nevo={res['nevo_holdings']}, ours={res['ours_total']})", flush=True)

    scored = [r for r in results if r["recall"] is not None]
    if scored:
        def _mean(key: str) -> float | None:
            # Average each metric over the rows that actually have it, so a
            # stray None in one column cannot raise inside sum().
            values = [r[key] for r in scored if r[key] is not None]
            return round(sum(values) / len(values), 3) if values else None

        print(f"\n=== {len(scored)} scored — mean recall={_mean('recall')} "
              f"precision={_mean('precision')} granularity={_mean('granularity')}x ===", flush=True)

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    out = Path(args.out) if args.out else AUDIT_DIR / f"nevo-ratio-benchmark-{ts}.csv"
    # Create the directory we are about to write into (not just the default
    # one): a missing --out directory must not discard a finished run.
    out.parent.mkdir(parents=True, exist_ok=True)
    with out.open("w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=list(results[0].keys()))
        w.writeheader()
        w.writerows(results)
    print(f"report: {out}", flush=True)
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: exactly one of --case / --all selects the rulings; the
    # value returned by main() becomes the process exit status.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__,
    )
    selection = parser.add_mutually_exclusive_group(required=True)
    selection.add_argument("--case", help="benchmark a single case_number")
    selection.add_argument(
        "--all",
        action="store_true",
        help="benchmark all rulings with a mini-ratio",
    )
    # Remaining options, registered in the same order as they appear in --help.
    optional_flags = (
        ("--limit", {"type": int, "default": None,
                     "help": "cap the number of rulings"}),
        ("--model", {"default": None,
                     "help": "judge model (default: CLI session default)"}),
        ("--out", {"default": None,
                   "help": "output CSV path (default: data/audit/)"}),
    )
    for flag, options in optional_flags:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    exit_status = asyncio.run(main(args))
    sys.exit(exit_status)
|
||||||
Reference in New Issue
Block a user