#86.2 backfill + #86.3 benchmark, plus a #86.1 over-strip fix found en route.
extractor.py
- extract_nevo_ratio(): capture Nevo's מיני-רציו block (editorial holdings
summary) before it is stripped — a free professional gold-set (#86.3).
- _DECISION_START hardening (#86.2): the merged #86.1 regex over-stripped.
(a) פסק-דין headers are markdown-wrapped (**פסק דין**); the old anchor
required the keyword as the first line char with one separator, so it
missed the header and matched a citation 32K deep (עמ"נ 50567-07-21,
losing 45% of the body). Now tolerates leading markdown + 0-3 seps,
and the final-nun form (דין ן vs דינו נ).
(b) bare השופט/הנשיא matched CITATIONS ("השופט מ' חשין, פסקה 23"). The
authoring-judge line ends with a colon; we now require it.
ingest.py
- capture the ratio before stripping and store it on the row (best-effort,
non-fatal); also strip the text-upload path (was file-only).
db.py
- add case_law.nevo_ratio column (additive); allow it in update_case_law.
scripts/backfill_nevo_preamble.py (#86.2) — dry-run-by-default data migration:
finds historically-leaked rulings, captures ratio→nevo_ratio, rewrites
full_text (+content_hash), reindexes, and FLAGS (never deletes) halachot whose
quote lives in the removed preamble (review_status=pending_review +
nevo_preamble_leak flag). Safety guard: rows whose keep% is below --min-keep (60)
are excluded from --apply as suspected over-strip. --apply writes backup+manifest
to data/audit/ first. Chair-gated — NOT applied here.
scripts/nevo_ratio_benchmark.py (#86.3) — LLM-as-judge (local claude_session,
zero cost) measures recall/precision/granularity of our halachot vs the Nevo
ratio. Works pre- and post-backfill (reads nevo_ratio, falls back to full_text).
Verified:
- pytest tests/test_nevo_preamble.py — 12 passed (incl. citation/markdown
over-strip regressions).
- backfill dry-run: 19 leaked rulings, 27 contaminated halachot, all ≥75%
keep (the 32K over-strip is gone).
- benchmark on בג"ץ 1764/05: recall=0.875 precision=1.0 granularity=1.75x.
Invariants: G1 (normalize at source — strip/capture at ingest, not at read);
no silent swallow (contaminated halachot flagged + reported, not dropped);
data-migration is dry-run-default with backup+manifest, chair-gated.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
119 lines
4.7 KiB
Python
119 lines
4.7 KiB
Python
from __future__ import annotations
|
||
|
||
from legal_mcp.services import extractor as ex
|
||
|
||
# Shared fixture: the Nevo preamble block prepended to the decision body in the
# Nevo-sourced test cases.  It consists of a cited-legislation section followed
# by the mini-ratio section, and ends with a blank line.
_PREAMBLE = "".join(
    (
        "חקיקה שאוזכרה:\n",
        "חוק התכנון והבניה, תשכ\"ה-1965: סע' 197\n\n",
        "מיני-רציו:\n",
        "* העותרים לא הוכיחו טעם מיוחד.\n",
        "ביהמ\"ש העליון דחה את העתירה בקובעו:\n",
        "המחוקק הגביל את הזמן ל-3 שנים.\n\n",
    )
)
|
||
|
||
|
||
def test_strips_court_ruling_judge_opening():
    """The preamble is removed when the body opens with the authoring judge.

    Regression for #86.1: court rulings that open with the authoring-judge
    line were previously not stripped.
    """
    stripped = ex.strip_nevo_preamble(
        _PREAMBLE + "השופט ס' ג'ובראן:\n\nהאם קיימים טעמים מיוחדים..."
    )
    # The result must begin at the judge line ...
    assert stripped.startswith("השופט ס' ג'ובראן:")
    # ... and contain nothing from the preamble.
    for leaked in ("מיני-רציו", "דחה את העתירה בקובעו"):
        assert leaked not in stripped
|
||
|
||
|
||
def test_strips_court_ruling_pdin_header():
    """The preamble is removed when the body opens with a hyphenated judgment header."""
    document = _PREAMBLE + "פסק-דין\n\nלפנינו עתירה..."
    stripped = ex.strip_nevo_preamble(document)
    # Output starts at the header and the mini-ratio marker is gone.
    assert stripped.startswith("פסק-דין")
    assert "מיני-רציו" not in stripped
|
||
|
||
|
||
def test_strips_vaada_opening_regression():
    """Pre-existing behaviour: an appeal-committee style opening still strips."""
    document = _PREAMBLE + "בפנינו ערר על החלטת הוועדה המקומית..."
    stripped = ex.strip_nevo_preamble(document)
    # The existing opening phrase must keep working as a body start.
    assert stripped.startswith("בפנינו ערר")
    assert "מיני-רציו" not in stripped
|
||
|
||
|
||
def test_non_nevo_unchanged():
    """Text without Nevo markers is returned as-is, even with a judge line."""
    plain = "פסק דין\nהשופט כהן: בעניין שלפנינו..."
    result = ex.strip_nevo_preamble(plain)
    assert result == plain
|
||
|
||
|
||
def test_nevo_markers_but_no_body_start_unchanged():
    """Markers without anything resembling a decision body leave the text intact."""
    summary_only = "מיני-רציו:\n* תקציר בלבד ללא גוף החלטה\n"
    result = ex.strip_nevo_preamble(summary_only)
    assert result == summary_only
|
||
|
||
|
||
def test_markers_past_400_chars_still_detected():
    """Markers pushed past the old 400-char window are still detected.

    A long court/parties header precedes the preamble.  The header built here
    is about 420 characters (an 18-character court name, 200 two-character
    filler tokens and a newline), so the first marker starts beyond offset 400.
    """
    filler = "x " * 200
    header = "בבית המשפט העליון " + filler + "\n"
    document = header + _PREAMBLE + "השופטת ע' ארבל:\n\nגוף ההחלטה..."
    stripped = ex.strip_nevo_preamble(document)
    assert stripped.startswith("השופטת ע' ארבל:")
|
||
|
||
|
||
# ── extract_nevo_ratio (#86.3 gold-set capture) ──
|
||
|
||
def test_extract_ratio_returns_block_before_body():
    """The extracted ratio holds the summary lines and none of the body."""
    source = _PREAMBLE + "השופט ס' ג'ובראן:\n\nגוף ההחלטה..."
    captured = ex.extract_nevo_ratio(source)
    # Both summary sentences from the preamble must be captured.
    for expected in ("העותרים לא הוכיחו טעם מיוחד", "המחוקק הגביל את הזמן"):
        assert expected in captured
    # The capture must not bleed into the judgment body.
    for body_fragment in ("גוף ההחלטה", "השופט ס' ג'ובראן"):
        assert body_fragment not in captured
|
||
|
||
|
||
def test_extract_ratio_stops_at_following_marker():
    """The ratio ends where a later bibliography marker begins."""
    # Layout: ratio section first, then a cited-cases marker AFTER it, then
    # the judgment header and body.
    source = (
        "מיני-רציו:\n* עיקרון אחד בלבד.\n\n"
        "פסקי דין שאוזכרו:\nבג\"ץ 1/00\n\n"
        "פסק-דין\nגוף..."
    )
    captured = ex.extract_nevo_ratio(source)
    assert "עיקרון אחד בלבד" in captured
    # Neither the following marker nor its content may be captured.
    for excluded in ("פסקי דין שאוזכרו", "בג\"ץ 1/00"):
        assert excluded not in captured
|
||
|
||
|
||
def test_extract_ratio_empty_when_no_marker():
    """Without a ratio marker (or with empty input) the result is an empty string."""
    for unmarked in ("פסק דין\nהשופט כהן: ...", ""):
        assert ex.extract_nevo_ratio(unmarked) == ""
|
||
|
||
|
||
# ── #86.2 over-strip regressions ──
|
||
|
||
def test_citation_judge_line_is_not_a_decision_start():
    """A judge name inside a citation is not treated as the decision opening.

    The cited-judge line here has a comma and no colon, so it is a CITATION.
    Treating it as the opening would strip 32K of real body.
    """
    judgment = (
        "**פסק דין**\n\n"
        "שני ערעורים לפניי. כפי שנפסק מפי כבוד \n\n"
        "השופט מ' חשין, פסקה 23 (להלן עניין קהתי), יש לבחון...\n"
    )
    stripped = ex.strip_nevo_preamble(_PREAMBLE + judgment)
    # Stripping must stop at the markdown-wrapped header, not at the citation.
    assert stripped.startswith("**פסק דין**")
    # The citation stays inside the retained body.
    assert "השופט מ' חשין, פסקה" in stripped
    assert "מיני-רציו" not in stripped
|
||
|
||
|
||
def test_markdown_wrapped_pdin_header_is_stripped():
    """A judgment header wrapped in markdown bold markers is a valid body start."""
    document = _PREAMBLE + "**פסק דין**\n\nשני ערעוריה הנדונים..."
    stripped = ex.strip_nevo_preamble(document)
    # The bold markers are kept as part of the body; the preamble is dropped.
    assert stripped.startswith("**פסק דין**")
    assert "מיני-רציו" not in stripped
|
||
|
||
|
||
def test_author_line_with_colon_still_strips():
    """An authoring-judge line that ends with a colon still starts the body."""
    author_line = "כב' השופטת ד' ברק-ארז:"
    document = _PREAMBLE + "כב' השופטת ד' ברק-ארז:\n\nגוף ההחלטה..."
    stripped = ex.strip_nevo_preamble(document)
    assert stripped.startswith(author_line)
    assert "מיני-רציו" not in stripped
|