feat(nevo): backfill leaked preamble + ratio gold-set benchmark (#86)
#86.2 backfill + #86.3 benchmark, plus a #86.1 over-strip fix found en route.
extractor.py
- extract_nevo_ratio(): capture Nevo's מיני-רציו block (editorial holdings
summary) before it is stripped — a free professional gold-set (#86.3).
- _DECISION_START hardening (#86.2): the merged #86.1 regex over-stripped.
(a) פסק-דין headers are markdown-wrapped (**פסק דין**); the old anchor
required the keyword as the first line char with one separator, so it
missed the header and matched a citation 32K deep (עמ"נ 50567-07-21,
losing 45% of the body). Now tolerates leading markdown + 0-3 separators,
and accepts both the final-nun form (דין) and the suffixed form (דינו).
(b) bare השופט/הנשיא matched CITATIONS ("השופט מ' חשין, פסקה 23"). The
authoring-judge line ends with a colon; we now require it.
ingest.py
- capture the ratio before stripping and store it on the row (best-effort,
non-fatal); also strip the text-upload path (was file-only).
db.py
- add case_law.nevo_ratio column (additive); allow it in update_case_law.
scripts/backfill_nevo_preamble.py (#86.2) — dry-run-by-default data migration:
finds historically-leaked rulings, captures ratio→nevo_ratio, rewrites
full_text (+content_hash), reindexes, and FLAGS (never deletes) halachot whose
quote lives in the removed preamble (review_status=pending_review +
nevo_preamble_leak flag). Safety guard: rows whose keep% is below --min-keep (60)
are excluded from --apply as suspected over-strip. --apply writes backup+manifest
to data/audit/ first. Chair-gated — NOT applied here.
scripts/nevo_ratio_benchmark.py (#86.3) — LLM-as-judge (local claude_session,
zero cost) measures recall/precision/granularity of our halachot vs the Nevo
ratio. Works pre- and post-backfill (reads nevo_ratio, falls back to full_text).
Verified:
- pytest tests/test_nevo_preamble.py — 12 passed (incl. citation/markdown
over-strip regressions).
- backfill dry-run: 19 leaked rulings, 27 contaminated halachot, all ≥75%
keep (the 32K over-strip is gone).
- benchmark on בג"ץ 1764/05: recall=0.875 precision=1.0 granularity=1.75x.
Invariants: G1 (normalize at source — strip/capture at ingest, not at read);
no silent swallow (contaminated halachot flagged + reported, not dropped);
data-migration is dry-run-default with backup+manifest, chair-gated.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -619,6 +619,12 @@ ALTER TABLE case_law ADD COLUMN IF NOT EXISTS practice_area TEXT DEFAULT '';
|
||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS appeal_subtype TEXT DEFAULT '';
|
||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS headnote TEXT DEFAULT '';
|
||||
-- chair-editable abstract shown in search results.
|
||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS nevo_ratio TEXT DEFAULT '';
|
||||
-- The Nevo editorial מיני-רציו block, captured at ingest *before* it is
|
||||
-- stripped from the body (#86.3). Kept separate from `headnote` (which is
|
||||
-- our own abstract) so it can serve as a free professional gold-set for
|
||||
-- benchmarking halacha-extraction recall/precision. Empty when the source
|
||||
-- is not a Nevo export or carries no mini-ratio.
|
||||
ALTER TABLE case_law ADD COLUMN IF NOT EXISTS source_type TEXT DEFAULT '';
|
||||
-- 'court_ruling' | 'appeals_committee'
|
||||
|
||||
@@ -3263,7 +3269,7 @@ async def update_case_law(case_law_id: UUID, **fields) -> dict | None:
|
||||
"""
|
||||
allowed = {
|
||||
"case_number", "case_name", "court", "date", "practice_area", "appeal_subtype",
|
||||
"subject_tags", "summary", "headnote", "key_quote", "source_url",
|
||||
"subject_tags", "summary", "headnote", "nevo_ratio", "key_quote", "source_url",
|
||||
"source_type", "precedent_level", "is_binding", "district", "chair_name",
|
||||
"proceeding_type", "citation_formatted",
|
||||
}
|
||||
|
||||
@@ -362,12 +362,24 @@ _NEVO_MARKERS = ("ספרות:", "חקיקה שאוזכרה:", "מיני-רציו
|
||||
# preamble: bibliography + מיני-רציו). Two families:
|
||||
# - ועדת ערר / district openings (בפנינו / הערר שבנדון / ...)
|
||||
# - COURT-RULING openings (#86.1): a פסק-דין header or the authoring judge's
|
||||
# line ("השופט/ת X:", "כב' השופט", "הנשיא"). Without these, Nevo court
|
||||
# judgments — exactly the ones carrying a מיני-רציו — slipped through unstripped
|
||||
# (e.g. בג"ץ 1764/05), risking that the extractor reads Nevo's answer key.
|
||||
# line. Without these, Nevo court judgments — exactly the ones carrying a
|
||||
# מיני-רציו — slipped through unstripped (e.g. בג"ץ 1764/05).
|
||||
#
|
||||
# #86.2 hardening — two over-strip bugs found while backfilling:
|
||||
# 1. ``פסק-דין`` headers are often markdown-wrapped (``**פסק דין**``); the old
|
||||
# ``^פסק[- ]דין`` required the keyword to be the very first char of the line
|
||||
# and allowed only one separator, so it missed the header and fell through
|
||||
# to a citation 32K deep (עמ"נ 50567-07-21). We now tolerate leading
|
||||
# markdown/whitespace and 0-3 separators.
|
||||
# 2. Bare ``השופט``/``הנשיא`` matched *citations* ("השופט מ' חשין, פסקה 23"),
|
||||
# stripping real decision body. The authoring-judge line ends with a COLON
|
||||
# ("השופט י' עמית:"); citations use a comma. We now require the colon.
|
||||
_DECISION_START = re.compile(
|
||||
r"^(בפנינו|לפנינו|לפניי|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן|"
|
||||
r"פסק[- ]דין|פסק[- ]דינו|כב(?:וד)?['׳]?\s*השופט|המשנה לנשיא|הנשיא|השופט)",
|
||||
r"^[ \t>*_#]{0,6}(?:"
|
||||
r"בפנינו|לפנינו|לפניי|הערר שבנדון|ועדת הערר לתכנון|רקע עובדתי|עסקינן|"
|
||||
r"פסק[ \t\-]{0,3}די(?:ן|נו)|" # פסק-דין / פסק דין / **פסק דין** header (final-nun ן vs דינו)
|
||||
r"(?:כב(?:וד)?['׳\"]?\s*)?(?:ה?שופט[ת]?|ה?נשיא[ה]?|המשנה לנשיא)\s+[^\n,]{1,40}:" # author line → colon
|
||||
r")",
|
||||
re.MULTILINE,
|
||||
)
|
||||
|
||||
@@ -388,3 +400,41 @@ def strip_nevo_preamble(text: str) -> str:
|
||||
logger.debug("Stripped %d chars of Nevo preamble", m.start())
|
||||
return stripped
|
||||
return text
|
||||
|
||||
|
||||
_RATIO_MARKER = "מיני-רציו:"
|
||||
|
||||
|
||||
def extract_nevo_ratio(text: str) -> str:
    """Extract the Nevo mini-ratio (מיני-רציו) block from *text*.

    The mini-ratio is Nevo's editorial summary of the holdings. It is meant
    to be captured before :func:`strip_nevo_preamble` removes the preamble,
    so it can be kept as a gold-set for benchmarking halacha extraction
    (#86.3).

    The returned block starts right after the first ``_RATIO_MARKER``
    occurrence and stops at the earliest of:

    * the first ``_DECISION_START`` match (start of the decision body),
    * the first occurrence of any other entry in ``_NEVO_MARKERS``,
    * the end of the text.

    Args:
        text: Raw document text, possibly carrying a Nevo preamble.

    Returns:
        The mini-ratio with surrounding whitespace stripped, or ``''`` when
        *text* is empty or contains no mini-ratio marker.
    """
    if not text:
        return ""

    marker_at = text.find(_RATIO_MARKER)
    if marker_at < 0:
        return ""

    # Everything after the marker; all cut-off offsets below are relative to
    # this tail, so the result can never extend past it into the judgment.
    tail = text[marker_at + len(_RATIO_MARKER):]

    # Candidate end offsets; the end of the tail is the fallback.
    cutoffs = [len(tail)]

    decision = _DECISION_START.search(tail)
    if decision is not None:
        cutoffs.append(decision.start())

    # Any other preamble marker following the ratio also terminates it. The
    # ratio marker itself is skipped: it is the start of the block, not an end.
    for other in _NEVO_MARKERS:
        if other == _RATIO_MARKER:
            continue
        hit = tail.find(other)
        if hit != -1:
            cutoffs.append(hit)

    return tail[:min(cutoffs)].strip()
|
||||
|
||||
@@ -158,9 +158,14 @@ async def ingest_document(
|
||||
except Exception as e:
|
||||
await progress("failed", 100, f"כשל בחילוץ טקסט: {e}")
|
||||
raise
|
||||
raw_text = extractor.strip_nevo_preamble((raw_text or "")).strip()
|
||||
raw_text = (raw_text or "")
|
||||
else:
|
||||
raw_text = (text or "").strip()
|
||||
raw_text = (text or "")
|
||||
# Capture the Nevo מיני-רציו (editorial holdings summary) BEFORE stripping
|
||||
# it out — it is a free professional gold-set for benchmarking halacha
|
||||
# extraction (#86.3). Stored on the case_law row below once we have its id.
|
||||
nevo_ratio = extractor.extract_nevo_ratio(raw_text)
|
||||
raw_text = extractor.strip_nevo_preamble(raw_text).strip()
|
||||
if not raw_text:
|
||||
await progress("failed", 100, "לא נמצא טקסט בקובץ")
|
||||
raise ValueError("no extractable text in file")
|
||||
@@ -180,6 +185,13 @@ async def ingest_document(
|
||||
)
|
||||
case_law_id = UUID(str(record["id"]))
|
||||
|
||||
# Persist the captured mini-ratio (best-effort; never block ingest on it).
|
||||
if nevo_ratio:
|
||||
try:
|
||||
await db.update_case_law(case_law_id, nevo_ratio=nevo_ratio)
|
||||
except Exception as e: # noqa: BLE001 — additive metadata, non-fatal
|
||||
logger.warning("could not store nevo_ratio for %s: %s", case_law_id, e)
|
||||
|
||||
try:
|
||||
stored_chunks = await _chunk_embed_store(case_law_id, raw_text, page_offsets, page_count, progress)
|
||||
await db.mark_indexed(case_law_id)
|
||||
|
||||
Reference in New Issue
Block a user