The extractor classified rule_type by SOURCE bindingness (higher-court→binding, committee→persuasive) instead of by rule KIND. The gold-set proved it: 'binding' appeared on 19/19 external rulings & 0 committees; 'persuasive' on 13/13 committees & 0 external — only 58% agreement with the human role tags. The two axes (authority vs rule role) were crammed into one enum.

This splits them per INV-DM7:
- authority (binding/persuasive) — DERIVED from case_law.precedent_level (עליון/מנהלי→binding, ועדת_ערר_מחוזית→persuasive), never stored, never LLM-guessed. New helper halacha_quality.derive_authority; surfaced read-only in list_halachot / goldset_list / search results.
- rule_type — now the rule ROLE only: holding/interpretive/procedural/application/obiter. Both extractor prompts unified to this vocabulary; _coerce_halacha no longer defaults rule_type from the source; legacy binding→holding / persuasive→interpretive fold for safety.

UI: authority shown as a separate read-only badge (gold=מחייב / muted=משכנע) across the review queue, precedent detail, and gold-set; the gold-set role selector drops binding/persuasive and adds מהותי (holding).

Migration: scripts/halacha_rule_role_backfill.py re-classifies the 276 pre-split binding/persuasive rows into a genuine role via local claude_session (run after deploy). Gold-set correct_type/ai_correct_type 'binding'→'holding' via SQL.

Sources (≥3, per research-decision policy): OASIS LegalRuleML v1.0 (appliesAuthority/Strength as metadata orthogonal to rule logic) · SemEval-2023 Task 6 LegalEval (rhetorical roles by function, authority kept separate) · Bluebook signals (weight-of-authority is a separate dimension).

Invariants: ESTABLISHES INV-DM7. Upholds G1 (normalize at source — extractor classifies role, system derives authority) and G2 (single source of truth — authority derived, not a parallel stored field).

Tests: 211 pass + new derive_authority/coerce coverage. web-ui build + tsc clean.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
392 lines
16 KiB
Python
392 lines
16 KiB
Python
"""Pure quality validators + dedup helpers for halacha extraction.
|
||
|
||
These encode the "strict rules" rubric (docs/halacha-strict-rubric.md) that
|
||
drove the 2026-06-03 corpus cleanup (1454→534), so that future extraction
|
||
comes out clean instead of accumulating duplicates, obiter dicta, truncated
|
||
quotes and thin restatements that clog the review queue.
|
||
|
||
Everything here is a PURE function (no DB, no LLM) so it is fully unit-tested.
|
||
The DB-touching dedup-on-insert (uses these helpers) lives in
|
||
``db.store_halachot_for_chunk``.
|
||
|
||
Flags produced by :func:`compute_quality_flags` BLOCK auto-approval (the item
|
||
routes to ``pending_review`` regardless of confidence) but never delete — the
|
||
chair still sees flagged items, just out of the auto-approved stream.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
|
||
# ── Authority axis — DERIVED from the source, never LLM-classified (INV-DM7) ──
|
||
#
|
||
# A halacha's *authority* (binding vs persuasive) is a property of WHERE it came
|
||
# from, not of the rule's content. It is therefore derived deterministically
|
||
# from ``case_law.precedent_level`` and never stored on ``halachot`` or guessed
|
||
# by the extractor — keeping it orthogonal to ``rule_type`` (the rule ROLE).
|
||
# Higher courts (עליון/מנהלי) bind the appeals committee; another committee is
|
||
# only persuasive. See docs/spec/02-data-model.md INV-DM7.
|
||
|
||
# The two labels of the authority axis; the only non-None values that
# derive_authority() returns.
AUTHORITY_BINDING = "binding"
AUTHORITY_PERSUASIVE = "persuasive"

# ``precedent_level`` values grouped by the authority they map to.
_BINDING_LEVELS = {"עליון", "מנהלי"}
_PERSUASIVE_LEVELS = {"ועדת_ערר_מחוזית"}


def derive_authority(precedent_level: str | None) -> str | None:
    """Translate a source's ``precedent_level`` into its authority label.

    Args:
        precedent_level: The level string of the source ruling; surrounding
            whitespace is ignored. ``None`` and the empty string are accepted.

    Returns:
        ``AUTHORITY_BINDING`` when the level is one of ``_BINDING_LEVELS``,
        ``AUTHORITY_PERSUASIVE`` when it is one of ``_PERSUASIVE_LEVELS``, and
        ``None`` for an empty or unrecognised level (the function never
        guesses). Pure: no I/O, no state.
    """
    if not precedent_level:
        return None
    cleaned = precedent_level.strip()
    # Checked in a fixed order: binding first, then persuasive.
    for authority, levels in (
        (AUTHORITY_BINDING, _BINDING_LEVELS),
        (AUTHORITY_PERSUASIVE, _PERSUASIVE_LEVELS),
    ):
        if cleaned in levels:
            return authority
    return None
|
||
|
||
# ── Hebrew text normalization (shared with the extractor's quote check) ──
|
||
|
||
_HEB_QUOTE_VARIANTS = "\"'׳״‘’“”«»„′″"

# Both patterns are compiled once at import time instead of on every call.
_QUOTE_VARIANT_RE = re.compile(f"[{re.escape(_HEB_QUOTE_VARIANTS)}]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Return ``text`` with quote marks unified and whitespace collapsed.

    Every character listed in ``_HEB_QUOTE_VARIANTS`` becomes a plain ASCII
    double quote, each run of whitespace becomes a single space, and leading
    and trailing whitespace is removed. Falsy input (``None`` or ``""``)
    yields ``""``.

    The function relies only on ``re`` so this module can be imported from
    anywhere without pulling in other project code.
    """
    if not text:
        return ""
    with_ascii_quotes = _QUOTE_VARIANT_RE.sub('"', text)
    return _WHITESPACE_RUN_RE.sub(" ", with_ascii_quotes).strip()
|
||
|
||
|
||
# ── Non-decision / obiter detection (Wambaugh: the court did not decide) ──
|
||
#
|
||
# High-precision markers only. Phrases like "לכאורה" / "ניתן להניח" alone are
|
||
# too common to flag reliably, so we require the explicit "declined to rule"
|
||
# formulations the rubric calibration confirmed on שפר (idx 32: "איני רואה
|
||
# לקבוע מסמרות") and on 8027-25 (idx 18-19: "אין צורך להכריע").
|
||
|
||
# Explicit "the court declined to rule" formulations, scanned in this order.
NON_DECISION_MARKERS = (
    "אין צורך להכריע",
    "איני נדרש להכריע",
    "איננו נדרשים להכריע",
    "אין אנו נדרשים להכריע",
    "מתייתר הצורך להכריע",
    "אין צורך לקבוע מסמרות",
    "מבלי לקבוע מסמרות",
    "איני רואה לקבוע מסמרות",
    "איננו רואים לקבוע מסמרות",
    "אין לקבוע מסמרות",
    "אין מקום לקבוע מסמרות",
    "לא ראינו לקבוע מסמרות",
    "למעלה מן הצורך",
    "למעלה מהצורך",
    "למעלה מן הדרוש",
    "מעבר לנדרש",
    "אגב אורחא",
    "אגב אורחה",
)


def detect_non_decision(*texts: str) -> str | None:
    """Find a non-decision marker anywhere in the given texts.

    The non-empty ``texts`` are joined with single spaces and normalized with
    ``normalize_text`` before matching, so quote variants and irregular
    whitespace in the input do not matter.

    Returns:
        The first entry of ``NON_DECISION_MARKERS`` (in tuple order, not in
        order of appearance in the text) that occurs as a substring, or
        ``None`` when no marker is present.
    """
    haystack = normalize_text(" ".join(filter(None, texts)))
    return next(
        (marker for marker in NON_DECISION_MARKERS if marker in haystack),
        None,
    )
|
||
|
||
|
||
# ── Truncated / incomplete supporting-quote detection ──
|
||
#
|
||
# Conservative: only flag a CLEAR mid-word cut — the quote's last whitespace-
|
||
# delimited token is a single Hebrew letter (a dangling construct/prefix such
|
||
# as the "...על ה" in 8099-02-17 idx 6). A complete clause ends in a full word,
|
||
# so this does not fire on quotes that merely lack a trailing period (the
|
||
# calibration showed ~1/3 of valid quotes drop the final period legitimately).
|
||
|
||
# Character-class body (used inside ``[...]``) matching one Hebrew letter.
_HEB_LETTER = "א-ת"


def is_quote_truncated(quote: str) -> bool:
    """Report whether a supporting quote looks cut off mid-word.

    The quote is normalized with ``normalize_text`` first. An empty result is
    treated as truncated. Otherwise the last space-delimited token is taken,
    trailing/leading ``".,;:)]`` characters are stripped from it, and the quote
    is flagged only when what remains is exactly one Hebrew letter.

    Returns:
        ``True`` for an empty quote or a dangling single Hebrew letter at the
        end; ``False`` otherwise.
    """
    cleaned = normalize_text(quote)
    if not cleaned:
        return True
    # rpartition yields the text after the final space (or the whole string
    # when there is no space), i.e. the last token.
    tail = cleaned.rpartition(" ")[2].strip('".,;:)]')
    return len(tail) == 1 and re.match(f"[{_HEB_LETTER}]", tail) is not None
|
||
|
||
|
||
# ── Thin restatement: rule_statement adds nothing over the quote ──
|
||
#
|
||
# Flag when the rule is essentially a copy of the quote: high token overlap AND
|
||
# the rule is no longer than the quote. A genuine halacha ABSTRACTS the rule, so
|
||
# it introduces wording the verbatim quote lacks and/or generalizes (longer or
|
||
# differently phrased).
|
||
|
||
# Minimum share of the rule's tokens that must also occur in the quote.
_THIN_OVERLAP = 0.85
# Maximum rule-length / quote-length ratio (in normalized characters).
_THIN_LEN_RATIO = 1.10


def _tokens(text: str) -> set[str]:
    """Return the distinct Hebrew/digit tokens of ``text`` longer than one char."""
    pieces = re.split(r"[^א-ת0-9]+", normalize_text(text))
    kept: set[str] = set()
    for piece in pieces:
        if len(piece) > 1:
            kept.add(piece)
    return kept


def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
    """Report whether the rule is essentially a copy of its supporting quote.

    Both conditions must hold:

    * at least ``_THIN_OVERLAP`` of the rule's tokens also appear in the quote;
    * the normalized rule is at most ``_THIN_LEN_RATIO`` times as long as the
      normalized quote.

    Returns:
        ``False`` when either side has no tokens; otherwise the conjunction of
        the two conditions above.
    """
    rule_words = _tokens(rule_statement)
    quote_words = _tokens(supporting_quote)
    if not (rule_words and quote_words):
        return False

    shared_fraction = len(rule_words & quote_words) / len(rule_words)
    if shared_fraction < _THIN_OVERLAP:
        return False

    rule_len = len(normalize_text(rule_statement))
    quote_len = len(normalize_text(supporting_quote))
    # max(1, ...) guards the division when the quote normalizes to "".
    return rule_len / max(1, quote_len) <= _THIN_LEN_RATIO
|
||
|
||
|
||
# ── Fact-dependent application: not a generalizable holding (#81.4) ──
|
||
#
|
||
# The strict rubric's cut_application (docs/halacha-strict-rubric.md §3, §27):
|
||
# a determination that rests on the case's specific facts/parties/amounts is an
|
||
# illustration, not a holding — it must not enter the corpus as a generalizable rule.
|
||
# The extractor already classifies ``rule_type='application'``; this is a
|
||
# HIGH-PRECISION secondary catch for rules the model mislabeled as a holding,
|
||
# using only the unambiguous "applied to THIS case" deixis (bare party words
|
||
# like "המערער" appear in genuine rules too, so they are deliberately excluded).
|
||
|
||
# Phrases that explicitly point at "this case"; matched as plain substrings.
_FACT_DEPENDENT_MARKERS = (
    "במקרה דנן",
    "במקרה שבפנינו",
    "במקרה שלפנינו",
    "במקרה שלפניי",
    "בענייננו",
    "בנדון דידן",
    "בנדון דנן",
    "במקרה שלנו",
    "בנסיבות המקרה שלפנינו",
    "בנסיבות תיק זה",
    "בתיק שלפנינו",
    "בערר שלפנינו",
    "בערר דנן",
)


def is_fact_dependent(rule_statement: str) -> bool:
    """Return True when the rule text contains a "this case" marker.

    The statement is normalized with ``normalize_text`` and then searched for
    any entry of ``_FACT_DEPENDENT_MARKERS``; a hit means the rule is phrased
    as an application to the case at hand rather than as a general holding.
    """
    cleaned = normalize_text(rule_statement)
    for marker in _FACT_DEPENDENT_MARKERS:
        if marker in cleaned:
            return True
    return False
|
||
|
||
|
||
# ── Lexical near-duplicate signal (the 0.83–0.90 cosine tail) — #82.3 ──
|
||
#
|
||
# Embedding cosine alone misses paraphrases that float just below the dedup
|
||
# threshold (0.93). A secondary lexical signal — Jaccard over word-shingles +
|
||
# normalized Levenshtein on the rule_statement — catches "same rule, reworded"
|
||
# in that band without lowering the global cosine threshold. Hybrid
|
||
# lexical+semantic beats either alone (arXiv:1805.11611). Pure functions.
|
||
|
||
def _shingles(text: str, k: int = 2) -> set[str]:
    """Return the set of ``k``-word shingles of ``text``.

    Words are the non-empty Hebrew/digit runs of the normalized text. When the
    text has fewer than ``k`` words, the result is a single shingle holding all
    of them, or an empty set if there are no words at all.
    """
    words = [w for w in re.split(r"[^א-ת0-9]+", normalize_text(text)) if w]
    if len(words) >= k:
        window_count = len(words) - k + 1
        return {
            " ".join(words[start : start + k]) for start in range(window_count)
        }
    if not words:
        return set()
    return {" ".join(words)}
|
||
|
||
|
||
def jaccard_shingles(a: str, b: str, k: int = 2) -> float:
    """Return the Jaccard similarity of the ``k``-word shingle sets of ``a`` and ``b``.

    The value is ``|A ∩ B| / |A ∪ B|``; it is ``0.0`` when either text yields
    no shingles.
    """
    first = _shingles(a, k)
    second = _shingles(b, k)
    if not (first and second):
        return 0.0
    shared = len(first & second)
    # |A ∪ B| = |A| + |B| - |A ∩ B|
    return shared / (len(first) + len(second) - shared)
|
||
|
||
|
||
def normalized_levenshtein(a: str, b: str) -> float:
    """Return a 0..1 similarity based on edit distance over normalized text.

    Both inputs pass through ``normalize_text`` first. The result is
    ``1 - distance / max(len(a), len(b))``: ``1.0`` means identical, ``0.0``
    means fully different. Two empty strings score ``1.0``; exactly one empty
    string scores ``0.0``.
    """
    left, right = normalize_text(a), normalize_text(b)
    if not left or not right:
        return 1.0 if not left and not right else 0.0

    # Single-row dynamic programme: ``row`` holds the previous row and is
    # overwritten in place; ``diagonal`` remembers the cell up-and-left.
    row = list(range(len(right) + 1))
    for i, ch_left in enumerate(left, start=1):
        diagonal = row[0]
        row[0] = i
        for j, ch_right in enumerate(right, start=1):
            above = row[j]
            row[j] = min(
                above + 1,                        # deletion
                row[j - 1] + 1,                   # insertion
                diagonal + (ch_left != ch_right), # substitution / match
            )
            diagonal = above
    return 1.0 - row[-1] / max(len(left), len(right))
|
||
|
||
|
||
# Default thresholds for the lexical near-duplicate signal.
_LEX_JACCARD_MIN = 0.55
_LEX_LEVENSHTEIN_MIN = 0.70


def lexical_near_duplicate(
    a: str,
    b: str,
    jaccard_min: float = _LEX_JACCARD_MIN,
    levenshtein_min: float = _LEX_LEVENSHTEIN_MIN,
) -> bool:
    """Report whether two rule statements are lexically near-identical.

    Returns ``True`` when EITHER signal reaches its threshold: the shingle
    Jaccard similarity is at least ``jaccard_min``, or the normalized
    Levenshtein similarity is at least ``levenshtein_min``. The Levenshtein
    score is computed only if the Jaccard test did not already succeed.
    """
    if jaccard_shingles(a, b) >= jaccard_min:
        return True
    return normalized_levenshtein(a, b) >= levenshtein_min
|
||
|
||
|
||
# ── Aggregate ──
|
||
|
||
# Quality-flag identifiers. The values are the strings that end up in a
# halacha's flag list, so they must not change.
FLAG_NON_DECISION = "non_decision"
FLAG_TRUNCATED_QUOTE = "truncated_quote"
FLAG_THIN_RESTATEMENT = "thin_restatement"
FLAG_QUOTE_UNVERIFIED = "quote_unverified"
# Rule not entailed by its quote (#81.3).
FLAG_NLI_UNSUPPORTED = "nli_unsupported"
# Fact-dependent, not a holding (#81.4).
FLAG_APPLICATION = "application"
# Cosine-tail lexical duplicate (#82.3).
FLAG_NEAR_DUPLICATE = "near_duplicate"
|
||
|
||
|
||
# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
|
||
#
|
||
# Pure prompt-builder + verdict-parser; the LLM call itself runs through
|
||
# claude_session in halacha_extractor (local CLI, zero cost). A rule that the
|
||
# quote does not actually support (neutral) or contradicts is the model
|
||
# over-reaching beyond its source — flag it (blocks auto-approve). EVERYTHING
|
||
# here fails OPEN: any parse ambiguity resolves to "entailed" so a flaky judge
|
||
# never blocks a genuine halacha.
|
||
|
||
# System prompt for the entailment judge. It defines the three labels and asks
# for a bare JSON array with one label per {rule, quote} pair.
NLI_SYSTEM = (
    "אתה בודק היסק (entailment) משפטי. לכל זוג {כלל, ציטוט} החלט האם **הכלל נובע מהציטוט** — "
    "כלומר הציטוט תומך בכלל ואינו מרחיב מעבר למה שנכתב בו. שלוש תוויות בלבד:\n"
    "- entailed = הכלל נתמך במלואו בציטוט.\n"
    "- neutral = הציטוט אינו תומך בכלל (הכלל מרחיב/מוסיף מעבר לציטוט).\n"
    "- contradiction = הכלל סותר את הציטוט.\n"
    'החזר JSON array בלבד באורך מספר הזוגות, לדוגמה: ["entailed","neutral",...]. '
    "ללא markdown, ללא הסבר."
)

# The only labels accepted back from the judge.
_NLI_LABELS = {"entailed", "neutral", "contradiction"}
|
||
|
||
|
||
def build_nli_prompt(items: list[dict]) -> str:
    """Render the judge's user message as numbered {rule, quote} blocks.

    Each item contributes one block, numbered from 1, built from its
    ``rule_statement`` and ``supporting_quote`` values (missing or ``None``
    values become empty strings; surrounding whitespace is stripped). Blocks
    are separated by a blank line. An empty ``items`` list yields ``""``.
    """

    def _field(entry: dict, key: str) -> str:
        return (entry.get(key) or "").strip()

    return "\n\n".join(
        f"### זוג {number}\n"
        f"כלל: {_field(entry, 'rule_statement')}\n"
        f"ציטוט: {_field(entry, 'supporting_quote')}"
        for number, entry in enumerate(items, start=1)
    )
|
||
|
||
|
||
def parse_nli_verdicts(raw, n: int) -> list[str]:
    """Turn the judge's raw output into exactly ``n`` labels, failing open.

    Args:
        raw: The parsed judge output. Expected to be a list of ``n`` entries,
            each either a label string or a dict carrying the label under
            ``"verdict"``.
        n: The number of {rule, quote} pairs that were sent.

    Returns:
        A list of ``n`` labels. If ``raw`` is not a list of length ``n``, every
        label is ``"entailed"``. Otherwise each entry is stringified, stripped
        and lower-cased; anything not in ``_NLI_LABELS`` becomes
        ``"entailed"`` so an unreliable judge never blocks a halacha.
    """
    fallback = "entailed"
    if not (isinstance(raw, list) and len(raw) == n):
        return [fallback] * n

    def _label(entry) -> str:
        candidate = entry.get("verdict") if isinstance(entry, dict) else entry
        text = str(candidate or "").strip().lower()
        return text if text in _NLI_LABELS else fallback

    return [_label(entry) for entry in raw]
|
||
|
||
|
||
# ── Over-extraction consolidation (fold facets of one legal question) — #81.5 ──
|
||
#
|
||
# #82 dedup-on-insert removes near-EXACT dups (cosine ≥ 0.93). #81.5 handles the
|
||
# remaining over-extraction: facets of the SAME legal question, phrased
|
||
# differently, that sit BELOW the dedup threshold (the שפר 14-vs-4 / 403-17→89
|
||
# granularity gap). A per-precedent claude_session pass groups such facets; the
|
||
# extractor keeps one canonical per group and marks the rest rejected (reversible,
|
||
# out of the active corpus + review queue). FOLD-ONLY — never merges distinct
|
||
# legal questions, never invents. Fails OPEN (parse error → no folds).
|
||
|
||
# System prompt for the per-precedent consolidation pass. It asks the judge to
# group halachot that answer the same legal question and to reply with a bare
# JSON array of index groups (at least two members each), or [] when there is
# nothing to fold.
CONSOLIDATE_SYSTEM = (
    "אתה מאחד פנים-כפולים של הלכות שחולצו מאותו פסק דין. בהינתן רשימה ממוספרת של הלכות, "
    "זהה קבוצות של הלכות שהן **אותה שאלה משפטית** בניסוחים או פנים שונים. "
    "כללים: (1) אַחֵד רק הלכות שעונות על אותה שאלה משפטית בדיוק; (2) **אל תאַחֵד** הלכות "
    "שעונות על שאלות משפטיות שונות (גם אם קרובות בנושא); (3) הלכה ייחודית — אל תכלול בשום קבוצה. "
    'החזר JSON array של קבוצות, כל קבוצה = array של מספרי-האינדקס שיש לאַחֵד (לפחות 2 חברים). '
    "לדוגמה: [[2,5,9],[14,18]]. אם אין מה לאַחֵד החזר []. ללא markdown, ללא הסבר."
)
|
||
|
||
|
||
def build_consolidation_prompt(items: list[dict]) -> str:
    """Render a precedent's halachot as one ``[index] rule`` line each.

    For every item the line is ``[<halacha_index>] <rule_statement>``; when the
    item has a non-empty ``reasoning_summary`` it is appended in parentheses.
    Text fields are stripped and default to ``""``; the index is inserted as
    returned by ``dict.get`` (so a missing index renders as ``None``). Lines
    are joined with newlines.
    """

    def _render(entry: dict) -> str:
        rule = (entry.get("rule_statement") or "").strip()
        reason = (entry.get("reasoning_summary") or "").strip()
        rendered = f"[{entry.get('halacha_index')}] {rule}"
        return f"{rendered} (היגיון: {reason})" if reason else rendered

    return "\n".join(_render(entry) for entry in items)
|
||
|
||
|
||
def _coerce_fold_index(value) -> int | None:
    """Return ``value`` as an int index, or ``None`` if it is not a whole number.

    Booleans and non-integral floats (including NaN and infinity) are rejected
    rather than coerced, so ``True`` never becomes index 1 and ``2.7`` never
    becomes index 2. Integral floats and numeric strings are still accepted.
    """
    if isinstance(value, bool):
        return None
    if isinstance(value, float) and not value.is_integer():
        return None
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of an infinite numeric value.
        return None


def parse_fold_groups(raw) -> list[list[int]]:
    """Coerce judge output into a list of fold-groups (at least 2 indices each).

    Fails SAFE and never raises on malformed input:

    * ``raw`` that is not a list yields ``[]`` (no folding);
    * group entries that are not lists are skipped;
    * members that are not whole numbers (``None``, booleans, fractional or
      non-finite floats, non-numeric strings) are dropped;
    * repeated indices inside a group are collapsed, keeping first-seen order;
    * groups left with fewer than two members are dropped.

    Returns:
        The surviving groups, in the order the judge listed them.
    """
    if not isinstance(raw, list):
        return []

    groups: list[list[int]] = []
    for candidate in raw:
        if not isinstance(candidate, list):
            continue
        indices = (_coerce_fold_index(value) for value in candidate)
        # dict.fromkeys de-duplicates while preserving insertion order.
        members = list(dict.fromkeys(i for i in indices if i is not None))
        if len(members) >= 2:
            groups.append(members)
    return groups
|
||
|
||
|
||
def compute_quality_flags(
    rule_statement: str,
    supporting_quote: str,
    reasoning_summary: str = "",
    quote_verified: bool = True,
    rule_type: str = "interpretive",
) -> list[str]:
    """Collect the quality flags raised by one halacha (empty list == clean).

    Args:
        rule_statement: The abstracted rule text.
        supporting_quote: The quote offered in support of the rule.
        reasoning_summary: Optional reasoning text, scanned for non-decision
            markers together with the rule and the quote.
        quote_verified: Result of the caller's quote check; ``False`` raises
            ``FLAG_QUOTE_UNVERIFIED``.
        rule_type: The extractor's role label; ``"application"`` raises
            ``FLAG_APPLICATION``.

    Returns:
        The raised flags in a fixed order: non-decision, truncated quote, thin
        restatement, unverified quote, application. Any non-empty result
        blocks auto-approval (the item routes to pending_review).
    """
    verdicts = (
        (
            FLAG_NON_DECISION,
            bool(detect_non_decision(rule_statement, reasoning_summary, supporting_quote)),
        ),
        (FLAG_TRUNCATED_QUOTE, is_quote_truncated(supporting_quote)),
        (FLAG_THIN_RESTATEMENT, is_thin_restatement(rule_statement, supporting_quote)),
        (FLAG_QUOTE_UNVERIFIED, not quote_verified),
        # #81.4 — an application (fact-dependent) item is an illustration, not
        # a generalizable holding: trust the model's rule_type='application'
        # and add the high-precision "this case" phrase check on top.
        (
            FLAG_APPLICATION,
            rule_type == "application" or is_fact_dependent(rule_statement),
        ),
    )
    return [flag for flag, raised in verdicts if raised]
|