Files
legal-ai/mcp-server/src/legal_mcp/services/halacha_quality.py
Chaim 2e33cac043 fix(halacha): split authority (derived) from rule_role — stop source-conflation (INV-DM7)
The extractor classified rule_type by SOURCE bindingness (higher-court→binding,
committee→persuasive) instead of by rule KIND. The gold-set proved it: 'binding'
appeared on 19/19 external rulings & 0 committees; 'persuasive' on 13/13
committees & 0 external — only 58% agreement with the human role tags. The two
axes (authority vs rule role) were crammed into one enum.

This splits them per INV-DM7:
- authority (binding/persuasive) — DERIVED from case_law.precedent_level
  (עליון/מנהלי→binding, ועדת_ערר_מחוזית→persuasive), never stored, never
  LLM-guessed. New helper halacha_quality.derive_authority; surfaced read-only
  in list_halachot / goldset_list / search results.
- rule_type — now the rule ROLE only: holding/interpretive/procedural/
  application/obiter. Both extractor prompts unified to this vocabulary;
  _coerce_halacha no longer defaults rule_type from the source; legacy
  binding→holding / persuasive→interpretive fold for safety.

UI: authority shown as a separate read-only badge (gold=מחייב / muted=משכנע)
across the review queue, precedent detail, and gold-set; the gold-set role
selector drops binding/persuasive and adds מהותי (holding).

Migration: scripts/halacha_rule_role_backfill.py re-classifies the 276 pre-split
binding/persuasive rows into a genuine role via local claude_session (run after
deploy). Gold-set correct_type/ai_correct_type 'binding'→'holding' via SQL.

Sources (≥3, per research-decision policy): OASIS LegalRuleML v1.0
(appliesAuthority/Strength as metadata orthogonal to rule logic) · SemEval-2023
Task 6 LegalEval (rhetorical roles by function, authority kept separate) ·
Bluebook signals (weight-of-authority is a separate dimension).

Invariants: ESTABLISHES INV-DM7. Upholds G1 (normalize at source — extractor
classifies role, system derives authority) and G2 (single source of truth —
authority derived, not a parallel stored field). Tests: 211 pass + new
derive_authority/coerce coverage. web-ui build + tsc clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 18:18:41 +00:00

392 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Pure quality validators + dedup helpers for halacha extraction.
These encode the "strict rules" rubric (docs/halacha-strict-rubric.md) that
drove the 2026-06-03 corpus cleanup (1454→534), so that future extraction
comes out clean instead of accumulating duplicates, obiter dicta, truncated
quotes and thin restatements that clog the review queue.
Everything here is a PURE function (no DB, no LLM) so it is fully unit-tested.
The DB-touching dedup-on-insert (uses these helpers) lives in
``db.store_halachot_for_chunk``.
Flags produced by :func:`compute_quality_flags` BLOCK auto-approval (the item
routes to ``pending_review`` regardless of confidence) but never delete — the
chair still sees flagged items, just out of the auto-approved stream.
"""
from __future__ import annotations
import re
# ── Authority axis — DERIVED from the source, never LLM-classified (INV-DM7) ──
#
# A halacha's *authority* (binding vs persuasive) is a property of WHERE it came
# from, not of the rule's content. It is therefore derived deterministically
# from ``case_law.precedent_level`` and never stored on ``halachot`` or guessed
# by the extractor — keeping it orthogonal to ``rule_type`` (the rule ROLE).
# Higher courts (עליון/מנהלי) bind the appeals committee; another committee is
# only persuasive. See docs/spec/02-data-model.md INV-DM7.
AUTHORITY_BINDING = "binding"
AUTHORITY_PERSUASIVE = "persuasive"
_BINDING_LEVELS = {"עליון", "מנהלי"}
_PERSUASIVE_LEVELS = {"ועדת_ערר_מחוזית"}


def derive_authority(precedent_level: str | None) -> str | None:
    """Translate a source's ``precedent_level`` into its authority label.

    Args:
        precedent_level: Level string of the originating decision. ``None`` and
            blank values count as unknown; surrounding whitespace is ignored.

    Returns:
        ``AUTHORITY_BINDING`` when the level is listed in ``_BINDING_LEVELS``,
        ``AUTHORITY_PERSUASIVE`` when it is listed in ``_PERSUASIVE_LEVELS``,
        and ``None`` for anything else (the function never guesses).
    """
    level = (precedent_level or "").strip()
    # Checked in order: binding levels first, then persuasive ones.
    for known_levels, authority in (
        (_BINDING_LEVELS, AUTHORITY_BINDING),
        (_PERSUASIVE_LEVELS, AUTHORITY_PERSUASIVE),
    ):
        if level in known_levels:
            return authority
    return None
# ── Hebrew text normalization (shared with the extractor's quote check) ──
_HEB_QUOTE_VARIANTS = "\"'׳״‘’“”«»„′″"
# Translation table sending every quote / geresh / gershayim variant to an
# ASCII double quote.
_QUOTE_TRANSLATION = str.maketrans(dict.fromkeys(_HEB_QUOTE_VARIANTS, '"'))


def normalize_text(text: str) -> str:
    """Return ``text`` with quote marks unified and whitespace collapsed.

    Every character listed in ``_HEB_QUOTE_VARIANTS`` becomes an ASCII ``"``,
    each run of whitespace becomes a single space, and leading/trailing
    whitespace is removed. Falsy input (``""`` / ``None``) yields ``""``.

    Uses only ``str`` methods, so the module stays dependency-free and
    importable from anywhere.
    """
    if not text:
        return ""
    unified = text.translate(_QUOTE_TRANSLATION)
    # split() with no argument drops leading/trailing whitespace and splits on
    # whitespace runs, so re-joining with one space collapses them.
    return " ".join(unified.split())
# ── Non-decision / obiter detection (Wambaugh: the court did not decide) ──
#
# High-precision markers only. Phrases like "לכאורה" / "ניתן להניח" alone are
# too common to flag reliably, so we require the explicit "declined to rule"
# formulations the rubric calibration confirmed on שפר (idx 32: "איני רואה
# לקבוע מסמרות") and on 8027-25 (idx 18-19: "אין צורך להכריע").
NON_DECISION_MARKERS = (
    "אין צורך להכריע",
    "איני נדרש להכריע",
    "איננו נדרשים להכריע",
    "אין אנו נדרשים להכריע",
    "מתייתר הצורך להכריע",
    "אין צורך לקבוע מסמרות",
    "מבלי לקבוע מסמרות",
    "איני רואה לקבוע מסמרות",
    "איננו רואים לקבוע מסמרות",
    "אין לקבוע מסמרות",
    "אין מקום לקבוע מסמרות",
    "לא ראינו לקבוע מסמרות",
    "למעלה מן הצורך",
    "למעלה מהצורך",
    "למעלה מן הדרוש",
    "מעבר לנדרש",
    "אגב אורחא",
    "אגב אורחה",
)


def detect_non_decision(*texts: str) -> str | None:
    """Find the first "declined to rule" marker present in any of ``texts``.

    Falsy arguments are skipped; the remaining texts are joined with a space
    and normalized before a plain substring search. Markers are tried in the
    order they appear in ``NON_DECISION_MARKERS``.

    Callers pass rule_statement + reasoning_summary + supporting_quote because
    the court's own hedge usually sits in the quote or reasoning rather than in
    the abstracted rule.

    Returns:
        The matching marker string, or ``None`` when no marker occurs.
    """
    haystack = normalize_text(" ".join(filter(None, texts)))
    return next(
        (marker for marker in NON_DECISION_MARKERS if marker in haystack),
        None,
    )
# ── Truncated / incomplete supporting-quote detection ──
#
# Conservative: only flag a CLEAR mid-word cut — the quote's last whitespace-
# delimited token is a single Hebrew letter (a dangling construct/prefix such
# as the "...על ה" in 8099-02-17 idx 6). A complete clause ends in a full word,
# so this does not fire on quotes that merely lack a trailing period (the
# calibration showed ~1/3 of valid quotes drop the final period legitimately).
_HEB_LETTER = "א-ת"


def is_quote_truncated(quote: str) -> bool:
    """Report whether ``quote`` is empty or visibly cut off mid-word.

    The quote is normalized, its last space-delimited token is taken, and the
    trailing characters ``".,;:)]`` are stripped from both ends of that token.
    If exactly one Hebrew letter remains, the quote is treated as truncated.

    Returns:
        ``True`` for an empty/blank quote or a dangling single Hebrew letter
        at the end; ``False`` otherwise.
    """
    cleaned = normalize_text(quote)
    if not cleaned:
        return True
    final_token = cleaned.rsplit(" ", 1)[-1].strip('".,;:)]')
    # NOTE(review): a letter followed by a geresh (e.g. an ordinal such as
    # "א'") also reduces to a single letter here, because normalization turns
    # the geresh into '"' and strip() removes it — confirm this is acceptable.
    return len(final_token) == 1 and re.match(f"[{_HEB_LETTER}]", final_token) is not None
# ── Thin restatement: rule_statement adds nothing over the quote ──
#
# Flag when the rule is essentially a copy of the quote: high token overlap AND
# the rule is no longer than the quote. A genuine halacha ABSTRACTS the rule, so
# it introduces wording the verbatim quote lacks and/or generalizes (longer or
# differently phrased).
# Minimum share of the rule's tokens that must also occur in the quote for the
# rule to count as a thin restatement (compared with ``>=``).
_THIN_OVERLAP = 0.85
# Maximum ratio of normalized rule length to normalized quote length (in
# characters) that still counts as "no longer than the quote" (compared with ``<=``).
_THIN_LEN_RATIO = 1.10
def _tokens(text: str) -> set[str]:
    """Return the distinct Hebrew/digit tokens of ``text`` longer than one char.

    The text is normalized first; a token is a maximal run of Hebrew letters
    (א-ת) and ASCII digits. Single-character tokens are discarded.
    """
    candidates = re.findall(r"[א-ת0-9]+", normalize_text(text))
    return {word for word in candidates if len(word) > 1}
def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
    """Report whether the rule is essentially a copy of its supporting quote.

    Both conditions must hold:

    * the share of the rule's tokens that also appear in the quote is at least
      ``_THIN_OVERLAP``;
    * the normalized rule is at most ``_THIN_LEN_RATIO`` times as long (in
      characters) as the normalized quote.

    Returns ``False`` when either text yields no tokens.
    """
    rule_words = _tokens(rule_statement)
    quote_words = _tokens(supporting_quote)
    if not (rule_words and quote_words):
        return False

    shared_share = len(rule_words & quote_words) / len(rule_words)
    if shared_share < _THIN_OVERLAP:
        return False

    rule_length = len(normalize_text(rule_statement))
    # max(1, ...) guards the division when the normalized quote is empty.
    quote_length = max(1, len(normalize_text(supporting_quote)))
    return rule_length / quote_length <= _THIN_LEN_RATIO
# ── Fact-dependent application: not a generalizable holding (#81.4) ──
#
# The strict rubric's cut_application (docs/halacha-strict-rubric.md §3, §27):
# a determination that rests on the case's specific facts/parties/amounts is an
# illustration, not a holding — it must not enter the corpus as a binding rule.
# The extractor already classifies ``rule_type='application'``; this is a
# HIGH-PRECISION secondary catch for rules the model gave a non-application rule_type,
# using only the unambiguous "applied to THIS case" deixis (bare party words
# like "המערער" appear in genuine rules too, so they are deliberately excluded).
_FACT_DEPENDENT_MARKERS = (
    "במקרה דנן",
    "במקרה שבפנינו",
    "במקרה שלפנינו",
    "במקרה שלפניי",
    "בענייננו",
    "בנדון דידן",
    "בנדון דנן",
    "במקרה שלנו",
    "בנסיבות המקרה שלפנינו",
    "בנסיבות תיק זה",
    "בתיק שלפנינו",
    "בערר שלפנינו",
    "בערר דנן",
)


def is_fact_dependent(rule_statement: str) -> bool:
    """Report whether the rule is worded as an application to THIS case.

    The statement is normalized and searched (plain substring match) for any
    of the "in the present case" phrases in ``_FACT_DEPENDENT_MARKERS``.

    Returns:
        ``True`` as soon as one marker is found, ``False`` if none occur.
    """
    cleaned = normalize_text(rule_statement)
    for marker in _FACT_DEPENDENT_MARKERS:
        if marker in cleaned:
            return True
    return False
# ── Lexical near-duplicate signal (the 0.83–0.90 cosine tail) — #82.3 ──
#
# Embedding cosine alone misses paraphrases that float just below the dedup
# threshold (0.93). A secondary lexical signal — Jaccard over word-shingles +
# normalized Levenshtein on the rule_statement — catches "same rule, reworded"
# in that band without lowering the global cosine threshold. Hybrid
# lexical+semantic beats either alone (arXiv:1805.11611). Pure functions.
def _shingles(text: str, k: int = 2) -> set[str]:
    """Return the set of ``k``-word shingles of ``text``.

    Words are maximal runs of Hebrew letters / ASCII digits taken from the
    normalized text. When there are fewer than ``k`` words, the result is a
    single shingle holding all of them, or the empty set if there are none.
    """
    words = re.findall(r"[א-ת0-9]+", normalize_text(text))
    if len(words) >= k:
        window_count = len(words) - k + 1
        return {" ".join(words[start:start + k]) for start in range(window_count)}
    return {" ".join(words)} if words else set()
def jaccard_shingles(a: str, b: str, k: int = 2) -> float:
    """Return the Jaccard similarity of the ``k``-word shingle sets of ``a`` and ``b``.

    The value is ``|A ∩ B| / |A ∪ B|``; it is ``0.0`` when either text
    produces no shingles.
    """
    left = _shingles(a, k)
    right = _shingles(b, k)
    if left and right:
        return len(left & right) / len(left | right)
    return 0.0
def normalized_levenshtein(a: str, b: str) -> float:
    """Return a similarity in [0, 1] based on character edit distance.

    Both inputs are normalized first. The score is
    ``1 - distance / max(len(a), len(b))``: ``1.0`` means identical and ``0.0``
    means fully different. Two empty texts score ``1.0``; exactly one empty
    text scores ``0.0``.
    """
    left, right = normalize_text(a), normalize_text(b)
    if not (left and right):
        # Both empty -> identical; only one empty -> nothing in common.
        return 1.0 if left == right else 0.0

    # Row-by-row dynamic programme; only the previous row is kept.
    previous = list(range(len(right) + 1))
    for row, left_char in enumerate(left, start=1):
        current = [row]
        for col, right_char in enumerate(right, start=1):
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            substitution = previous[col - 1] + (0 if left_char == right_char else 1)
            current.append(min(deletion, insertion, substitution))
        previous = current

    distance = previous[-1]
    return 1.0 - distance / max(len(left), len(right))
_LEX_JACCARD_MIN = 0.55
_LEX_LEVENSHTEIN_MIN = 0.70


def lexical_near_duplicate(
    a: str,
    b: str,
    jaccard_min: float = _LEX_JACCARD_MIN,
    levenshtein_min: float = _LEX_LEVENSHTEIN_MIN,
) -> bool:
    """Report whether ``a`` and ``b`` overlap lexically enough to be one rule reworded.

    Args:
        a: First text.
        b: Second text.
        jaccard_min: Threshold for the word-shingle Jaccard similarity.
        levenshtein_min: Threshold for the normalized Levenshtein similarity.

    Returns:
        ``True`` when EITHER similarity reaches its threshold.
    """
    if jaccard_shingles(a, b) >= jaccard_min:
        return True
    return normalized_levenshtein(a, b) >= levenshtein_min
# ── Aggregate ──
# Flag identifiers. The first four and FLAG_APPLICATION are emitted by
# compute_quality_flags in this module; FLAG_NLI_UNSUPPORTED and
# FLAG_NEAR_DUPLICATE are not emitted there — presumably set by the caller
# (TODO confirm).
FLAG_NON_DECISION = "non_decision"  # a non-decision marker was found in the texts
FLAG_TRUNCATED_QUOTE = "truncated_quote"  # supporting quote is empty or cut mid-word
FLAG_THIN_RESTATEMENT = "thin_restatement"  # rule is essentially a copy of the quote
FLAG_QUOTE_UNVERIFIED = "quote_unverified"  # caller passed quote_verified=False
FLAG_NLI_UNSUPPORTED = "nli_unsupported" # rule not entailed by its quote (#81.3)
FLAG_APPLICATION = "application" # fact-dependent, not a holding (#81.4)
FLAG_NEAR_DUPLICATE = "near_duplicate" # cosine-tail lexical dup (#82.3)
# ── NLI entailment check (supporting_quote ⊨ rule_statement) — #81.3 ──
#
# Pure prompt-builder + verdict-parser; the LLM call itself runs through
# claude_session in halacha_extractor (local CLI, zero cost). A rule that the
# quote does not actually support (neutral) or contradicts is the model
# over-reaching beyond its source — flag it (blocks auto-approve). EVERYTHING
# here fails OPEN: any parse ambiguity resolves to "entailed" so a flaky judge
# never blocks a genuine halacha.
# System prompt (Hebrew, sent verbatim) for the entailment judge: for each
# {rule, quote} pair it asks for one of three labels and a bare JSON array
# with one label per pair.
NLI_SYSTEM = (
"אתה בודק היסק (entailment) משפטי. לכל זוג {כלל, ציטוט} החלט האם **הכלל נובע מהציטוט** — "
"כלומר הציטוט תומך בכלל ואינו מרחיב מעבר למה שנכתב בו. שלוש תוויות בלבד:\n"
"- entailed = הכלל נתמך במלואו בציטוט.\n"
"- neutral = הציטוט אינו תומך בכלל (הכלל מרחיב/מוסיף מעבר לציטוט).\n"
"- contradiction = הכלל סותר את הציטוט.\n"
'החזר JSON array בלבד באורך מספר הזוגות, לדוגמה: ["entailed","neutral",...]. '
"ללא markdown, ללא הסבר."
)
# The only labels accepted from the judge; parse_nli_verdicts maps anything
# else to "entailed".
_NLI_LABELS = {"entailed", "neutral", "contradiction"}
def build_nli_prompt(items: list[dict]) -> str:
    """Render the judge's user message: one numbered {rule, quote} section per item.

    Args:
        items: Dicts read via ``rule_statement`` and ``supporting_quote``;
            missing or falsy values render as empty strings, and values are
            stripped of surrounding whitespace.

    Returns:
        The sections (numbered from 1) joined by blank lines; ``""`` for an
        empty list.
    """

    def _section(number: int, entry: dict) -> str:
        rule_text = (entry.get("rule_statement") or "").strip()
        quote_text = (entry.get("supporting_quote") or "").strip()
        return f"### זוג {number}\nכלל: {rule_text}\nציטוט: {quote_text}"

    return "\n\n".join(
        _section(number, entry) for number, entry in enumerate(items, start=1)
    )
def parse_nli_verdicts(raw, n: int) -> list[str]:
    """Turn the judge's raw output into exactly ``n`` labels, failing open.

    Args:
        raw: Parsed judge output. Expected to be a list of ``n`` entries, each
            either a label or a dict carrying the label under ``"verdict"``.
        n: Number of pairs that were sent to the judge.

    Returns:
        A list of ``n`` labels. If ``raw`` is not a list of length ``n``, every
        label is ``"entailed"``. Individual entries are stringified, stripped
        and lower-cased; anything outside ``_NLI_LABELS`` becomes ``"entailed"``
        so an unreliable judge never blocks a halacha.
    """
    fallback = "entailed"
    if not (isinstance(raw, list) and len(raw) == n):
        return [fallback] * n

    def _label(entry) -> str:
        candidate = entry.get("verdict") if isinstance(entry, dict) else entry
        text = str(candidate or "").strip().lower()
        return text if text in _NLI_LABELS else fallback

    return [_label(entry) for entry in raw]
# ── Over-extraction consolidation (fold facets of one legal question) — #81.5 ──
#
# #82 dedup-on-insert removes near-EXACT dups (cosine ≥ 0.93). #81.5 handles the
# remaining over-extraction: facets of the SAME legal question, phrased
# differently, that sit BELOW the dedup threshold (the שפר 14-vs-4 / 403-17→89
# granularity gap). A per-precedent claude_session pass groups such facets; the
# extractor keeps one canonical per group and marks the rest rejected (reversible,
# out of the active corpus + review queue). FOLD-ONLY — never merges distinct
# legal questions, never invents. Fails OPEN (parse error → no folds).
# System prompt (Hebrew, sent verbatim) for the consolidation pass: given a
# numbered list of halachot it asks for a JSON array of index groups (at least
# two members each) that answer the same legal question, or [] when there is
# nothing to fold.
CONSOLIDATE_SYSTEM = (
"אתה מאחד פנים-כפולים של הלכות שחולצו מאותו פסק דין. בהינתן רשימה ממוספרת של הלכות, "
"זהה קבוצות של הלכות שהן **אותה שאלה משפטית** בניסוחים או פנים שונים. "
"כללים: (1) אַחֵד רק הלכות שעונות על אותה שאלה משפטית בדיוק; (2) **אל תאַחֵד** הלכות "
"שעונות על שאלות משפטיות שונות (גם אם קרובות בנושא); (3) הלכה ייחודית — אל תכלול בשום קבוצה. "
'החזר JSON array של קבוצות, כל קבוצה = array של מספרי-האינדקס שיש לאַחֵד (לפחות 2 חברים). '
"לדוגמה: [[2,5,9],[14,18]]. אם אין מה לאַחֵד החזר []. ללא markdown, ללא הסבר."
)
def build_consolidation_prompt(items: list[dict]) -> str:
    """Render a precedent's halachot as one ``[index] rule`` line each.

    Args:
        items: Dicts read via ``halacha_index``, ``rule_statement`` and
            ``reasoning_summary``. Text values are stripped; a missing or
            falsy value renders as an empty string.

    Returns:
        The lines joined by newlines. A non-empty reasoning summary is
        appended to its line in parentheses.
    """

    def _render(entry: dict) -> str:
        statement = (entry.get("rule_statement") or "").strip()
        rationale = (entry.get("reasoning_summary") or "").strip()
        suffix = f" (היגיון: {rationale})" if rationale else ""
        index = entry.get("halacha_index")
        return f"[{index}] {statement}{suffix}"

    return "\n".join(_render(entry) for entry in items)
def parse_fold_groups(raw) -> list[list[int]]:
    """Coerce judge output into a list of fold-groups (>= 2 int indices each).

    Fails SAFE: this function never raises on malformed judge output, and any
    shape it cannot interpret results in less folding, never more.

    Args:
        raw: Parsed judge output; expected to be a list of lists of indices.

    Returns:
        One list of unique indices (first-seen order) per usable group.

        * ``raw`` that is not a list yields ``[]``.
        * Group entries that are not lists are skipped.
        * A member is kept only if it is an ``int``, an integer-valued
          ``float`` (``4.0``), or something ``int()`` converts exactly (for
          example the string ``"3"``). Booleans, fractional floats, NaN,
          infinities and unparseable values are dropped, so a garbled index
          is never rounded onto a different halacha.
        * Groups left with fewer than two members are dropped.
    """
    if not isinstance(raw, list):
        return []

    groups: list[list[int]] = []
    for candidate in raw:
        if not isinstance(candidate, list):
            continue

        members: list[int] = []
        for value in candidate:
            # bool is a subclass of int; JSON true/false is not an index.
            if isinstance(value, bool):
                continue
            # is_integer() is False for NaN/inf and for fractional values, so
            # int() below can neither truncate (2.7 -> 2) nor overflow.
            if isinstance(value, float) and not value.is_integer():
                continue
            try:
                members.append(int(value))
            except (TypeError, ValueError, OverflowError):
                # OverflowError covers non-float infinities (e.g. Decimal).
                continue

        # Dedup within the group while preserving first-seen order.
        unique_members = list(dict.fromkeys(members))
        if len(unique_members) >= 2:
            groups.append(unique_members)
    return groups
def compute_quality_flags(
    rule_statement: str,
    supporting_quote: str,
    reasoning_summary: str = "",
    quote_verified: bool = True,
    rule_type: str = "interpretive",
) -> list[str]:
    """Collect the quality flags raised by one halacha (empty list == clean).

    Any non-empty result blocks auto-approval (the item routes to
    pending_review).

    Args:
        rule_statement: The abstracted rule.
        supporting_quote: The verbatim quote backing the rule.
        reasoning_summary: Optional reasoning text; scanned only for
            non-decision markers.
        quote_verified: ``False`` raises ``FLAG_QUOTE_UNVERIFIED``.
        rule_type: The rule role; the value ``"application"`` raises
            ``FLAG_APPLICATION`` on its own.

    Returns:
        Flags in a fixed order: non-decision, truncated quote, thin
        restatement, unverified quote, application.
    """
    checks = (
        (
            FLAG_NON_DECISION,
            bool(detect_non_decision(rule_statement, reasoning_summary, supporting_quote)),
        ),
        (FLAG_TRUNCATED_QUOTE, is_quote_truncated(supporting_quote)),
        (FLAG_THIN_RESTATEMENT, is_thin_restatement(rule_statement, supporting_quote)),
        (FLAG_QUOTE_UNVERIFIED, not quote_verified),
        # #81.4 — an application (fact-dependent) item illustrates rather than
        # states a generalizable holding, so it is never auto-approved: trust
        # the model's rule_type and add the high-precision deixis catch.
        (
            FLAG_APPLICATION,
            rule_type == "application" or is_fact_dependent(rule_statement),
        ),
    )
    return [flag for flag, raised in checks if raised]