feat(halacha): strict-rubric quality gate + dedup-on-insert (#81,#82)
Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the corpus stays clean at the source instead of accumulating duplicates, obiter dicta, truncated quotes and thin restatements that clog the review queue. #81 — quality gate: - New pure module halacha_quality.py with unit-tested validators: non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut), thin-restatement (rule≈quote), quote-unverified. - Validators run in halacha_extractor._process; a non-decision is re-typed obiter; flags persist in new halachot.quality_flags column. - Auto-approve now requires confidence>=threshold AND no quality flags; flagged items route to pending_review regardless of confidence. - Both extraction prompts hardened: reject undecided dicta, exclude case-specific applications, require abstraction, forbid over-splitting. #82 — dedup-on-insert (store_halachot_for_chunk): - Within the same precedent, skip a halacha whose normalized supporting_quote already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE (0.93) against an already-stored one. Makes re-runs idempotent. Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER). Tests: 19 new unit tests; full suite 156 passed. Validated end-to-end against dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0). Calibration: flags fire on only ~10% of current survivors (low false-positive). Spec: docs/halacha-strict-rubric.md Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
158
mcp-server/src/legal_mcp/services/halacha_quality.py
Normal file
158
mcp-server/src/legal_mcp/services/halacha_quality.py
Normal file
@@ -0,0 +1,158 @@
|
||||
"""Pure quality validators + dedup helpers for halacha extraction.
|
||||
|
||||
These encode the "strict rules" rubric (docs/halacha-strict-rubric.md) that
|
||||
drove the 2026-06-03 corpus cleanup (1454→534), so that future extraction
|
||||
comes out clean instead of accumulating duplicates, obiter dicta, truncated
|
||||
quotes and thin restatements that clog the review queue.
|
||||
|
||||
Everything here is a PURE function (no DB, no LLM) so it is fully unit-tested.
|
||||
The DB-touching dedup-on-insert (uses these helpers) lives in
|
||||
``db.store_halachot_for_chunk``.
|
||||
|
||||
Flags produced by :func:`compute_quality_flags` BLOCK auto-approval (the item
|
||||
routes to ``pending_review`` regardless of confidence) but never delete — the
|
||||
chair still sees flagged items, just out of the auto-approved stream.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
# ── Hebrew text normalization (shared with the extractor's quote check) ──
|
||||
|
||||
_HEB_QUOTE_VARIANTS = "\"'׳״‘’“”«»„′″"
|
||||
|
||||
|
||||
def normalize_text(text: str) -> str:
|
||||
"""Collapse whitespace and unify Hebrew quote-mark variants for matching.
|
||||
|
||||
Kept dependency-free (the extractor previously routed through
|
||||
``proofreader._fix_hebrew_quotes``; here we inline a quote-class collapse so
|
||||
this module stays pure and importable from anywhere).
|
||||
"""
|
||||
if not text:
|
||||
return ""
|
||||
# Unify the half-dozen quote/gershayim variants to a single ASCII quote.
|
||||
unified = re.sub(f"[{re.escape(_HEB_QUOTE_VARIANTS)}]", '"', text)
|
||||
return re.sub(r"\s+", " ", unified).strip()
|
||||
|
||||
|
||||
# ── Non-decision / obiter detection (Wambaugh: the court did not decide) ──
|
||||
#
|
||||
# High-precision markers only. Phrases like "לכאורה" / "ניתן להניח" alone are
|
||||
# too common to flag reliably, so we require the explicit "declined to rule"
|
||||
# formulations the rubric calibration confirmed on שפר (idx 32: "איני רואה
|
||||
# לקבוע מסמרות") and on 8027-25 (idx 18-19: "אין צורך להכריע").
|
||||
|
||||
NON_DECISION_MARKERS = (
    "אין צורך להכריע",
    "איני נדרש להכריע",
    "איננו נדרשים להכריע",
    "אין אנו נדרשים להכריע",
    "מתייתר הצורך להכריע",
    "אין צורך לקבוע מסמרות",
    "מבלי לקבוע מסמרות",
    "איני רואה לקבוע מסמרות",
    "איננו רואים לקבוע מסמרות",
    "אין לקבוע מסמרות",
    "אין מקום לקבוע מסמרות",
    "לא ראינו לקבוע מסמרות",
    "למעלה מן הצורך",
    "למעלה מהצורך",
    "למעלה מן הדרוש",
    "מעבר לנדרש",
    "אגב אורחא",
    "אגב אורחה",
)


def detect_non_decision(*texts: str) -> str | None:
    """Find the first "court declined to rule" marker present in ``texts``.

    Falsy arguments are dropped, the rest are joined with single spaces and
    passed through :func:`normalize_text`, and the result is searched for each
    entry of ``NON_DECISION_MARKERS`` in tuple order (plain substring match).

    Args:
        *texts: Any number of text fragments to scan together.

    Returns:
        The first marker (in ``NON_DECISION_MARKERS`` order) that occurs in
        the combined text, or ``None`` when no marker occurs.
    """
    haystack = normalize_text(" ".join(filter(None, texts)))
    return next(
        (marker for marker in NON_DECISION_MARKERS if marker in haystack),
        None,
    )
|
||||
|
||||
|
||||
# ── Truncated / incomplete supporting-quote detection ──
|
||||
#
|
||||
# Conservative: only flag a CLEAR mid-word cut — the quote's last whitespace-
|
||||
# delimited token is a single Hebrew letter (a dangling construct/prefix such
|
||||
# as the "...על ה" in 8099-02-17 idx 6). A complete clause ends in a full word,
|
||||
# so this does not fire on quotes that merely lack a trailing period (the
|
||||
# calibration showed ~1/3 of valid quotes drop the final period legitimately).
|
||||
|
||||
_HEB_LETTER = "א-ת"


def is_quote_truncated(quote: str) -> bool:
    """Report whether ``quote`` is empty or ends in a dangling Hebrew letter.

    The quote is normalized with :func:`normalize_text`; its last
    space-delimited token is then stripped of surrounding ``".,;:)]``
    characters. If exactly one character remains and it falls in the
    ``_HEB_LETTER`` range, the quote is treated as cut mid-word.

    Args:
        quote: The supporting quote to inspect.

    Returns:
        ``True`` when the normalized quote is empty or its final token is a
        single Hebrew letter; ``False`` otherwise.
    """
    cleaned = normalize_text(quote)
    if not cleaned:
        # An empty quote counts as truncated.
        return True
    final_token = cleaned.rsplit(" ", 1)[-1].strip('".,;:)]')
    is_single_char = len(final_token) == 1
    return is_single_char and re.match(f"[{_HEB_LETTER}]", final_token) is not None
|
||||
|
||||
|
||||
# ── Thin restatement: rule_statement adds nothing over the quote ──
|
||||
#
|
||||
# Flag when the rule is essentially a copy of the quote: high token overlap AND
# the rule is at most ~10% longer than the quote (``_THIN_LEN_RATIO``). A genuine
# halacha ABSTRACTS the rule, so it introduces wording the verbatim quote lacks
# and/or generalizes (longer or differently phrased).
|
||||
|
||||
_THIN_OVERLAP = 0.85
_THIN_LEN_RATIO = 1.10


def _tokens(text: str) -> set[str]:
    """Return the set of Hebrew/digit tokens of length >= 2 in ``text``.

    The text is normalized first, then split on every run of characters that
    are neither Hebrew letters nor ASCII digits.
    """
    pieces = re.split(r"[^א-ת0-9]+", normalize_text(text))
    return {piece for piece in pieces if len(piece) >= 2}


def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
    """Report whether the rule merely restates its supporting quote.

    Two conditions must both hold:

    * at least ``_THIN_OVERLAP`` of the rule's tokens also appear in the quote;
    * the normalized rule length divided by the normalized quote length is at
      most ``_THIN_LEN_RATIO``.

    Args:
        rule_statement: The abstracted rule text.
        supporting_quote: The verbatim quote backing the rule.

    Returns:
        ``True`` when both conditions hold; ``False`` otherwise, including
        when either text yields no tokens.
    """
    rule_words = _tokens(rule_statement)
    quote_words = _tokens(supporting_quote)
    if not (rule_words and quote_words):
        return False

    shared_fraction = len(rule_words.intersection(quote_words)) / len(rule_words)
    if shared_fraction < _THIN_OVERLAP:
        return False

    rule_len = len(normalize_text(rule_statement))
    quote_len = max(1, len(normalize_text(supporting_quote)))
    return rule_len / quote_len <= _THIN_LEN_RATIO
|
||||
|
||||
|
||||
# ── Aggregate ──
|
||||
|
||||
FLAG_NON_DECISION = "non_decision"
FLAG_TRUNCATED_QUOTE = "truncated_quote"
FLAG_THIN_RESTATEMENT = "thin_restatement"
FLAG_QUOTE_UNVERIFIED = "quote_unverified"


def compute_quality_flags(
    rule_statement: str,
    supporting_quote: str,
    reasoning_summary: str = "",
    quote_verified: bool = True,
) -> list[str]:
    """Collect the quality flags that apply to one halacha.

    Runs every validator and returns the names of those that fired, always in
    the order: non-decision, truncated quote, thin restatement, quote
    unverified. An empty list means the item is clean.

    Args:
        rule_statement: The abstracted rule text.
        supporting_quote: The verbatim quote backing the rule.
        reasoning_summary: Optional reasoning text, scanned for non-decision
            markers together with the rule and the quote.
        quote_verified: Caller-supplied result of the quote check; a falsy
            value adds ``FLAG_QUOTE_UNVERIFIED``.

    Returns:
        A list of ``FLAG_*`` strings; empty when nothing fired.
    """
    checks = (
        (
            FLAG_NON_DECISION,
            bool(detect_non_decision(rule_statement, reasoning_summary, supporting_quote)),
        ),
        (FLAG_TRUNCATED_QUOTE, is_quote_truncated(supporting_quote)),
        (FLAG_THIN_RESTATEMENT, is_thin_restatement(rule_statement, supporting_quote)),
        (FLAG_QUOTE_UNVERIFIED, not quote_verified),
    )
    return [flag for flag, fired in checks if fired]
|
||||
Reference in New Issue
Block a user