feat(halacha): strict-rubric quality gate + dedup-on-insert (#81,#82)

Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the corpus stays clean at the source instead of accumulating duplicates, obiter dicta, truncated quotes and thin restatements that clog the review queue.

#81 — quality gate:
- New pure module halacha_quality.py with unit-tested validators: non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut), thin-restatement (rule≈quote), quote-unverified.
- Validators run in halacha_extractor._process; a non-decision is re-typed obiter; flags persist in new halachot.quality_flags column.
- Auto-approve now requires confidence>=threshold AND no quality flags; flagged items route to pending_review regardless of confidence.
- Both extraction prompts hardened: reject undecided dicta, exclude case-specific applications, require abstraction, forbid over-splitting.

#82 — dedup-on-insert (store_halachot_for_chunk):
- Within the same precedent, skip a halacha whose normalized supporting_quote already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE (0.93) against an already-stored one. Makes re-runs idempotent.

Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER).
Tests: 19 new unit tests; full suite 156 passed.
Validated end-to-end against dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0).
Calibration: flags fire on only ~10% of current survivors (low false-positive).
Spec: docs/halacha-strict-rubric.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 12:30:38 +00:00
parent b0ec24a9d5
commit ca959d4a9c
6 changed files with 386 additions and 18 deletions
--- a/mcp-server/src/legal_mcp/services/halacha_quality.py
+++ b/mcp-server/src/legal_mcp/services/halacha_quality.py
@@ -0,0 +1,158 @@
+"""Pure quality validators + dedup helpers for halacha extraction.
+
+These encode the "strict rules" rubric (docs/halacha-strict-rubric.md) that
+drove the 2026-06-03 corpus cleanup (1454→534), so that future extraction
+comes out clean instead of accumulating duplicates, obiter dicta, truncated
+quotes and thin restatements that clog the review queue.
+
+Everything here is a PURE function (no DB, no LLM) so it is fully unit-tested.
+The DB-touching dedup-on-insert (uses these helpers) lives in
+``db.store_halachot_for_chunk``.
+
+Flags produced by :func:`compute_quality_flags` BLOCK auto-approval (the item
+routes to ``pending_review`` regardless of confidence) but never delete — the
+chair still sees flagged items, just out of the auto-approved stream.
+"""
+
+from __future__ import annotations
+
+import re
+
+# ── Hebrew text normalization (shared with the extractor's quote check) ──
+
+_HEB_QUOTE_VARIANTS = "\"'׳״‘’“”«»„′″"  # each character listed here is rewritten to one ASCII double quote by normalize_text
+
+
+def normalize_text(text: str) -> str:
+    """Collapse whitespace and unify Hebrew quote-mark variants for matching.
+
+    Kept dependency-free (the extractor previously routed through
+    ``proofreader._fix_hebrew_quotes``; here we inline a quote-class collapse so
+    this module stays pure and importable from anywhere).
+    """
+    if not text:
+        return ""
+    # Unify the half-dozen quote/gershayim variants to a single ASCII quote.
+    unified = re.sub(f"[{re.escape(_HEB_QUOTE_VARIANTS)}]", '"', text)
+    return re.sub(r"\s+", " ", unified).strip()
+
+
+# ── Non-decision / obiter detection (Wambaugh: the court did not decide) ──
+#
+# High-precision markers only. Phrases like "לכאורה" / "ניתן להניח" alone are
+# too common to flag reliably, so we require the explicit "declined to rule"
+# formulations the rubric calibration confirmed on שפר (idx 32: "איני רואה
+# לקבוע מסמרות") and on 8027-25 (idx 18-19: "אין צורך להכריע").
+
+NON_DECISION_MARKERS = (  # searched as plain substrings, in this order, by detect_non_decision
+    "אין צורך להכריע",  # approx. "there is no need to decide"
+    "איני נדרש להכריע",  # approx. "I am not required to decide"
+    "איננו נדרשים להכריע",  # approx. "we are not required to decide"
+    "אין אנו נדרשים להכריע",  # approx. "we are not required to decide" (variant wording)
+    "מתייתר הצורך להכריע",  # approx. "the need to decide becomes moot"
+    "אין צורך לקבוע מסמרות",  # approx. "no need to lay down a hard-and-fast rule"
+    "מבלי לקבוע מסמרות",  # approx. "without laying down a hard-and-fast rule"
+    "איני רואה לקבוע מסמרות",  # approx. "I do not see fit to lay down a hard-and-fast rule"
+    "איננו רואים לקבוע מסמרות",  # approx. "we do not see fit to lay down a hard-and-fast rule"
+    "אין לקבוע מסמרות",  # approx. "a hard-and-fast rule should not be laid down"
+    "אין מקום לקבוע מסמרות",  # approx. "there is no room to lay down a hard-and-fast rule"
+    "לא ראינו לקבוע מסמרות",  # approx. "we did not see fit to lay down a hard-and-fast rule"
+    "למעלה מן הצורך",  # approx. "beyond what is necessary"
+    "למעלה מהצורך",  # same phrase, contracted spelling
+    "למעלה מן הדרוש",  # approx. "beyond what is required"
+    "מעבר לנדרש",  # approx. "beyond what is required" (variant wording)
+    "אגב אורחא",  # approx. "incidentally / in passing"
+    "אגב אורחה",  # same phrase, alternate spelling
+)
+
+
+def detect_non_decision(*texts: str) -> str | None:
+    """Return the first non-decision marker found across ``texts`` (or None).
+
+    Scans rule_statement + reasoning_summary + supporting_quote — the court's
+    own hedge usually sits in the quote/reasoning, not the abstracted rule.
+    """
+    joined = normalize_text(" ".join(t for t in texts if t))
+    for marker in NON_DECISION_MARKERS:
+        if marker in joined:
+            return marker
+    return None
+
+
+# ── Truncated / incomplete supporting-quote detection ──
+#
+# Conservative: only flag a CLEAR mid-word cut — the quote's last whitespace-
+# delimited token is a single Hebrew letter (a dangling construct/prefix such
+# as the "...על ה" in 8099-02-17 idx 6). A complete clause ends in a full word,
+# so this does not fire on quotes that merely lack a trailing period (the
+# calibration showed ~1/3 of valid quotes drop the final period legitimately).
+
+_HEB_LETTER = "א-ת"  # regex range alef..tav (U+05D0-U+05EA, final forms included); wrapped in "[...]" by is_quote_truncated
+
+
+def is_quote_truncated(quote: str) -> bool:
+    norm = normalize_text(quote)
+    if not norm:
+        return True
+    tokens = norm.split(" ")
+    last = tokens[-1].strip('".,;:)]')
+    # dangling single Hebrew letter at the end == cut mid-word
+    if len(last) == 1 and re.match(f"[{_HEB_LETTER}]", last):
+        return True
+    return False
+
+
+# ── Thin restatement: rule_statement adds nothing over the quote ──
+#
+# Flag when the rule is essentially a copy of the quote: high token overlap
+# (at least _THIN_OVERLAP of the rule's tokens also occur in the quote) AND the
+# rule is at most _THIN_LEN_RATIO times the quote's length, i.e. no more than
+# ~10% longer. A genuine halacha ABSTRACTS the rule, so it introduces wording
+# the verbatim quote lacks and/or generalizes (longer or differently phrased).
+
+_THIN_OVERLAP = 0.85  # minimum share of the rule's tokens that must also appear in the quote
+_THIN_LEN_RATIO = 1.10  # maximum rule/quote length ratio, measured in characters of the normalized texts
+
+
+def _tokens(text: str) -> set[str]:
+    norm = normalize_text(text)
+    return {t for t in re.split(r"[^א-ת0-9]+", norm) if len(t) > 1}
+
+
+def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
+    rule_t = _tokens(rule_statement)
+    quote_t = _tokens(supporting_quote)
+    if not rule_t or not quote_t:
+        return False
+    overlap = len(rule_t & quote_t) / len(rule_t)
+    len_ratio = len(normalize_text(rule_statement)) / max(1, len(normalize_text(supporting_quote)))
+    return overlap >= _THIN_OVERLAP and len_ratio <= _THIN_LEN_RATIO
+
+
+# ── Aggregate ──
+
+FLAG_NON_DECISION = "non_decision"  # detect_non_decision found a marker in rule/reasoning/quote
+FLAG_TRUNCATED_QUOTE = "truncated_quote"  # is_quote_truncated returned True for the supporting quote
+FLAG_THIN_RESTATEMENT = "thin_restatement"  # is_thin_restatement returned True for rule vs. quote
+FLAG_QUOTE_UNVERIFIED = "quote_unverified"  # the caller passed quote_verified=False
+
+
+def compute_quality_flags(
+    rule_statement: str,
+    supporting_quote: str,
+    reasoning_summary: str = "",
+    quote_verified: bool = True,
+) -> list[str]:
+    """Return the list of quality flags for one halacha (empty == clean).
+
+    Any non-empty result blocks auto-approval (routes to pending_review).
+    """
+    flags: list[str] = []
+    if detect_non_decision(rule_statement, reasoning_summary, supporting_quote):
+        flags.append(FLAG_NON_DECISION)
+    if is_quote_truncated(supporting_quote):
+        flags.append(FLAG_TRUNCATED_QUOTE)
+    if is_thin_restatement(rule_statement, supporting_quote):
+        flags.append(FLAG_THIN_RESTATEMENT)
+    if not quote_verified:
+        flags.append(FLAG_QUOTE_UNVERIFIED)
+    return flags