Files
legal-ai/mcp-server/src/legal_mcp/services/halacha_quality.py
Chaim ca959d4a9c feat(halacha): strict-rubric quality gate + dedup-on-insert (#81,#82)
Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the
corpus stays clean at the source instead of accumulating duplicates, obiter
dicta, truncated quotes and thin restatements that clog the review queue.

#81 — quality gate:
- New pure module halacha_quality.py with unit-tested validators:
  non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut),
  thin-restatement (rule≈quote), quote-unverified.
- Validators run in halacha_extractor._process; a non-decision is re-typed
  obiter; flags persist in new halachot.quality_flags column.
- Auto-approve now requires confidence>=threshold AND no quality flags;
  flagged items route to pending_review regardless of confidence.
- Both extraction prompts hardened: reject undecided dicta, exclude
  case-specific applications, require abstraction, forbid over-splitting.

#82 — dedup-on-insert (store_halachot_for_chunk):
- Within the same precedent, skip a halacha whose normalized supporting_quote
  already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE
  (0.93) against an already-stored one. Makes re-runs idempotent.

Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER).
Tests: 19 new unit tests; full suite 156 passed. Validated end-to-end against
dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0).
Calibration: flags fire on only ~10% of current survivors (low false-positive).

Spec: docs/halacha-strict-rubric.md

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 12:30:38 +00:00

159 lines
5.7 KiB
Python

"""Pure quality validators + dedup helpers for halacha extraction.
These encode the "strict rules" rubric (docs/halacha-strict-rubric.md) that
drove the 2026-06-03 corpus cleanup (1454→534), so that future extraction
comes out clean instead of accumulating duplicates, obiter dicta, truncated
quotes and thin restatements that clog the review queue.
Everything here is a PURE function (no DB, no LLM) so it is fully unit-tested.
The DB-touching dedup-on-insert (uses these helpers) lives in
``db.store_halachot_for_chunk``.
Flags produced by :func:`compute_quality_flags` BLOCK auto-approval (the item
routes to ``pending_review`` regardless of confidence) but never delete — the
chair still sees flagged items, just out of the auto-approved stream.
"""
from __future__ import annotations
import re
# ── Hebrew text normalization (shared with the extractor's quote check) ──
_HEB_QUOTE_VARIANTS = "\"'׳״‘’“”«»„′″"


def normalize_text(text: str) -> str:
    """Return ``text`` with quote marks unified and whitespace collapsed.

    Every character listed in ``_HEB_QUOTE_VARIANTS`` (ASCII quotes, geresh /
    gershayim, curly quotes, guillemets, primes) is replaced by a plain ASCII
    double quote. Each run of whitespace then becomes a single space and the
    result is stripped at both ends. Falsy input yields ``""``.

    Deliberately dependency-free: the quote-class collapse is inlined here
    rather than routed through ``proofreader._fix_hebrew_quotes`` so that this
    module stays pure and importable from anywhere.
    """
    if text:
        # One character class built from the variant list; re.escape keeps any
        # class-special character literal.
        quote_class = f"[{re.escape(_HEB_QUOTE_VARIANTS)}]"
        with_ascii_quotes = re.sub(quote_class, '"', text)
        single_spaced = re.sub(r"\s+", " ", with_ascii_quotes)
        return single_spaced.strip()
    return ""
# ── Non-decision / obiter detection (Wambaugh: the court did not decide) ──
#
# High-precision markers only. Phrases like "לכאורה" / "ניתן להניח" alone are
# too common to flag reliably, so we require the explicit "declined to rule"
# formulations the rubric calibration confirmed on שפר (idx 32: "איני רואה
# לקבוע מסמרות") and on 8027-25 (idx 18-19: "אין צורך להכריע").
# Literal Hebrew phrases matched as plain substrings by detect_non_decision,
# which returns the first entry (in the order listed here) that occurs in the
# normalized text. Entries are grouped by the formulation they express.
NON_DECISION_MARKERS = (
    # "no need / not required to decide" (…להכריע)
    "אין צורך להכריע",
    "איני נדרש להכריע",
    "איננו נדרשים להכריע",
    "אין אנו נדרשים להכריע",
    "מתייתר הצורך להכריע",
    # "without laying down a hard-and-fast rule" (…לקבוע מסמרות)
    "אין צורך לקבוע מסמרות",
    "מבלי לקבוע מסמרות",
    "איני רואה לקבוע מסמרות",
    "איננו רואים לקבוע מסמרות",
    "אין לקבוע מסמרות",
    "אין מקום לקבוע מסמרות",
    "לא ראינו לקבוע מסמרות",
    # "beyond what is necessary / required"
    "למעלה מן הצורך",
    "למעלה מהצורך",
    "למעלה מן הדרוש",
    "מעבר לנדרש",
    # "in passing" — two spellings of the same phrase
    "אגב אורחא",
    "אגב אורחה",
)
def detect_non_decision(*texts: str) -> str | None:
    """Find an explicit "the court did not decide" marker in ``texts``.

    The truthy arguments are joined with single spaces and normalized, then
    searched for each entry of ``NON_DECISION_MARKERS`` in tuple order. The
    first marker that occurs as a substring is returned; ``None`` means no
    marker matched. Callers pass rule_statement, reasoning_summary and
    supporting_quote, because the court's own hedge usually sits in the
    quote or reasoning rather than in the abstracted rule.
    """
    haystack = normalize_text(" ".join(filter(None, texts)))
    return next(
        (marker for marker in NON_DECISION_MARKERS if marker in haystack),
        None,
    )
# ── Truncated / incomplete supporting-quote detection ──
#
# Conservative: only flag a CLEAR mid-word cut — the quote's last whitespace-
# delimited token is a single Hebrew letter (a dangling construct/prefix such
# as the "...על ה" in 8099-02-17 idx 6). A complete clause ends in a full word,
# so this does not fire on quotes that merely lack a trailing period (the
# calibration showed ~1/3 of valid quotes drop the final period legitimately).
_HEB_LETTER = "א-ת"
# Punctuation peeled off both ends of the last token before the single-letter
# test. The one-character Unicode ellipsis (U+2026) is included so that a cut
# written as "…" is handled exactly like one written as three ASCII dots.
_QUOTE_EDGE_PUNCT = '".,;:)]\u2026'


def is_quote_truncated(quote: str) -> bool:
    """Return True when ``quote`` is empty or visibly cut off mid-word.

    The quote is normalized (quote marks unified, whitespace collapsed) and
    its last space-delimited token is inspected after surrounding punctuation
    is stripped. If what remains is exactly one Hebrew letter, the quote is
    taken to end on a dangling prefix and is reported as truncated.

    Args:
        quote: The supporting quote to check.

    Returns:
        True for an empty / whitespace-only quote or a quote whose final
        token reduces to a single Hebrew letter; False otherwise. A quote
        that merely lacks a final period is not flagged.
    """
    norm = normalize_text(quote)
    if not norm:
        # Nothing to quote at all counts as truncated.
        return True
    last = norm.split(" ")[-1].strip(_QUOTE_EDGE_PUNCT)
    # A dangling single Hebrew letter at the end == cut mid-word.
    return len(last) == 1 and re.match(f"[{_HEB_LETTER}]", last) is not None
# ── Thin restatement: rule_statement adds nothing over the quote ──
#
# Flag when the rule is essentially a copy of the quote: high token overlap AND
# the rule is at most marginally longer than the quote (see _THIN_LEN_RATIO). A
# genuine halacha ABSTRACTS the rule, so it introduces wording the verbatim
# quote lacks and/or generalizes (longer or differently phrased).
# Minimum fraction of the rule's tokens that must also appear in the quote for
# is_thin_restatement to consider the rule a copy (compared with >=).
_THIN_OVERLAP = 0.85
# Maximum ratio of normalized rule length to normalized quote length, in
# characters, for is_thin_restatement to fire (compared with <=).
_THIN_LEN_RATIO = 1.10
def _tokens(text: str) -> set[str]:
    """Return the set of distinct Hebrew/digit tokens in ``text``.

    The text is normalized and split on every run of characters outside
    the Hebrew range א-ת and the digits 0-9. Pieces shorter than two
    characters (including the empty strings the split produces at the
    edges) are dropped.
    """
    pieces = re.split(r"[^א-ת0-9]+", normalize_text(text))
    return {piece for piece in pieces if len(piece) >= 2}
def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
    """Return True when the rule adds essentially nothing over the quote.

    Both conditions must hold:

    * at least ``_THIN_OVERLAP`` of the rule's distinct tokens also occur
      in the quote, and
    * the normalized rule is at most ``_THIN_LEN_RATIO`` times as long as
      the normalized quote (character count).

    If either text yields no tokens the comparison is meaningless and the
    function returns False.
    """
    rule_words = _tokens(rule_statement)
    quote_words = _tokens(supporting_quote)
    if not (rule_words and quote_words):
        return False

    # Share of the rule's vocabulary that is already present in the quote.
    shared_fraction = len(rule_words.intersection(quote_words)) / len(rule_words)
    if shared_fraction < _THIN_OVERLAP:
        return False

    rule_chars = len(normalize_text(rule_statement))
    quote_chars = len(normalize_text(supporting_quote))
    # max(1, …) guards the division against a zero-length quote.
    return rule_chars / max(1, quote_chars) <= _THIN_LEN_RATIO
# ── Aggregate ──
# Flag identifiers returned by compute_quality_flags, one per check.
FLAG_NON_DECISION = "non_decision"  # detect_non_decision found a marker
FLAG_TRUNCATED_QUOTE = "truncated_quote"  # is_quote_truncated returned True
FLAG_THIN_RESTATEMENT = "thin_restatement"  # is_thin_restatement returned True
FLAG_QUOTE_UNVERIFIED = "quote_unverified"  # caller passed quote_verified=False
def compute_quality_flags(
    rule_statement: str,
    supporting_quote: str,
    reasoning_summary: str = "",
    quote_verified: bool = True,
) -> list[str]:
    """Run every quality check on one halacha and collect the raised flags.

    Args:
        rule_statement: The abstracted rule text.
        supporting_quote: The quote offered in support of the rule.
        reasoning_summary: Optional reasoning text, scanned only for
            non-decision markers.
        quote_verified: Result of the caller's own quote verification;
            False raises ``FLAG_QUOTE_UNVERIFIED``.

    Returns:
        The flags in a fixed order (non-decision, truncated quote, thin
        restatement, quote unverified). An empty list means the item is
        clean; any non-empty result blocks auto-approval (the item routes to
        pending_review).
    """
    # Every check is evaluated; the tuple order fixes the output order.
    outcomes = (
        (
            FLAG_NON_DECISION,
            bool(
                detect_non_decision(
                    rule_statement, reasoning_summary, supporting_quote
                )
            ),
        ),
        (FLAG_TRUNCATED_QUOTE, is_quote_truncated(supporting_quote)),
        (
            FLAG_THIN_RESTATEMENT,
            is_thin_restatement(rule_statement, supporting_quote),
        ),
        (FLAG_QUOTE_UNVERIFIED, not quote_verified),
    )
    return [flag for flag, raised in outcomes if raised]