Bake the 2026-06-03 strict-cleanup rubric into the extraction pipeline so the corpus stays clean at the source instead of accumulating duplicates, obiter dicta, truncated quotes and thin restatements that clog the review queue. #81 — quality gate: - New pure module halacha_quality.py with unit-tested validators: non-decision/obiter (Wambaugh markers), truncated-quote (mid-word cut), thin-restatement (rule≈quote), quote-unverified. - Validators run in halacha_extractor._process; a non-decision is re-typed obiter; flags persist in new halachot.quality_flags column. - Auto-approve now requires confidence>=threshold AND no quality flags; flagged items route to pending_review regardless of confidence. - Both extraction prompts hardened: reject undecided dicta, exclude case-specific applications, require abstraction, forbid over-splitting. #82 — dedup-on-insert (store_halachot_for_chunk): - Within the same precedent, skip a halacha whose normalized supporting_quote already exists, or whose rule-embedding has cosine>=HALACHA_DEDUP_COSINE (0.93) against an already-stored one. Makes re-runs idempotent. Migration: halachot.quality_flags TEXT[] (additive, idempotent ALTER). Tests: 19 new unit tests; full suite 156 passed. Validated end-to-end against dev DB (dedup skips dups, flag blocks auto-approve, re-run inserts 0). Calibration: flags fire on only ~10% of current survivors (low false-positive). Spec: docs/halacha-strict-rubric.md Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
159 lines
5.7 KiB
Python
159 lines
5.7 KiB
Python
"""Pure quality validators + dedup helpers for halacha extraction.
|
|
|
|
These encode the "strict rules" rubric (docs/halacha-strict-rubric.md) that
|
|
drove the 2026-06-03 corpus cleanup (1454→534), so that future extraction
|
|
comes out clean instead of accumulating duplicates, obiter dicta, truncated
|
|
quotes and thin restatements that clog the review queue.
|
|
|
|
Everything here is a PURE function (no DB, no LLM) so it is fully unit-tested.
|
|
The DB-touching dedup-on-insert (uses these helpers) lives in
|
|
``db.store_halachot_for_chunk``.
|
|
|
|
Flags produced by :func:`compute_quality_flags` BLOCK auto-approval (the item
|
|
routes to ``pending_review`` regardless of confidence) but never delete — the
|
|
chair still sees flagged items, just out of the auto-approved stream.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
|
|
# ── Hebrew text normalization (shared with the extractor's quote check) ──
|
|
|
|
_HEB_QUOTE_VARIANTS = "\"'׳״‘’“”«»„′″"

# Compiled once at import time; the patterns are fixed for the module's life.
_QUOTE_CLASS_RE = re.compile(f"[{re.escape(_HEB_QUOTE_VARIANTS)}]")
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def normalize_text(text: str) -> str:
    """Return ``text`` in a canonical form suitable for substring matching.

    Two transformations are applied, in this order:

    1. every character listed in ``_HEB_QUOTE_VARIANTS`` (ASCII quotes,
       geresh/gershayim, curly quotes, guillemets, primes) is replaced by a
       single ASCII double quote;
    2. every run of whitespace is collapsed to one space and the result is
       stripped at both ends.

    A falsy input (``None`` or ``""``) yields ``""``.

    The function uses only the standard library so the module can be
    imported from anywhere without pulling in other project code.
    """
    if not text:
        return ""
    with_plain_quotes = _QUOTE_CLASS_RE.sub('"', text)
    collapsed = _WHITESPACE_RUN_RE.sub(" ", with_plain_quotes)
    return collapsed.strip()
|
|
|
|
|
|
# ── Non-decision / obiter detection (Wambaugh: the court did not decide) ──
|
|
#
|
|
# High-precision markers only. Phrases like "לכאורה" / "ניתן להניח" alone are
|
|
# too common to flag reliably, so we require the explicit "declined to rule"
|
|
# formulations the rubric calibration confirmed on שפר (idx 32: "איני רואה
|
|
# לקבוע מסמרות") and on 8027-25 (idx 18-19: "אין צורך להכריע").
|
|
|
|
# Literal Hebrew phrases matched as plain substrings. The order is preserved
# because detect_non_decision reports the first entry (in this order) that
# occurs in the scanned text.
NON_DECISION_MARKERS = (
    # "no need to decide" formulations
    "אין צורך להכריע",
    "איני נדרש להכריע",
    "איננו נדרשים להכריע",
    "אין אנו נדרשים להכריע",
    "מתייתר הצורך להכריע",
    # "without laying down a firm rule" formulations
    "אין צורך לקבוע מסמרות",
    "מבלי לקבוע מסמרות",
    "איני רואה לקבוע מסמרות",
    "איננו רואים לקבוע מסמרות",
    "אין לקבוע מסמרות",
    "אין מקום לקבוע מסמרות",
    "לא ראינו לקבוע מסמרות",
    # "beyond what is necessary" formulations
    "למעלה מן הצורך",
    "למעלה מהצורך",
    "למעלה מן הדרוש",
    "מעבר לנדרש",
    # "in passing" (two spellings)
    "אגב אורחא",
    "אגב אורחה",
)
|
|
|
|
|
|
def detect_non_decision(*texts: str) -> str | None:
    """Find a "court declined to decide" marker in any of ``texts``.

    Falsy arguments are dropped; the remaining strings are joined with a
    single space and passed through ``normalize_text`` before matching, so
    whitespace and quote-mark differences do not prevent a match.

    Returns:
        The first entry of ``NON_DECISION_MARKERS`` (in tuple order, not in
        order of appearance in the text) that occurs as a substring of the
        combined text, or ``None`` when no marker occurs.
    """
    haystack = normalize_text(" ".join(filter(None, texts)))
    return next(
        (phrase for phrase in NON_DECISION_MARKERS if phrase in haystack),
        None,
    )
|
|
|
|
|
|
# ── Truncated / incomplete supporting-quote detection ──
|
|
#
|
|
# Conservative: only flag a CLEAR mid-word cut — the quote's last whitespace-
|
|
# delimited token is a single Hebrew letter (a dangling construct/prefix such
|
|
# as the "...על ה" in 8099-02-17 idx 6). A complete clause ends in a full word,
|
|
# so this does not fire on quotes that merely lack a trailing period (the
|
|
# calibration showed ~1/3 of valid quotes drop the final period legitimately).
|
|
|
|
_HEB_LETTER = "א-ת"


def is_quote_truncated(quote: str) -> bool:
    """Report whether ``quote`` looks cut off in the middle of a word.

    The quote is normalized with ``normalize_text`` first. It is reported as
    truncated when either:

    * nothing is left after normalization (empty or whitespace-only), or
    * its last space-delimited token, after stripping surrounding quote
      marks and common punctuation, is exactly one Hebrew letter.

    Anything else, including a quote that simply lacks a final period,
    returns ``False``.
    """
    cleaned = normalize_text(quote)
    if not cleaned:
        return True
    # Only the final token matters, so split once from the right.
    tail = cleaned.rsplit(" ", 1)[-1].strip('".,;:)]')
    # A lone Hebrew letter at the very end is treated as a dangling prefix.
    return len(tail) == 1 and re.match(f"[{_HEB_LETTER}]", tail) is not None
|
|
|
|
|
|
# ── Thin restatement: rule_statement adds nothing over the quote ──
|
|
#
|
|
# Flag when the rule is essentially a copy of the quote: high token overlap AND
|
|
# the rule is no longer than the quote. A genuine halacha ABSTRACTS the rule, so
|
|
# it introduces wording the verbatim quote lacks and/or generalizes (longer or
|
|
# differently phrased).
|
|
|
|
# Minimum share of the rule's tokens that must also occur in the quote.
_THIN_OVERLAP = 0.85
# Maximum rule-length / quote-length ratio (in normalized characters).
_THIN_LEN_RATIO = 1.10


def _tokens(text: str) -> set[str]:
    """Return the set of Hebrew/digit tokens of ``text`` longer than one char.

    The text is normalized, then split on every run of characters outside
    the Hebrew-letter and ASCII-digit ranges; single-character and empty
    pieces are discarded.
    """
    pieces = re.split(r"[^א-ת0-9]+", normalize_text(text))
    return {piece for piece in pieces if len(piece) > 1}


def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
    """Report whether the rule is essentially a copy of its supporting quote.

    The rule is "thin" when both conditions hold:

    * at least ``_THIN_OVERLAP`` of the rule's tokens also appear in the
      quote, and
    * the normalized rule is at most ``_THIN_LEN_RATIO`` times as long as
      the normalized quote.

    Returns ``False`` when either text yields no tokens.
    """
    rule_words = _tokens(rule_statement)
    quote_words = _tokens(supporting_quote)
    if not (rule_words and quote_words):
        return False

    shared_fraction = len(rule_words & quote_words) / len(rule_words)
    if shared_fraction < _THIN_OVERLAP:
        return False

    rule_len = len(normalize_text(rule_statement))
    quote_len = max(1, len(normalize_text(supporting_quote)))
    return rule_len / quote_len <= _THIN_LEN_RATIO
|
|
|
|
|
|
# ── Aggregate ──
|
|
|
|
# String identifiers for the quality flags returned by compute_quality_flags.
FLAG_NON_DECISION = "non_decision"  # a non-decision marker was found
FLAG_TRUNCATED_QUOTE = "truncated_quote"  # quote is empty or cut mid-word
FLAG_THIN_RESTATEMENT = "thin_restatement"  # rule is a near-copy of the quote
FLAG_QUOTE_UNVERIFIED = "quote_unverified"  # caller passed quote_verified=False
|
|
|
|
|
|
def compute_quality_flags(
    rule_statement: str,
    supporting_quote: str,
    reasoning_summary: str = "",
    quote_verified: bool = True,
) -> list[str]:
    """Run every validator on one halacha and collect the flags that fired.

    Args:
        rule_statement: The abstracted rule text.
        supporting_quote: The quote offered in support of the rule.
        reasoning_summary: Optional reasoning text; scanned only for
            non-decision markers.
        quote_verified: Result of the caller's own quote verification;
            ``False`` adds ``FLAG_QUOTE_UNVERIFIED``.

    Returns:
        The fired flags in a fixed order: non-decision, truncated quote,
        thin restatement, quote unverified. An empty list means the item
        is clean.
    """
    # Each pair is (flag name, did the check fire); order fixes output order.
    checks = (
        (
            FLAG_NON_DECISION,
            detect_non_decision(rule_statement, reasoning_summary, supporting_quote),
        ),
        (FLAG_TRUNCATED_QUOTE, is_quote_truncated(supporting_quote)),
        (FLAG_THIN_RESTATEMENT, is_thin_restatement(rule_statement, supporting_quote)),
        (FLAG_QUOTE_UNVERIFIED, not quote_verified),
    )
    return [flag for flag, fired in checks if fired]
|