feat(halacha): application gate + lexical dedup tail + quality harnesses (#81,#82)
Halacha-extraction quality (#81) and dedup-on-insert (#82) — engine changes (pure + tested) plus measurement/ops tooling. halacha_quality.py - #81.4 application gate: is_fact_dependent() (high-precision "applied to THIS case" deixis per the strict rubric §3/§27) + FLAG_APPLICATION. compute_quality_flags now takes rule_type and flags rule_type=='application' OR fact-dependent — blocking auto-approve (an illustration is not a generalizable holding). - #82.3 lexical tail signal: jaccard_shingles / normalized_levenshtein / lexical_near_duplicate + FLAG_NEAR_DUPLICATE, for the 0.83–0.93 cosine band. halacha_extractor.py — pass rule_type to the flag computation; re-type a binding-labeled fact-application to 'application' (mirrors non_decision→obiter). db.py (store_halachot_for_chunk) — dedup now fetches the nearest same-precedent neighbor once: cosine ≥ DEDUP → skip (unchanged); cosine in [BAND, DEDUP) with high lexical overlap → FLAG_NEAR_DUPLICATE (review, not skip — never drop a possibly-distinct principle unreviewed). config.py — HALACHA_DEDUP_BAND_COSINE (0.83). Scripts: - scripts/halacha_goldset.py (#81.7) — export stratified sample for human tagging; score validators (P/R/F1) against the tags. Backbone for #81.8. - scripts/halacha_batch_reconcile.py (#82.7) — conservative cross-precedent dedup (cosine ≥0.95), dry-run report only. - scripts/calibrate_halacha_dedup.py (#82.1) — calibrate the lexical thresholds against the 2026-06-03 cleanup gold-set. Deferred (documented): #82.4 merge-provenance and #82.5 DB ON CONFLICT/UNIQUE on normalized quote are NOT included — the current skip+flag behavior is safe, whereas a UNIQUE on normalized_quote would fail on existing dups and a blind merge risks losing provenance; they need their own chair-reviewed migration. #82.6 over-merge guard is moot until merge lands. 
#81.6 full rhetorical-role classifier deferred (section pre-filter + application flag cover the practical case); #81.8 blocked on the human-tagged gold-set (harness now provided). Verified: - pytest tests/test_halacha_quality.py — 52 passed (14 new). - calibrate: configured (0.55,0.70) → precision 1.0 (zero false-merge), recall 0.30 — correct profile for an auto-approve-blocking signal. - goldset export: 15-row sample CSV. batch reconcile: 819 halachot → 5 cross-precedent candidate pairs. Invariants: G1 (normalize at source — flag at insert, not at read); §6 (no silent swallow — suspect items flagged to review, never dropped); G2 (no parallel path — same store_halachot_for_chunk / compute_quality_flags). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -154,6 +154,14 @@ HALACHA_AUTO_APPROVE_THRESHOLD = float(
|
||||
# principle. Set > 1.0 to disable semantic dedup (exact-quote dedup still runs).
|
||||
HALACHA_DEDUP_COSINE = float(os.environ.get("HALACHA_DEDUP_COSINE", "0.93"))
|
||||
|
||||
# Halacha dedup TAIL band (#82.3) — the [BAND_COSINE, DEDUP_COSINE) range is too
|
||||
# low to auto-skip but suspicious. A halacha whose nearest same-precedent
|
||||
# neighbor sits in this band AND has high LEXICAL overlap (Jaccard/Levenshtein
|
||||
# on rule_statement) is flagged 'near_duplicate' (blocks auto-approve → review),
|
||||
# not skipped — catching paraphrases the cosine threshold misses without
|
||||
# dropping a possibly-distinct principle unreviewed. 0.83 from the same cleanup.
|
||||
HALACHA_DEDUP_BAND_COSINE = float(os.environ.get("HALACHA_DEDUP_BAND_COSINE", "0.83"))
|
||||
|
||||
# Halacha NLI entailment validator (#81.3) — after extraction, a claude_session
|
||||
# judge checks each halacha's rule_statement is entailed by its supporting_quote.
|
||||
# Non-entailed (neutral/contradiction) → quality flag 'nli_unsupported' that
|
||||
|
||||
@@ -3699,6 +3699,7 @@ async def store_halachot_for_chunk(
|
||||
"""
|
||||
threshold = config.HALACHA_AUTO_APPROVE_THRESHOLD
|
||||
dedup_distance = 1.0 - config.HALACHA_DEDUP_COSINE # cosine sim → distance
|
||||
band_distance = 1.0 - config.HALACHA_DEDUP_BAND_COSINE # tail-band ceiling (#82.3)
|
||||
pool = await get_pool()
|
||||
inserted = 0
|
||||
skipped = 0
|
||||
@@ -3722,21 +3723,32 @@ async def store_halachot_for_chunk(
|
||||
if norm_quote and norm_quote in existing_quotes:
|
||||
skipped += 1
|
||||
continue
|
||||
# 2) semantic near-duplicate (rule embedding cosine)
|
||||
# 2) semantic near-duplicate (rule embedding cosine) — fetch the
|
||||
# nearest same-precedent neighbor once so we can both auto-skip
|
||||
# (cosine ≥ DEDUP) and flag the lexical tail (#82.3).
|
||||
emb = h.get("embedding")
|
||||
flags = list(h.get("quality_flags") or [])
|
||||
if emb is not None and config.HALACHA_DEDUP_COSINE <= 1.0:
|
||||
dup = await conn.fetchval(
|
||||
"SELECT 1 FROM halachot WHERE case_law_id = $1 "
|
||||
"AND embedding IS NOT NULL AND (embedding <=> $2) <= $3 "
|
||||
"LIMIT 1",
|
||||
case_law_id, emb, dedup_distance,
|
||||
neighbor = await conn.fetchrow(
|
||||
"SELECT rule_statement, (embedding <=> $2) AS dist "
|
||||
"FROM halachot WHERE case_law_id = $1 "
|
||||
"AND embedding IS NOT NULL "
|
||||
"ORDER BY embedding <=> $2 LIMIT 1",
|
||||
case_law_id, emb,
|
||||
)
|
||||
if dup:
|
||||
skipped += 1
|
||||
continue
|
||||
if neighbor is not None:
|
||||
dist = float(neighbor["dist"])
|
||||
if dist <= dedup_distance:
|
||||
skipped += 1
|
||||
continue
|
||||
# tail band: below auto-skip but lexically near → flag.
|
||||
if (dist <= band_distance
|
||||
and halacha_quality.FLAG_NEAR_DUPLICATE not in flags
|
||||
and halacha_quality.lexical_near_duplicate(
|
||||
h["rule_statement"], neighbor["rule_statement"])):
|
||||
flags.append(halacha_quality.FLAG_NEAR_DUPLICATE)
|
||||
|
||||
confidence = float(h.get("confidence", 0.0))
|
||||
flags = h.get("quality_flags") or []
|
||||
auto_approve = confidence >= threshold and not flags
|
||||
review_status = "approved" if auto_approve else "pending_review"
|
||||
reviewer = (
|
||||
|
||||
@@ -592,10 +592,16 @@ async def _extract_impl(case_law_id: UUID, force: bool = False,
|
||||
flags = halacha_quality.compute_quality_flags(
|
||||
coerced["rule_statement"], coerced["supporting_quote"],
|
||||
coerced["reasoning_summary"], coerced["quote_verified"],
|
||||
coerced["rule_type"],
|
||||
)
|
||||
coerced["quality_flags"] = flags
|
||||
if halacha_quality.FLAG_NON_DECISION in flags and coerced["rule_type"] != "obiter":
|
||||
coerced["rule_type"] = "obiter"
|
||||
# #81.4 — a binding-labeled rule that reads as a case-application is
|
||||
# re-typed application (it carries FLAG_APPLICATION either way).
|
||||
elif (halacha_quality.FLAG_APPLICATION in flags
|
||||
and coerced["rule_type"] == "binding"):
|
||||
coerced["rule_type"] = "application"
|
||||
cleaned.append(coerced)
|
||||
# #81.3 NLI entailment — one batched judge call per chunk (fail-open).
|
||||
if config.HALACHA_NLI_ENABLED and cleaned:
|
||||
|
||||
@@ -128,6 +128,91 @@ def is_thin_restatement(rule_statement: str, supporting_quote: str) -> bool:
|
||||
return overlap >= _THIN_OVERLAP and len_ratio <= _THIN_LEN_RATIO
|
||||
|
||||
|
||||
# ── Fact-dependent application: not a generalizable holding (#81.4) ──
|
||||
#
|
||||
# The strict rubric's cut_application (docs/halacha-strict-rubric.md §3, §27):
|
||||
# a determination that rests on the case's specific facts/parties/amounts is an
|
||||
# illustration, not a holding — it must not enter the corpus as a binding rule.
|
||||
# The extractor already classifies ``rule_type='application'``; this is a
|
||||
# HIGH-PRECISION secondary catch for rules the model mislabeled as binding,
|
||||
# using only the unambiguous "applied to THIS case" deixis (bare party words
|
||||
# like "המערער" appear in genuine rules too, so they are deliberately excluded).
|
||||
|
||||
_FACT_DEPENDENT_MARKERS = (
|
||||
"במקרה דנן",
|
||||
"במקרה שבפנינו",
|
||||
"במקרה שלפנינו",
|
||||
"במקרה שלפניי",
|
||||
"בענייננו",
|
||||
"בנדון דידן",
|
||||
"בנדון דנן",
|
||||
"במקרה שלנו",
|
||||
"בנסיבות המקרה שלפנינו",
|
||||
"בנסיבות תיק זה",
|
||||
"בתיק שלפנינו",
|
||||
"בערר שלפנינו",
|
||||
"בערר דנן",
|
||||
)
|
||||
|
||||
|
||||
def is_fact_dependent(rule_statement: str) -> bool:
    """Report whether the rule text contains a "this case" deixis marker.

    The statement is passed through ``normalize_text`` and the result is
    scanned for each phrase in ``_FACT_DEPENDENT_MARKERS``; a single
    substring hit is enough to treat the rule as an application to the
    case at hand rather than a holding.
    """
    haystack = normalize_text(rule_statement)
    for phrase in _FACT_DEPENDENT_MARKERS:
        if phrase in haystack:
            return True
    return False
|
||||
|
||||
|
||||
# ── Lexical near-duplicate signal (the 0.83–0.93 cosine tail) — #82.3 ──
|
||||
#
|
||||
# Embedding cosine alone misses paraphrases that float just below the dedup
|
||||
# threshold (0.93). A secondary lexical signal — Jaccard over word-shingles +
|
||||
# normalized Levenshtein on the rule_statement — catches "same rule, reworded"
|
||||
# in that band without lowering the global cosine threshold. Hybrid
|
||||
# lexical+semantic beats either alone (arXiv:1805.11611). Pure functions.
|
||||
|
||||
def _shingles(text: str, k: int = 2) -> set[str]:
    """Return the set of ``k``-word shingles of the normalized ``text``.

    Words are the non-empty pieces left after splitting the normalized text
    on every run of characters that are neither Hebrew letters (א-ת) nor
    ASCII digits. Each shingle is ``k`` consecutive words joined by a single
    space. When fewer than ``k`` words exist, the result is one shingle made
    of all the words, or an empty set when there are no words at all.
    """
    tokens = list(filter(None, re.split(r"[^א-ת0-9]+", normalize_text(text))))
    if len(tokens) >= k:
        last_start = len(tokens) - k
        return {
            " ".join(tokens[start : start + k])
            for start in range(last_start + 1)
        }
    if not tokens:
        return set()
    # Too short for a full window: the whole text is the only shingle.
    return {" ".join(tokens)}
|
||||
|
||||
|
||||
def jaccard_shingles(a: str, b: str, k: int = 2) -> float:
    """Return the Jaccard similarity of the ``k``-word shingle sets of ``a`` and ``b``.

    The value is ``|A ∩ B| / |A ∪ B|``. If either text yields no shingles
    the similarity is defined as 0.0 (this also avoids dividing by zero).
    """
    left = _shingles(a, k)
    right = _shingles(b, k)
    if not (left and right):
        return 0.0
    shared = left.intersection(right)
    combined = left.union(right)
    return len(shared) / len(combined)
|
||||
|
||||
|
||||
def normalized_levenshtein(a: str, b: str) -> float:
    """Return a character-level similarity in [0.0, 1.0] for ``a`` and ``b``.

    Both inputs are normalized with ``normalize_text`` first. The score is
    ``1 - edit_distance / max(len(a), len(b))`` on the normalized strings,
    so 1.0 means identical and 0.0 means fully different. Two empty strings
    score 1.0; exactly one empty string scores 0.0.
    """
    src = normalize_text(a)
    dst = normalize_text(b)
    if not src or not dst:
        # Both empty → identical; exactly one empty → nothing in common.
        return 1.0 if not src and not dst else 0.0

    # Row-by-row dynamic programme: only the previous row is kept.
    width = len(dst)
    above = list(range(width + 1))
    for row, ch_src in enumerate(src, start=1):
        current = [row] + [0] * width
        for col in range(1, width + 1):
            deletion = above[col] + 1
            insertion = current[col - 1] + 1
            substitution = above[col - 1] + (0 if ch_src == dst[col - 1] else 1)
            current[col] = min(deletion, insertion, substitution)
        above = current
    return 1.0 - above[-1] / max(len(src), len(dst))
|
||||
|
||||
|
||||
_LEX_JACCARD_MIN = 0.55
|
||||
_LEX_LEVENSHTEIN_MIN = 0.70
|
||||
|
||||
|
||||
def lexical_near_duplicate(
    a: str, b: str, jaccard_min: float = _LEX_JACCARD_MIN,
    levenshtein_min: float = _LEX_LEVENSHTEIN_MIN,
) -> bool:
    """Decide whether ``a`` and ``b`` overlap lexically enough to be one rule reworded.

    Either signal suffices: shingle Jaccard at or above ``jaccard_min``, or
    normalized Levenshtein similarity at or above ``levenshtein_min``. The
    Levenshtein score is only computed when the Jaccard test does not pass.
    """
    if jaccard_shingles(a, b) >= jaccard_min:
        return True
    return normalized_levenshtein(a, b) >= levenshtein_min
|
||||
|
||||
|
||||
# ── Aggregate ──
|
||||
|
||||
FLAG_NON_DECISION = "non_decision"
|
||||
@@ -135,6 +220,8 @@ FLAG_TRUNCATED_QUOTE = "truncated_quote"
|
||||
FLAG_THIN_RESTATEMENT = "thin_restatement"
|
||||
FLAG_QUOTE_UNVERIFIED = "quote_unverified"
|
||||
FLAG_NLI_UNSUPPORTED = "nli_unsupported" # rule not entailed by its quote (#81.3)
|
||||
FLAG_APPLICATION = "application" # fact-dependent, not a holding (#81.4)
|
||||
FLAG_NEAR_DUPLICATE = "near_duplicate" # cosine-tail lexical dup (#82.3)
|
||||
|
||||
|
||||
# ── NLI entailment check (rule_statement ⊨ supporting_quote) — #81.3 ──
|
||||
@@ -250,6 +337,7 @@ def compute_quality_flags(
|
||||
supporting_quote: str,
|
||||
reasoning_summary: str = "",
|
||||
quote_verified: bool = True,
|
||||
rule_type: str = "binding",
|
||||
) -> list[str]:
|
||||
"""Return the list of quality flags for one halacha (empty == clean).
|
||||
|
||||
@@ -264,4 +352,9 @@ def compute_quality_flags(
|
||||
flags.append(FLAG_THIN_RESTATEMENT)
|
||||
if not quote_verified:
|
||||
flags.append(FLAG_QUOTE_UNVERIFIED)
|
||||
# #81.4 — an application (fact-dependent) item is an illustration, not a
|
||||
# generalizable holding: never auto-approve it. Trust the model's
|
||||
# rule_type='application' and add a high-precision deixis catch.
|
||||
if rule_type == "application" or is_fact_dependent(rule_statement):
|
||||
flags.append(FLAG_APPLICATION)
|
||||
return flags
|
||||
|
||||
Reference in New Issue
Block a user