Halacha-extraction quality (#81) and dedup-on-insert (#82) — engine changes (pure + tested) plus measurement/ops tooling.

halacha_quality.py
- #81.4 application gate: is_fact_dependent() (high-precision "applied to THIS case" deixis per the strict rubric §3/§27) + FLAG_APPLICATION. compute_quality_flags now takes rule_type and flags rule_type=='application' OR fact-dependent — blocking auto-approve (an illustration is not a generalizable holding).
- #82.3 lexical tail signal: jaccard_shingles / normalized_levenshtein / lexical_near_duplicate + FLAG_NEAR_DUPLICATE, for the 0.83–0.93 cosine band.

halacha_extractor.py — pass rule_type to the flag computation; re-type a binding-labeled fact-application to 'application' (mirrors non_decision→obiter).

db.py (store_halachot_for_chunk) — dedup now fetches the nearest same-precedent neighbor once: cosine ≥ DEDUP → skip (unchanged); cosine in [BAND, DEDUP) with high lexical overlap → FLAG_NEAR_DUPLICATE (review, not skip — never drop a possibly-distinct principle unreviewed).

config.py — HALACHA_DEDUP_BAND_COSINE (0.83).

Scripts:
- scripts/halacha_goldset.py (#81.7) — export stratified sample for human tagging; score validators (P/R/F1) against the tags. Backbone for #81.8.
- scripts/halacha_batch_reconcile.py (#82.7) — conservative cross-precedent dedup (cosine ≥0.95), dry-run report only.
- scripts/calibrate_halacha_dedup.py (#82.1) — calibrate the lexical thresholds against the 2026-06-03 cleanup gold-set.

Deferred (documented): #82.4 merge-provenance and #82.5 DB ON CONFLICT/UNIQUE on normalized quote are NOT included — the current skip+flag behavior is safe, whereas a UNIQUE on normalized_quote would fail on existing dups and a blind merge risks losing provenance; they need their own chair-reviewed migration. #82.6 over-merge guard is moot until merge lands. #81.6 full rhetorical-role classifier deferred (section pre-filter + application flag cover the practical case); #81.8 blocked on the human-tagged gold-set (harness now provided).

Verified:
- pytest tests/test_halacha_quality.py — 52 passed (14 new).
- calibrate: configured (0.55,0.70) → precision 1.0 (zero false-merge), recall 0.30 — correct profile for an auto-approve-blocking signal.
- goldset export: 15-row sample CSV.
- batch reconcile: 819 halachot → 5 cross-precedent candidate pairs.

Invariants: G1 (normalize at source — flag at insert, not at read); §6 (no silent swallow — suspect items flagged to review, never dropped); G2 (no parallel path — same store_halachot_for_chunk / compute_quality_flags).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
256 lines
10 KiB
Python
256 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from legal_mcp.services import halacha_quality as hq
|
|
|
|
|
|
# ── non-decision / obiter ──
|
|
|
|
@pytest.mark.parametrize(
    "text",
    [
        "איני רואה לקבוע מסמרות בשאלה זו",
        "אין צורך להכריע בטענה זו",
        "למעלה מן הצורך נעיר כי",
        "הערה זו ניתנת אגב אורחא",
    ],
)
def test_detect_non_decision_hits(text):
    """Every listed phrase must yield a non-None result from ``detect_non_decision``."""
    detected = hq.detect_non_decision(text)
    assert detected is not None
|
|
|
|
|
|
@pytest.mark.parametrize(
    "text",
    [
        "בית המשפט קבע כי ההיתר בטל",
        "ועדת הערר מוסמכת לדון בטענת סטייה מתכנית",
        "",
    ],
)
def test_detect_non_decision_misses(text):
    """Marker-free inputs, including the empty string, must yield ``None``."""
    detected = hq.detect_non_decision(text)
    assert detected is None
|
|
|
|
|
|
def test_non_decision_scans_all_fields():
    """A marker present only in the last positional field is still detected."""
    # The marker sits in the quote (third argument), not in the abstracted
    # rule (first argument); the second argument is left empty.
    rule_text = "כלל כללי"
    second_field = ""
    quote_text = "וכאן אין צורך להכריע"
    detected = hq.detect_non_decision(rule_text, second_field, quote_text)
    assert detected is not None
|
|
|
|
|
|
# ── truncated quote ──
|
|
|
|
def test_truncated_dangling_letter():
    """A quote that ends on a lone trailing letter is reported as truncated."""
    dangling_quote = "ראוי כי תהיה השפעה על ה"
    verdict = hq.is_quote_truncated(dangling_quote)
    assert verdict is True
|
|
|
|
|
|
def test_truncated_empty():
    """A whitespace-only quote is reported as truncated."""
    blank_quote = " "
    verdict = hq.is_quote_truncated(blank_quote)
    assert verdict is True
|
|
|
|
|
|
@pytest.mark.parametrize(
    "quote",
    [
        "ועדת הערר היא הגוף המקצועי האמון על בחינת ההיבטים התכנוניים.",
        "אין לועדה סמכות לסטות מתקנות התכנון והבניה",  # no period, but full word
        "ההיתר תואם את התכנית החלה על האיזור",
    ],
)
def test_not_truncated_complete_clauses(quote):
    """Complete clauses must not be reported as truncated."""
    verdict = hq.is_quote_truncated(quote)
    assert verdict is False
|
|
|
|
|
|
# ── thin restatement ──
|
|
|
|
def test_thin_restatement_near_copy():
    """A rule that is a verbatim copy of its quote counts as a thin restatement."""
    source_quote = "ביטול היתר מחייב טעמים כבדי משקל של אינטרס ציבורי"
    copied_rule = "ביטול היתר מחייב טעמים כבדי משקל של אינטרס ציבורי"
    verdict = hq.is_thin_restatement(copied_rule, source_quote)
    assert verdict is True
|
|
|
|
|
|
def test_not_thin_when_abstracted():
    """A rule that differs substantially from its quote is not a thin restatement."""
    source_quote = "אין חולק כי אין לועדה סמכות לסטות מתקנות"
    abstracted_rule = (
        "ועדה מקומית לתכנון ובניה אינה מוסמכת לסטות מהוראות תקנות התכנון "
        "והבניה, ובכלל זה מהוראות התוספת השנייה, ואין בידה ליתן היתר הסוטה מהן."
    )
    verdict = hq.is_thin_restatement(abstracted_rule, source_quote)
    assert verdict is False
|
|
|
|
|
|
def test_thin_handles_empty():
    """An empty rule statement is never classified as a thin restatement."""
    verdict = hq.is_thin_restatement("", "something")
    assert verdict is False
|
|
|
|
|
|
# ── aggregate flags + auto-approve gate semantics ──
|
|
|
|
def test_clean_halacha_no_flags():
    """A verified rule/quote pair with no quality issues yields an empty flag list."""
    clean_rule = (
        "ועדת הערר מוסמכת לדון בערר על החלטה ליתן היתר בנייה גם כאשר נטען "
        "כי ההיתר סוטה מתכנית, בהתאם למגמת תיקון 43 לחוק."
    )
    clean_quote = (
        "פרשנות מרחיבה המאפשרת הגשת ערר גם במקרה של מתן היתר כאשר נטען כי "
        "ההיתר סוטה מתכנית הולמת את מגמת המחוקק בתיקון 43."
    )
    flags = hq.compute_quality_flags(clean_rule, clean_quote, "", quote_verified=True)
    assert flags == []
|
|
|
|
|
|
def test_flags_accumulate():
    """Several independent quality problems are all reported together."""
    # The same text is passed as both rule and quote, with verification off.
    flawed_text = "כלל אגב אורחא על ה"
    flags = hq.compute_quality_flags(flawed_text, flawed_text, quote_verified=False)
    for expected_flag in (
        hq.FLAG_NON_DECISION,
        hq.FLAG_TRUNCATED_QUOTE,
        hq.FLAG_QUOTE_UNVERIFIED,
    ):
        assert expected_flag in flags
|
|
|
|
|
|
def test_normalize_text_quote_variants():
    """ASCII double quote and Hebrew gershayim normalize to the same text."""
    with_ascii_quote = hq.normalize_text('עע"מ 317/10')
    with_gershayim = hq.normalize_text("עע״מ 317/10")
    assert with_ascii_quote == with_gershayim
|
|
|
|
|
|
# ── #81.3 NLI entailment — pure prompt + parser ──
|
|
|
|
def test_build_nli_prompt_contains_pairs():
    """The NLI prompt embeds the pair texts and a numbered label per pair."""
    pairs = [
        {"rule_statement": "כלל אלף", "supporting_quote": "ציטוט אלף"},
        {"rule_statement": "כלל בית", "supporting_quote": "ציטוט בית"},
    ]
    prompt = hq.build_nli_prompt(pairs)
    # Content of the pairs shows up in the prompt.
    for fragment in ("כלל אלף", "ציטוט בית"):
        assert fragment in prompt
    # Each pair is labelled with its 1-based ordinal.
    for label in ("זוג 1", "זוג 2"):
        assert label in prompt
|
|
|
|
|
|
@pytest.mark.parametrize(
    "raw,n,expected",
    [
        (["entailed", "neutral"], 2, ["entailed", "neutral"]),
        # case-insensitive
        (["ENTAILED", "Contradiction"], 2, ["entailed", "contradiction"]),
        # dict shape
        ([{"verdict": "neutral"}, {"verdict": "entailed"}], 2, ["neutral", "entailed"]),
        # length mismatch -> fail-open
        (["entailed"], 2, ["entailed", "entailed"]),
        # non-list -> fail-open
        (None, 2, ["entailed", "entailed"]),
        # unknown label -> entailed
        (["bananas", "neutral"], 2, ["entailed", "neutral"]),
    ],
)
def test_parse_nli_verdicts(raw, n, expected):
    """``parse_nli_verdicts`` maps each raw model output to the expected verdict list."""
    parsed = hq.parse_nli_verdicts(raw, n)
    assert parsed == expected
|
|
|
|
|
|
# ── _nli_check (async, via claude_session) — fail-open + verdict mapping ──
|
|
|
|
def test_nli_check_fail_open(monkeypatch):
    """When the underlying query raises, ``_nli_check`` still returns "entailed"."""
    import asyncio
    from legal_mcp.services import halacha_extractor as he

    async def failing_query(*args, **kwargs):
        raise RuntimeError("no claude CLI here")

    monkeypatch.setattr(he.claude_session, "query_json", failing_query)
    single_pair = [{"rule_statement": "a", "supporting_quote": "b"}]
    verdicts = asyncio.run(he._nli_check(single_pair))
    assert verdicts == ["entailed"]  # never blocks
|
|
|
|
|
|
def test_nli_check_maps_verdicts(monkeypatch):
    """Verdicts returned by the patched query come back in order, one per pair."""
    import asyncio
    from legal_mcp.services import halacha_extractor as he

    async def canned_query(*args, **kwargs):
        return ["entailed", "neutral"]

    monkeypatch.setattr(he.claude_session, "query_json", canned_query)
    two_pairs = [
        {"rule_statement": "a", "supporting_quote": "b"},
        {"rule_statement": "c", "supporting_quote": "d"},
    ]
    verdicts = asyncio.run(he._nli_check(two_pairs))
    assert verdicts == ["entailed", "neutral"]
|
|
|
|
|
|
def test_nli_check_empty():
    """An empty batch of pairs produces an empty verdict list."""
    import asyncio
    from legal_mcp.services import halacha_extractor as he

    verdicts = asyncio.run(he._nli_check([]))
    assert verdicts == []
|
|
|
|
|
|
# ── #81.5 consolidation — pure prompt + fold-group parser ──
|
|
|
|
def test_build_consolidation_prompt():
    """The consolidation prompt lists each rule under its index and includes the reasoning."""
    candidates = [
        {"halacha_index": 3, "rule_statement": "כלל גימל", "reasoning_summary": "כי"},
        {"halacha_index": 7, "rule_statement": "כלל זין", "reasoning_summary": ""},
    ]
    prompt = hq.build_consolidation_prompt(candidates)
    for fragment in ("[3] כלל גימל", "[7] כלל זין", "היגיון: כי"):
        assert fragment in prompt
|
|
|
|
|
|
@pytest.mark.parametrize(
    "raw,expected",
    [
        ([[2, 5, 9], [14, 18]], [[2, 5, 9], [14, 18]]),
        # singleton group dropped
        ([[2, 5], [7]], [[2, 5]]),
        # string ints coerced
        ([["2", "5"]], [[2, 5]]),
        # dedup within group
        ([[2, 2, 5]], [[2, 5]]),
        # nothing to fold
        ([], []),
        # non-list -> safe
        ("garbage", []),
        # None -> safe
        (None, []),
        # drop group that falls below 2 valid
        ([[1, "x"], [3, 4]], [[3, 4]]),
    ],
)
def test_parse_fold_groups(raw, expected):
    """``parse_fold_groups`` turns each raw payload into the expected fold groups."""
    groups = hq.parse_fold_groups(raw)
    assert groups == expected
|
|
|
|
|
|
def test_consolidation_priority_prefers_approved_then_confidence():
    """An approved record outranks a pending one even when the latter is more confident."""
    from legal_mcp.services import halacha_extractor as he

    approved = {
        "id": "a",
        "review_status": "approved",
        "confidence": 0.7,
        "quote_verified": True,
        "rule_statement": "x",
    }
    pending_hi = {
        "id": "b",
        "review_status": "pending_review",
        "confidence": 0.95,
        "quote_verified": True,
        "rule_statement": "x",
    }
    # approved sorts before higher-confidence pending → kept as canonical
    canonical = min([approved, pending_hi], key=he._consolidation_priority)
    assert canonical["id"] == "a"
|
|
|
|
|
|
# ── #81.4 fact-dependent / application ──
|
|
|
|
@pytest.mark.parametrize(
    "rule",
    [
        "במקרה דנן ועדת הערר קבעה כי ההיתר בטל",
        "בענייננו אין הצדקה לפיצוי",
        "בערר שלפנינו הוכח כי השומה שגויה",
    ],
)
def test_is_fact_dependent_hits(rule):
    """Each listed rule text must be classified as fact-dependent."""
    verdict = hq.is_fact_dependent(rule)
    assert verdict is True
|
|
|
|
|
|
@pytest.mark.parametrize(
    "rule",
    [
        "ועדת הערר מוסמכת לדון בהיטל השבחה",
        "נטל ההוכחה מוטל על המבקש",
        "פגיעה תכנונית מזכה בפיצוי לפי סעיף 197",
    ],
)
def test_is_fact_dependent_misses(rule):
    """Each listed rule text must not be classified as fact-dependent."""
    verdict = hq.is_fact_dependent(rule)
    assert verdict is False
|
|
|
|
|
|
def test_application_flag_from_rule_type():
    """Passing ``rule_type="application"`` raises the application flag."""
    rule_text = "נטל ההוכחה על המבקש"
    quote_text = "נטל ההוכחה על המבקש כאמור"
    flags = hq.compute_quality_flags(rule_text, quote_text, rule_type="application")
    assert hq.FLAG_APPLICATION in flags
|
|
|
|
|
|
def test_application_flag_from_deixis_even_if_binding():
    """The application flag is raised for this rule text even with ``rule_type="binding"``."""
    rule_text = "במקרה דנן נדחה הערר"
    quote_text = "כפי שקבענו במקרה דנן נדחה הערר"
    flags = hq.compute_quality_flags(rule_text, quote_text, rule_type="binding")
    assert hq.FLAG_APPLICATION in flags
|
|
|
|
|
|
def test_clean_binding_rule_has_no_flags():
    """This binding rule/quote pair must come back with an empty flag list."""
    rule_text = "ועדת הערר מוסמכת לדון בטענות חוקתיות הנוגעות לתכנית"
    quote_text = "הוועדה מוסמכת לדון אף בטענות מסוג זה, ככל שהן נוגעות לתכנית שבנדון."
    flags = hq.compute_quality_flags(rule_text, quote_text, rule_type="binding")
    assert flags == []
|
|
|
|
|
|
# ── #82.3 lexical near-duplicate signal ──
|
|
|
|
def test_jaccard_high_for_reworded_same_rule():
    """Two wordings that differ by one trailing word score at least 0.5."""
    original = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית"
    reworded = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית בלבד"
    similarity = hq.jaccard_shingles(original, reworded)
    assert similarity >= 0.5
|
|
|
|
|
|
def test_jaccard_low_for_distinct_rules():
    """Two different rule texts score below 0.2."""
    first_rule = "ועדת הערר מוסמכת לדון בהיטל השבחה"
    other_rule = "המועד להגשת ערר הוא שלושים יום"
    similarity = hq.jaccard_shingles(first_rule, other_rule)
    assert similarity < 0.2
|
|
|
|
|
|
def test_normalized_levenshtein_identical_and_disjoint():
    """Identical strings score 1.0; an empty string against a non-empty one scores 0.0."""
    sample = "אבג"
    identical_score = hq.normalized_levenshtein(sample, "אבג")
    assert identical_score == 1.0
    empty_score = hq.normalized_levenshtein("", sample)
    assert empty_score == 0.0
|
|
|
|
|
|
def test_lexical_near_duplicate_band():
    """A lightly extended wording is a near-duplicate; a different rule is not."""
    base_rule = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית"
    extended_rule = "נטל ההוכחה בהיטל השבחה מוטל על הוועדה המקומית, כך נפסק"
    is_dup = hq.lexical_near_duplicate(base_rule, extended_rule)
    assert is_dup is True

    unrelated_rule = "המועד להגשת ערר על שומה הוא שלושים ימים"
    is_dup = hq.lexical_near_duplicate(base_rule, unrelated_rule)
    assert is_dup is False
|