feat(halacha): NLI entailment validator via claude_session (#81.3) + task #86
#81.3 — a post-extraction validator that flags halachot whose rule_statement is NOT entailed by its supporting_quote (the model over-reaching beyond its source).

- Engine: claude_session-as-judge (local CLI, zero API cost) per chaim's standing preference — one batched judge call per chunk, NOT a hosted NLI model.
- Pure, unit-tested helpers in halacha_quality: NLI_SYSTEM, build_nli_prompt, parse_nli_verdicts (fails OPEN — any shape/label ambiguity → 'entailed').
- halacha_extractor._nli_check wraps the call; fails OPEN on any error (e.g. no CLI in the container) so a flaky judge never blocks a genuine halacha.
- Non-entailed (neutral/contradiction) → quality_flag 'nli_unsupported' which blocks auto-approve (routes to pending_review) via the existing store gate.
- config: HALACHA_NLI_ENABLED/MODEL/EFFORT (effort 'low' — entailment is simple).

Verified: suite 166 passed (10 new); LIVE smoke test against the real claude CLI returned ['entailed','neutral'] for a supported vs unsupported rule.

Also commits TaskMaster #86 (Nevo preamble/ratio: anti-contamination strip fix + gold-set benchmark) capturing today's strip_nevo_preamble findings.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -91,3 +91,58 @@ def test_flags_accumulate():
|
||||
|
||||
def test_normalize_text_quote_variants():
    """Two quote-mark spellings of the same citation normalize identically."""
    with_ascii_quote = hq.normalize_text('עע"מ 317/10')
    with_alternate_quote = hq.normalize_text("עע״מ 317/10")
    assert with_ascii_quote == with_alternate_quote
|
||||
|
||||
|
||||
# ── #81.3 NLI entailment — pure prompt + parser ──
|
||||
|
||||
def test_build_nli_prompt_contains_pairs():
    """The built prompt includes text from the pairs and a "זוג N" marker per pair."""
    first_pair = {"rule_statement": "כלל אלף", "supporting_quote": "ציטוט אלף"}
    second_pair = {"rule_statement": "כלל בית", "supporting_quote": "ציטוט בית"}

    prompt = hq.build_nli_prompt([first_pair, second_pair])

    # Checked in the same order as the original paired assertions.
    for fragment in ("כלל אלף", "ציטוט בית", "זוג 1", "זוג 2"):
        assert fragment in prompt
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("raw", "n", "expected"),
    [
        # well-formed list of labels passes through unchanged
        (["entailed", "neutral"], 2, ["entailed", "neutral"]),
        # case-insensitive
        (["ENTAILED", "Contradiction"], 2, ["entailed", "contradiction"]),
        # dict shape
        ([{"verdict": "neutral"}, {"verdict": "entailed"}], 2, ["neutral", "entailed"]),
        # length mismatch -> fail-open
        (["entailed"], 2, ["entailed", "entailed"]),
        # non-list -> fail-open
        (None, 2, ["entailed", "entailed"]),
        # unknown label -> entailed
        (["bananas", "neutral"], 2, ["entailed", "neutral"]),
    ],
)
def test_parse_nli_verdicts(raw, n, expected):
    """parse_nli_verdicts(raw, n) yields the expected verdict list for each case."""
    verdicts = hq.parse_nli_verdicts(raw, n)
    assert verdicts == expected
|
||||
|
||||
|
||||
# ── _nli_check (async, via claude_session) — fail-open + verdict mapping ──
|
||||
|
||||
def test_nli_check_fail_open(monkeypatch):
    """When the judge call raises, _nli_check still returns 'entailed' for the pair."""
    import asyncio

    from legal_mcp.services import halacha_extractor as he

    async def raising_query(*args, **kwargs):
        # Stand-in for claude_session.query_json that always fails.
        raise RuntimeError("no claude CLI here")

    monkeypatch.setattr(he.claude_session, "query_json", raising_query)

    pairs = [{"rule_statement": "a", "supporting_quote": "b"}]
    verdicts = asyncio.run(he._nli_check(pairs))

    assert verdicts == ["entailed"]  # never blocks
|
||||
|
||||
|
||||
def test_nli_check_maps_verdicts(monkeypatch):
    """Verdicts returned by the patched judge call come back from _nli_check in order."""
    import asyncio

    from legal_mcp.services import halacha_extractor as he

    canned_verdicts = ["entailed", "neutral"]

    async def canned_query(*args, **kwargs):
        # Stand-in for claude_session.query_json; a fresh list on every call.
        return list(canned_verdicts)

    monkeypatch.setattr(he.claude_session, "query_json", canned_query)

    pairs = [
        {"rule_statement": "a", "supporting_quote": "b"},
        {"rule_statement": "c", "supporting_quote": "d"},
    ]
    verdicts = asyncio.run(he._nli_check(pairs))

    assert verdicts == ["entailed", "neutral"]
|
||||
|
||||
|
||||
def test_nli_check_empty():
    """An empty item list produces an empty verdict list."""
    import asyncio

    from legal_mcp.services import halacha_extractor as he

    verdicts = asyncio.run(he._nli_check([]))

    assert verdicts == []
|
||||
|
||||
Reference in New Issue
Block a user