Phase C — scripts/cull_principles.py: re-adjudicates every existing 'original' principle with the SAME panel regime (panel_keep_score → classify → apply_cap), reversible (CSV backup + rejected canonical recoverable), usage-throttled. panel_extraction.panel_keep_score + apply_cap (shared, G2). Dry-run on 3 decisions: 37→15 survive. Phase D — services/principles.py: source-derived label הלכה (binding court) / כלל פרשני (committee) / עיקרון (persuasive); umbrella עקרונות משפטיים. Wired into canonical_halacha_get/list (principle_class+principle_label). UI string changes deferred to the Claude Design gate. spec INV-LRN7; SCRIPTS.md; 7 new tests; 428 green. Phase E needs no new code — synthesis already targets pending_synthesis, which the cull leaves only on survivors (rejected canonicals → 'rejected'). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
136 lines
5.5 KiB
Python
136 lines
5.5 KiB
Python
"""Unit tests for the tri-model panel extraction core (#152, Phase A).
|
|
|
|
Pure logic only — classify (the chair's approval rule), _coerce_list (judge-reply
normalisation), cluster_candidates (cross-model matching/voting) with injected
embeddings, and apply_cap (over-cap downgrade by score). No LLM, no Voyage, no DB.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from legal_mcp import config
|
|
from legal_mcp.services import panel_extraction as pe
|
|
|
|
|
|
# ── classify — chaim's rule ────────────────────────────────────────
|
|
|
|
def test_classify_three_votes_approves_regardless_of_score():
    """Three votes yield "approved" for both a very low and a very high score."""
    for score in (0.10, 0.99):
        assert pe.classify(3, score) == "approved"
|
|
|
|
|
|
def test_classify_two_votes_gated_by_floor():
    """Two votes are approved at or above the configured floor, else pending review."""
    threshold = config.HALACHA_PANEL_SCORE_FLOOR
    expectations = (
        (threshold, "approved"),                # exactly on the floor still passes
        (threshold + 0.05, "approved"),
        (threshold - 0.01, "pending_review"),   # just under the floor
    )
    for score, expected in expectations:
        assert pe.classify(2, score) == expected
|
|
|
|
|
|
def test_classify_one_or_zero_votes_rejected():
    """Fewer than two votes are rejected even with a near-perfect score."""
    for votes in (1, 0):
        assert pe.classify(votes, 0.99) == "rejected"
|
|
|
|
|
|
# ── _coerce_list — judge reply normalisation ───────────────────────
|
|
|
|
def test_coerce_list_accepts_bare_list():
    """A bare list is accepted; an item without rule_type comes back as "interpretive"."""
    payload = [{"rule_statement": "כלל", "supporting_quote": "ציטוט", "score": 0.9}]
    coerced = pe._coerce_list(payload)
    assert len(coerced) == 1
    assert coerced[0]["rule_type"] == "interpretive"
|
|
|
|
|
|
def test_coerce_list_unwraps_dict_wrapper_and_drops_incomplete():
    """A {"principles": [...]} wrapper is unwrapped and incomplete items are dropped.

    Only the entry carrying both a rule statement and a supporting quote is
    expected to survive, with its out-of-range score clamped to 1.0.
    """
    complete = {
        "rule_statement": "כלל",
        "supporting_quote": "ציטוט",
        "rule_type": "holding",
        "score": 1.5,
    }
    missing_rule = {"rule_statement": "", "supporting_quote": "ציטוט"}   # no rule → drop
    missing_quote = {"rule_statement": "כלל2", "supporting_quote": ""}   # no quote → drop

    coerced = pe._coerce_list({"principles": [complete, missing_rule, missing_quote]})

    assert len(coerced) == 1
    survivor = coerced[0]
    assert survivor["rule_type"] == "holding"
    assert survivor["score"] == 1.0  # clamped to [0,1]
|
|
|
|
|
|
def test_coerce_list_bad_rule_type_falls_back():
    """An unrecognised rule_type ("obiter") is replaced by "interpretive"."""
    candidate = {
        "rule_statement": "כלל",
        "supporting_quote": "צ",
        "rule_type": "obiter",
        "score": 0.5,
    }
    coerced = pe._coerce_list([candidate])
    assert coerced[0]["rule_type"] == "interpretive"
|
|
|
|
|
|
def test_coerce_list_junk_returns_empty():
    """Inputs that are neither a list nor a dict wrapper produce an empty list."""
    for junk in ("nonsense", None):
        assert pe._coerce_list(junk) == []
|
|
|
|
|
|
# ── cluster_candidates — cross-model matching & voting ─────────────
|
|
|
|
def _c(rule, score):
|
|
return {"rule_statement": rule, "supporting_quote": "q", "reasoning_summary": "",
|
|
"rule_type": "interpretive", "score": score}
|
|
|
|
|
|
def test_cluster_merges_across_models_counts_votes_and_means_score():
    """Identical embeddings from three models collapse into one 3-vote cluster."""
    # same principle proposed by all three (identical embedding) → 1 cluster, 3 votes
    from_claude, from_deepseek, from_gemini = (_c("X", s) for s in (0.9, 0.8, 0.7))
    per_model = {
        "claude": [from_claude],
        "deepseek": [from_deepseek],
        "gemini": [from_gemini],
    }
    # Embeddings are looked up by object identity; each candidate gets its own vector list.
    embs = {
        id(candidate): [1.0, 0.0]
        for candidate in (from_claude, from_deepseek, from_gemini)
    }

    clusters = pe.cluster_candidates(per_model, embs)

    assert len(clusters) == 1
    merged = clusters[0]
    assert merged["votes"] == 3
    assert merged["score"] == pytest.approx((0.9 + 0.8 + 0.7) / 3, abs=1e-3)
    assert merged["verdict"] == "approved"
    assert merged["voters"] == ["claude", "deepseek", "gemini"]
|
|
|
|
|
|
def test_cluster_separates_distinct_principles():
    """Orthogonal embeddings stay in separate single-vote, rejected clusters."""
    first, second = _c("X", 0.9), _c("Y", 0.9)
    per_model = {"claude": [first, second]}
    embs = {
        id(first): [1.0, 0.0],
        id(second): [0.0, 1.0],  # orthogonal → 2 clusters
    }

    clusters = pe.cluster_candidates(per_model, embs)

    assert len(clusters) == 2
    for cluster in clusters:
        assert cluster["votes"] == 1
        assert cluster["verdict"] == "rejected"
|
|
|
|
|
|
def test_cluster_same_model_twice_counts_one_vote_keeps_best_score():
    """Near-duplicates from one model count as a single vote, using its best score."""
    # one model proposes two near-dupes; another proposes the same → 2 votes, not 3
    claude_weak = _c("X", 0.6)
    claude_strong = _c("X", 0.95)
    deepseek_only = _c("X", 0.88)
    per_model = {
        "claude": [claude_weak, claude_strong],
        "deepseek": [deepseek_only],
    }
    embs = {
        id(candidate): [1.0, 0.0]
        for candidate in (claude_weak, claude_strong, deepseek_only)
    }

    clusters = pe.cluster_candidates(per_model, embs)

    assert len(clusters) == 1
    merged = clusters[0]
    assert merged["votes"] == 2  # claude counts once
    # claude's best (0.95) and deepseek (0.88) → mean
    assert merged["score"] == pytest.approx((0.95 + 0.88) / 2, abs=1e-3)
    assert merged["rule_statement"] == "X"
|
|
|
|
|
|
def test_apply_cap_downgrades_over_cap_survivors_by_score():
    """With max_new=2, the lowest-scoring non-rejected entry is expected to become rejected."""
    verdict_score_pairs = (
        ("approved", 0.9),
        ("approved", 0.7),
        ("pending_review", 0.8),
        ("rejected", 0.95),  # already rejected stays
    )
    judged = [
        {"verdict": verdict, "score": score}
        for verdict, score in verdict_score_pairs
    ]

    capped = pe.apply_cap(judged, max_new=2)
    final_verdicts = [entry["final_verdict"] for entry in capped]

    # top-2 survivors by score = 0.9(approved) + 0.8(pending); 0.7 → over cap → rejected
    assert final_verdicts == ["approved", "rejected", "pending_review", "rejected"]
|
|
|
|
|
|
def test_apply_cap_keeps_all_when_under_cap():
    """When the cap exceeds the number of entries, every verdict is carried over unchanged."""
    judged = [
        {"verdict": "approved", "score": 0.9},
        {"verdict": "pending_review", "score": 0.5},
    ]
    capped = pe.apply_cap(judged, max_new=5)
    final_verdicts = [entry["final_verdict"] for entry in capped]
    assert final_verdicts == ["approved", "pending_review"]
|
|
|
|
|
|
def test_cluster_sorted_strongest_first():
    """The two-vote cluster is expected ahead of the one-vote cluster in the output."""
    lone = _c("X", 0.9)                                # 1 vote
    y_from_claude, y_from_deepseek = _c("Y", 0.9), _c("Y", 0.9)  # 2 votes
    per_model = {
        "claude": [lone, y_from_claude],
        "deepseek": [y_from_deepseek],
    }
    embs = {
        id(lone): [1.0, 0.0],
        id(y_from_claude): [0.0, 1.0],
        id(y_from_deepseek): [0.0, 1.0],
    }

    clusters = pe.cluster_candidates(per_model, embs)

    strongest, weakest = clusters[0], clusters[1]
    assert strongest["rule_statement"] == "Y"
    assert strongest["votes"] == 2
    assert weakest["rule_statement"] == "X"
    assert weakest["votes"] == 1
|