"""Tests for #133 / FU-4 — rubric distillation from chair decisions. Covers the PURE deterministic core (analyze_pairs): given (panel ⋈ chair) pairs it must correctly classify the systematic-failure buckets — false-keep (panel auto-kept, chair dropped), false-drop, chair-resolved splits — and the per-judge disagreement-with-chair rate. Fully OFFLINE (no DB, no LLM). The LLM proposal and the report rendering are exercised by the integration smoke run. The invariant this locks down: the only label compared against is the chair's human ruling — never the panel's own votes (echo-chamber guard, INV-LRN1). """ from __future__ import annotations import sys from pathlib import Path # the script lives in ../scripts relative to mcp-server/ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts")) import halacha_rubric_distill as rd # noqa: E402 def _pair(chair, verdict, action, c, ds, gm, rule="כלל כלשהו"): return { "chair_keep": chair, "verdict": verdict, "applied_action": action, "rule_statement": rule, "claude_vote": c, "claude_reason": "rc", "deepseek_vote": ds, "deepseek_reason": "rd", "gemini_vote": gm, "gemini_reason": "rg", } def test_false_keep_detected(): """Panel auto-approved but the chair dropped → a false-keep (the costly error).""" a = rd.analyze_pairs([_pair(False, "unanimous_yes", "approved", True, True, True)]) assert a["n_false_keep"] == 1 assert a["n_false_drop"] == 0 def test_nli_cleared_counts_as_keep(): a = rd.analyze_pairs([_pair(False, "unanimous_yes", "nli_cleared", True, True, True)]) assert a["n_false_keep"] == 1 def test_false_drop_detected(): a = rd.analyze_pairs([_pair(True, "unanimous_no", "rejected", False, False, False)]) assert a["n_false_drop"] == 1 assert a["n_false_keep"] == 0 def test_split_resolved_counted_not_a_false_decision(): """A split escalates to the chair — no auto-decision, so it is neither a false-keep nor a false-drop, but it IS a resolved split (learning signal).""" a = rd.analyze_pairs([_pair(True, "split", "chair", True, False, None)]) assert a["n_splits_resolved"] == 1 assert a["n_false_keep"] == 0 and a["n_false_drop"] == 0 def test_judge_disagreement_rate_vs_chair(): """Disagreement is measured against the chair, never against the other judges.""" pairs = [ _pair(True, "split", "chair", True, False, None), # claude agree, deepseek disagree _pair(True, "split", "chair", True, False, True), # claude agree, deepseek disagree, gemini agree ] a = rd.analyze_pairs(pairs) assert a["judge_stats"]["claude"]["disagree_rate"] == 0.0 assert a["judge_stats"]["deepseek"]["disagree_rate"] == 1.0 # gemini voted once (agree) and abstained once → rate 0.0 over 1 vote assert a["judge_stats"]["gemini"]["voted"] == 1 assert a["judge_stats"]["gemini"]["disagree_rate"] == 0.0 def test_none_chair_label_ignored(): a = rd.analyze_pairs([_pair(None, "split", "chair", True, False, None)]) assert a["n_pairs"] == 1 # counted in total … assert a["judge_stats"]["claude"]["voted"] == 0 # … but contributes no signal def test_evidence_capped(): pairs = [_pair(False, "unanimous_yes", "approved", True, True, True) for _ in range(40)] a = rd.analyze_pairs(pairs) assert a["n_false_keep"] == 40 assert len(a["evidence"]) == rd.MAX_EVIDENCE def test_min_pairs_threshold_is_sane(): assert rd.MIN_PAIRS >= 1