# legal-ai/mcp-server/tests/test_rubric_distill.py

"""Tests for #133 / FU-4 — rubric distillation from chair decisions.

Covers the PURE deterministic core (analyze_pairs): given (panel ⋈ chair) pairs
it must correctly classify the systematic-failure buckets — false-keep (panel
auto-kept, chair dropped), false-drop, chair-resolved splits — and the per-judge
disagreement-with-chair rate. Fully OFFLINE (no DB, no LLM). The LLM proposal
and the report rendering are exercised by the integration smoke run.

The invariant this locks down: the only label compared against is the chair's
human ruling — never the panel's own votes (echo-chamber guard, INV-LRN1).
"""

from __future__ import annotations

import sys
from pathlib import Path

# the script lives in ../scripts relative to mcp-server/
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
import halacha_rubric_distill as rd  # noqa: E402


def _pair(chair, verdict, action, c, ds, gm, rule="כלל כלשהו"):
    return {
        "chair_keep": chair, "verdict": verdict, "applied_action": action,
        "rule_statement": rule,
        "claude_vote": c, "claude_reason": "rc",
        "deepseek_vote": ds, "deepseek_reason": "rd",
        "gemini_vote": gm, "gemini_reason": "rg",
    }


def test_false_keep_detected():
    """A unanimous panel approval that the chair dropped is one false-keep.

    This is the costly error direction; it must not register as a false-drop.
    """
    approved_but_dropped = _pair(False, "unanimous_yes", "approved", True, True, True)
    result = rd.analyze_pairs([approved_but_dropped])
    assert result["n_false_keep"] == 1
    assert result["n_false_drop"] == 0


def test_nli_cleared_counts_as_keep():
    """An ``nli_cleared`` action the chair dropped is counted as a false-keep."""
    cleared_but_dropped = _pair(False, "unanimous_yes", "nli_cleared", True, True, True)
    result = rd.analyze_pairs([cleared_but_dropped])
    assert result["n_false_keep"] == 1


def test_false_drop_detected():
    """A unanimous panel rejection that the chair kept is one false-drop.

    It must not register as a false-keep.
    """
    rejected_but_kept = _pair(True, "unanimous_no", "rejected", False, False, False)
    result = rd.analyze_pairs([rejected_but_kept])
    assert result["n_false_drop"] == 1
    assert result["n_false_keep"] == 0


def test_split_resolved_counted_not_a_false_decision():
    """A split goes to the chair, so there is no auto-decision to get wrong.

    It therefore counts as neither a false-keep nor a false-drop, but it IS
    tallied as a chair-resolved split (a learning signal).
    """
    split_pair = _pair(True, "split", "chair", True, False, None)
    result = rd.analyze_pairs([split_pair])
    assert result["n_splits_resolved"] == 1
    assert result["n_false_keep"] == 0
    assert result["n_false_drop"] == 0


def test_judge_disagreement_rate_vs_chair():
    """Each judge's disagreement rate is computed against the chair's ruling,
    never against what the other judges voted."""
    result = rd.analyze_pairs([
        # chair kept: claude agrees, deepseek disagrees, gemini abstains
        _pair(True, "split", "chair", True, False, None),
        # chair kept: claude agrees, deepseek disagrees, gemini agrees
        _pair(True, "split", "chair", True, False, True),
    ])
    per_judge = result["judge_stats"]
    assert per_judge["claude"]["disagree_rate"] == 0.0
    assert per_judge["deepseek"]["disagree_rate"] == 1.0
    # gemini cast a single vote (agreeing) and abstained once, so its rate
    # is 0.0 over 1 vote
    gemini = per_judge["gemini"]
    assert gemini["voted"] == 1
    assert gemini["disagree_rate"] == 0.0


def test_none_chair_label_ignored():
    """A pair with no chair label still counts toward ``n_pairs`` but adds no
    per-judge vote signal."""
    unlabeled = _pair(None, "split", "chair", True, False, None)
    result = rd.analyze_pairs([unlabeled])
    # the pair is included in the total …
    assert result["n_pairs"] == 1
    # … yet claude's vote on it is not tallied
    assert result["judge_stats"]["claude"]["voted"] == 0


def test_evidence_capped():
    """Every false-keep is counted, but the evidence list stops at MAX_EVIDENCE.

    The fixture size is derived from ``rd.MAX_EVIDENCE`` (with the historical
    floor of 40) so the input always exceeds the cap. A hard-coded 40 would
    stop exercising the cap — and fail confusingly — if the constant were
    ever raised to 40 or more.
    """
    n_pairs = max(40, rd.MAX_EVIDENCE + 1)
    pairs = [
        _pair(False, "unanimous_yes", "approved", True, True, True)
        for _ in range(n_pairs)
    ]
    a = rd.analyze_pairs(pairs)
    # the count is not capped …
    assert a["n_false_keep"] == n_pairs
    # … only the retained evidence is
    assert len(a["evidence"]) == rd.MAX_EVIDENCE


def test_min_pairs_threshold_is_sane():
    """The MIN_PAIRS threshold must be at least one pair."""
    threshold = rd.MIN_PAIRS
    assert threshold >= 1