# legal-ai/mcp-server/tests/test_panel_calibrate_captured.py

"""Tests for #133 / FU-5 — captured-mode calibration of the halacha panel.

Covers the PURE helpers in scripts/halacha_panel_calibrate.py
(summarize_calibration, bucket_by_round): from captured (panel ⋈ chair) pairs
they must report the split-rate and auto-precision the panel ACTUALLY delivered
against the chair's ground-truth, and break it down per round-day so the loop's
trend is visible. Fully OFFLINE (no DB, no LLM, no re-voting).
"""

from __future__ import annotations

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
import halacha_panel_calibrate as cal  # noqa: E402


def _pair(chair, verdict, action, ts="2026-06-12T04:00:00Z"):
    """Build one captured (panel, chair) record for the tests in this module.

    Args:
        chair: Value stored under "chair_keep"; it is also copied into every
            judge's "*_vote" field so the three votes always match it.
        verdict: Value stored under "verdict".
        action: Value stored under "applied_action".
        ts: Value stored under "round_ts".

    Returns:
        A new dict with the fields above, a fixed "rule_statement" of "r",
        and an empty "*_reason" string for each judge.
    """
    record = {
        "chair_keep": chair,
        "verdict": verdict,
        "applied_action": action,
        "round_ts": ts,
        "rule_statement": "r",
    }
    # Judge votes do not matter for split-rate/precision in these tests, so
    # every judge simply mirrors the chair and gives no reason.
    for judge in ("claude", "deepseek", "gemini"):
        record[f"{judge}_vote"] = chair
        record[f"{judge}_reason"] = ""
    return record


def test_split_rate_and_precision():
    """Check the summary counters and rates on a small mixed batch.

    The batch has three auto-decided pairs (two matching the chair, one
    approved although the chair said drop) and one "split" pair that was
    handed to the chair.
    """
    agree_keep = _pair(True, "unanimous_yes", "approved")   # auto-correct
    agree_drop = _pair(False, "unanimous_no", "rejected")   # auto-correct
    wrong_keep = _pair(False, "unanimous_yes", "approved")  # auto WRONG (false-keep)
    escalated = _pair(True, "split", "chair")               # escalated

    summary = cal.summarize_calibration(
        [agree_keep, agree_drop, wrong_keep, escalated]
    )

    assert summary["n"] == 4
    assert summary["escalated"] == 1
    assert summary["auto_decided"] == 3
    assert summary["split_rate"] == 0.25
    # Of the 3 auto-decisions exactly 1 is wrong (a false-keep) → precision 2/3.
    assert summary["false_keep"] == 1
    assert summary["false_drop"] == 0
    assert round(summary["auto_precision"], 2) == 0.67


def test_empty_pairs_safe():
    """An empty input must give n == 0 and None for both rates."""
    summary = cal.summarize_calibration([])
    assert summary["n"] == 0
    # Checked in the same order as before: split_rate first, then precision.
    for rate_key in ("split_rate", "auto_precision"):
        assert summary[rate_key] is None


def test_unlabeled_pairs_filtered():
    """A pair whose chair label is None must not be counted in "n"."""
    unlabeled = _pair(None, "split", "chair")
    summary = cal.summarize_calibration([unlabeled])
    # chair=None contributes no calibration signal
    assert summary["n"] == 0


def test_bucket_by_round_trend():
    """Pairs are grouped per round-day, in day order, each with its own summary.

    Two pairs share 2026-06-10 (one of them a split) and one pair falls on
    2026-06-12, so the expected split rates are 0.5 and 0.0.
    """
    early_round = "2026-06-10T04:00:00Z"
    late_round = "2026-06-12T05:00:00Z"
    captured = [
        _pair(True, "unanimous_yes", "approved", ts=early_round),
        _pair(False, "split", "chair", ts=early_round),
        _pair(True, "unanimous_yes", "approved", ts=late_round),
    ]

    trend = cal.bucket_by_round(captured)

    # Bucket labels are the date part of the timestamp, sorted by day.
    assert [day for day, _ in trend] == ["2026-06-10", "2026-06-12"]

    early_stats = trend[0][1]
    assert early_stats["n"] == 2
    assert early_stats["split_rate"] == 0.5

    late_stats = trend[1][1]
    assert late_stats["n"] == 1
    assert late_stats["split_rate"] == 0.0


def test_missing_round_ts_bucketed_unknown():
    """A pair with no "round_ts" key must land in a bucket labelled "unknown"."""
    record = _pair(True, "unanimous_yes", "approved")
    record.pop("round_ts")  # simulate a captured row without a timestamp
    buckets = cal.bucket_by_round([record])
    first_label = buckets[0][0]
    assert first_label == "unknown"