legal-ai/mcp-server/tests/test_goldset_calibrate.py

"""Tests for #81.8 — db.goldset_calibrate (auto-approve gate calibration).

Verifies the confidence-threshold sweep and the panel-policy precision/coverage
against synthetic gold-set rows. Fully OFFLINE — monkeypatches db.goldset_list,
no Postgres.
"""

from __future__ import annotations

import asyncio

import pytest

from legal_mcp.services import db


def _item(tag, keep, conf, votes):
    c, d, g = votes
    return {
        "tagged_by": tag, "is_holding": keep, "confidence": conf,
        "ai_is_holding": c, "ds_is_holding": d, "gm_is_holding": g,
    }


# Synthetic gold set. Rows A-D carry a chair label; row E carries a panel label
# and is therefore excluded under ground_truth='chair'.
ITEMS = [
    _item(tag, keep, conf, votes)
    for tag, keep, conf, votes in (
        ("chair", True, 0.90, (True, True, True)),      # A unanimous keep
        ("chair", True, 0.80, (True, True, False)),     # B majority keep
        ("chair", False, 0.60, (False, False, False)),  # C unanimous drop
        ("chair", True, 0.75, (True, True, True)),      # D unanimous keep
        ("panel:opus+deepseek+gemini", False, 0.99, (False, False, False)),  # E excluded
    )
]


@pytest.fixture()
def patched(monkeypatch: pytest.MonkeyPatch):
    """Swap ``db.goldset_list`` for an async stub that serves ``ITEMS``.

    The fixture yields nothing; tests request it only for the patching side
    effect, which ``monkeypatch`` undoes at teardown.
    """
    async def _stub_goldset_list(batch="default"):
        # Hand out a fresh shallow copy on every call so the caller cannot
        # reorder or resize the shared module-level list.
        return ITEMS[:]

    monkeypatch.setattr(db, "goldset_list", _stub_goldset_list)


def _run(coro):
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()


def test_ground_truth_chair_excludes_panel_rows(patched):
    """With ground_truth='chair' only the four chair-labeled rows are counted."""
    report = _run(db.goldset_calibrate("default", ground_truth="chair"))
    # Row E (panel-labeled) must be left out: 4 rows remain, 3 keeps + 1 drop.
    assert report["n"] == 4
    assert report["positives"] == 3
    assert report["negatives"] == 1


def test_confidence_sweep_precision_recall(patched):
    """Check approved count, precision and recall at two sweep thresholds.

    Runs the calibration with ground_truth='chair', so only rows A-D take
    part; three of those rows are keeps and one is a drop.
    """
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))
    # Key each grid point by its threshold rounded to 2 decimals so the literal
    # lookups below do not depend on the exact float stored in the grid.
    sweep = {round(g["threshold"], 2): g for g in r["confidence_sweep"]}

    # T=0.80 approves A(0.90)+B(0.80) → both keep → P=1.0, recall 2/3
    at_080 = sweep[0.80]
    assert at_080["approved"] == 2
    assert at_080["precision"] == 1.0
    assert at_080["recall"] == pytest.approx(0.667, abs=0.01)

    # T=0.70 approves A,B,D (C's 0.60 excluded) → all keep → P=1.0, recall 1.0
    at_070 = sweep[0.70]
    assert at_070["approved"] == 3
    # All three approved rows are keeps, so precision must be perfect here too.
    assert at_070["precision"] == 1.0
    assert at_070["recall"] == 1.0


def test_panel_policies(patched):
    """Check the 2-of-3 majority and 3-of-3 unanimous panel policies on rows A-D."""
    report = _run(db.goldset_calibrate("default", ground_truth="chair"))

    # majority: A,B,D keep / C drop → approves 3, all keep, full coverage
    majority = report["panel_majority_2of3"]
    assert majority["approved"] == 3
    assert majority["precision"] == 1.0
    assert majority["coverage"] == 1.0

    # unanimous: A,D approved (B is T,T,F → not unanimous), all decided → P=1.0
    unanimous = report["panel_unanimous_3of3"]
    assert unanimous["approved"] == 2
    assert unanimous["precision"] == 1.0
    assert unanimous["coverage"] == 1.0


def test_current_threshold_surfaced(patched):
    """The report echoes the configured auto-approve threshold unchanged."""
    report = _run(db.goldset_calibrate("default"))
    surfaced = report["current_threshold"]
    assert surfaced == db.config.HALACHA_AUTO_APPROVE_THRESHOLD