"""Tests for #81.8 — db.goldset_calibrate (auto-approve gate calibration).

Verifies the confidence-threshold sweep and the panel-policy
precision/coverage against synthetic gold-set rows. Fully OFFLINE —
monkeypatches db.goldset_list, no Postgres.
"""

from __future__ import annotations

import asyncio

import pytest

from legal_mcp.services import db


def _item(
    tag: str,
    keep: bool,
    conf: float,
    votes: tuple[bool, bool, bool],
) -> dict[str, object]:
    """Build one synthetic gold-set row.

    ``votes`` is unpacked, in order, into the ``ai_is_holding``,
    ``ds_is_holding`` and ``gm_is_holding`` fields of the returned dict.
    """
    c, d, g = votes
    return {
        "tagged_by": tag,
        "is_holding": keep,
        "confidence": conf,
        "ai_is_holding": c,
        "ds_is_holding": d,
        "gm_is_holding": g,
    }


# A,B,C,D are chair-labeled; E is panel-labeled (excluded under
# ground_truth='chair').
ITEMS = [
    _item("chair", True, 0.90, (True, True, True)),  # A unanimous keep
    _item("chair", True, 0.80, (True, True, False)),  # B majority keep
    _item("chair", False, 0.60, (False, False, False)),  # C unanimous drop
    _item("chair", True, 0.75, (True, True, True)),  # D unanimous keep
    # E excluded
    _item("panel:opus+deepseek+gemini", False, 0.99, (False, False, False)),
]


@pytest.fixture()
def patched(monkeypatch: pytest.MonkeyPatch) -> None:
    """Replace ``db.goldset_list`` with an async stub that serves ``ITEMS``."""

    async def _fake(batch="default"):
        # Hand out fresh dicts, not just a fresh list: the rows live at module
        # level, so an in-place edit by the code under test would otherwise
        # leak into every later test.
        return [dict(row) for row in ITEMS]

    monkeypatch.setattr(db, "goldset_list", _fake)


def _run(coro):
    """Run ``coro`` to completion on a throwaway event loop and return its result.

    The loop is created here, never installed as the current loop, and closed
    in ``finally`` even when the coroutine raises.
    """
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()


def test_ground_truth_chair_excludes_panel_rows(patched):
    """With ground_truth='chair' only the four chair rows are counted."""
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))
    # One assert per metric so a failure names the count that was wrong.
    assert r["n"] == 4
    assert r["positives"] == 3
    assert r["negatives"] == 1


def test_confidence_sweep_precision_recall(patched):
    """Check approved/precision/recall at the 0.80 and 0.70 thresholds."""
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))
    # Key by rounded threshold so float noise in the grid cannot break lookup.
    sweep = {round(g["threshold"], 2): g for g in r["confidence_sweep"]}

    # T=0.80 approves A(0.90)+B(0.80) → both keep → P=1.0, recall 2/3
    assert sweep[0.80]["approved"] == 2
    assert sweep[0.80]["precision"] == 1.0
    assert sweep[0.80]["recall"] == pytest.approx(0.667, abs=0.01)

    # T=0.70 approves A,B,D (C's 0.60 excluded) → all keep → P=1.0, recall 1.0
    assert sweep[0.70]["approved"] == 3
    assert sweep[0.70]["precision"] == 1.0
    assert sweep[0.70]["recall"] == 1.0


def test_panel_policies(patched):
    """Check the 2-of-3 majority and 3-of-3 unanimous panel policies."""
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))

    # majority: A,B,D keep / C drop → approves 3, all keep, full coverage
    maj = r["panel_majority_2of3"]
    assert maj["approved"] == 3
    assert maj["precision"] == 1.0
    assert maj["coverage"] == 1.0

    # unanimous: A,D approved (B is T,T,F → not unanimous), all decided → P=1.0
    un = r["panel_unanimous_3of3"]
    assert un["approved"] == 2
    assert un["precision"] == 1.0
    assert un["coverage"] == 1.0


def test_current_threshold_surfaced(patched):
    """The report echoes the configured auto-approve threshold."""
    r = _run(db.goldset_calibrate("default"))
    assert r["current_threshold"] == db.config.HALACHA_AUTO_APPROVE_THRESHOLD