#!/usr/bin/env python3
"""Calibrate the approval-panel voting policy on the gold-set (Trust-or-Escalate).

The literature (Trust or Escalate, ICLR 2025; PoLL; selective prediction) says:
don't guess the aggregation policy — calibrate it to a target risk α on a
calibration set, and ESCALATE disagreement to the human.

We have a calibration set: the gold-set's ``is_holding`` is the COARSE "is this
a real, keepable rule?" label — the axis we already proved is reliable across
models (92%). This runs the panel's KEEP question (3 independent judges) on
every gold-set item that has an is_holding label, then reports, FOR EACH POLICY,
the auto-decision precision (vs is_holding) and coverage (how many it decides
vs escalates):

  - unanimous : auto-decide only on 3/3 agreement, else escalate
  - majority  : auto-decide on 2/3, else escalate

Pick the policy whose auto-error stays under your tolerance while covering the
most items. Read-only. Local-only (claude_session needs the CLI).

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/halacha_panel_calibrate.py
"""
from __future__ import annotations

import argparse
import asyncio

import httpx

from legal_mcp.services import db

# reuse the exact panel judges + KEEP question (single source of truth)
from halacha_panel_approve import (  # noqa: E402
    KEEP_SYSTEM,
    _bool,
    _keep_user,
    judge_claude,
    judge_deepseek,
    judge_gemini,
)

# Policies compared in the report, in print order.
POLICIES = ("unanimous", "majority")


async def _votes(client, h) -> list[bool]:
    """Ask the three panel judges the KEEP question about one gold-set item.

    The judges are awaited concurrently. Each reply is converted with
    ``_bool(reply, "keep")``; replies for which that yields ``None`` are
    dropped, so the returned list can hold fewer than three votes.

    Args:
        client: HTTP client handed to the DeepSeek and Gemini judges.
        h: the gold-set item, passed to ``_keep_user`` to build the prompt.

    Returns:
        The usable votes (True = keep), in claude / deepseek / gemini order.
    """
    user = _keep_user(h)
    c, ds, gm = await asyncio.gather(
        judge_claude(KEEP_SYSTEM, user),
        judge_deepseek(client, KEEP_SYSTEM, user),
        judge_gemini(client, KEEP_SYSTEM, user),
    )
    return [v for v in (_bool(c, "keep"), _bool(ds, "keep"), _bool(gm, "keep")) if v is not None]


def _decide(votes: list[bool], policy: str) -> bool | None:
    """Auto-decision (True=keep / False=drop) or None=escalate.

    Fewer than two usable votes always escalates. Under ``"unanimous"`` a
    decision needs exactly three votes that all agree. Any other policy value
    is treated as majority: the side with strictly more votes wins (so 2/2
    agreement also decides) and a tie escalates.
    """
    if len(votes) < 2:
        return None
    yes = sum(votes)
    no = len(votes) - yes
    if policy == "unanimous":
        if len(votes) == 3 and yes == 3:
            return True
        if len(votes) == 3 and no == 3:
            return False
        return None
    # majority
    if yes > no:
        return True
    if no > yes:
        return False
    return None  # tie


def _tally(rows: list[dict], policy: str) -> dict[str, int]:
    """Count auto-decisions and their outcomes for one policy in a single pass.

    Each row is a dict with ``"truth"`` (bool) and ``"votes"`` (list of bool).
    Returns the counts ``auto``, ``correct``, ``false_keep`` (auto-KEEP on a
    non-holding item) and ``false_drop`` (auto-DROP on a holding item).
    """
    counts = {"auto": 0, "correct": 0, "false_keep": 0, "false_drop": 0}
    for r in rows:
        d = _decide(r["votes"], policy)
        if d is None:
            continue  # escalated: not an auto-decision
        counts["auto"] += 1
        if d == r["truth"]:
            counts["correct"] += 1
        elif d:
            counts["false_keep"] += 1
        else:
            counts["false_drop"] += 1
    return counts


async def main(args: argparse.Namespace) -> int:
    """Run the panel over the labelled gold-set items and print the report.

    Args:
        args: parsed CLI options; reads ``batch``, ``limit`` and
            ``concurrency``.

    Returns:
        Process exit code (always 0).
    """
    items = [it for it in await db.goldset_list(args.batch) if it.get("is_holding") is not None]
    if args.limit:
        items = items[: args.limit]
    print(f"calibrating panel KEEP vs is_holding on {len(items)} gold-set items\n", flush=True)

    # range() rejects a step of 0 and a negative step would skip every item,
    # so never let the batch size drop below 1.
    batch_size = max(1, args.concurrency)
    rows: list[dict] = []
    async with httpx.AsyncClient() as client:

        async def one(it) -> dict:
            votes = await _votes(client, it)
            return {"truth": bool(it["is_holding"]), "votes": votes}

        # Coroutines are created per batch, so a failing batch does not leave
        # later ones created-but-never-awaited. Batches run one after another,
        # which already caps in-flight items at batch_size.
        for start in range(0, len(items), batch_size):
            batch = items[start : start + batch_size]
            rows.extend(await asyncio.gather(*(one(it) for it in batch)))
            print(f" …{len(rows)}/{len(items)}", flush=True)

    stats = {policy: _tally(rows, policy) for policy in POLICIES}

    print("\n" + "=" * 64)
    print(f"{'policy':<11}{'auto':>6}{'escalate':>10}{'correct':>9}{'wrong':>7}{'precision':>11}{'coverage':>10}")
    print("-" * 64)
    for policy in POLICIES:
        auto = stats[policy]["auto"]
        correct = stats[policy]["correct"]
        wrong = auto - correct
        esc = len(rows) - auto
        prec = correct / auto if auto else 0.0
        cov = auto / len(rows) if rows else 0.0
        # precision width is 11 to line up with the header column
        print(f"{policy:<11}{auto:>6}{esc:>10}{correct:>9}{wrong:>7}{prec:>11.1%}{cov:>10.1%}")

    # where do the WRONG auto-decisions fall? (false-keep is the costly one)
    print("\n=== costly errors: panel auto-KEEPS but human says NOT-holding (per policy) ===")
    for policy in POLICIES:
        fk = stats[policy]["false_keep"]
        fd = stats[policy]["false_drop"]
        print(f" {policy:<11} false-KEEP (bad rule approved): {fk} false-DROP (good rule rejected): {fd}")
    return 0


if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--batch", default="default", help="gold-set batch passed to db.goldset_list")
    ap.add_argument("--limit", type=int, default=0, help="only use the first N labelled items (0 = all)")
    ap.add_argument("--concurrency", type=int, default=6, help="items judged per batch (minimum 1)")
    raise SystemExit(asyncio.run(main(ap.parse_args())))