#!/usr/bin/env python3
"""Calibrate the approval-panel voting policy on the gold-set (Trust-or-Escalate).

The literature (Trust or Escalate, ICLR 2025; PoLL; selective prediction) says:
don't guess the aggregation policy — calibrate it to a target risk α on a
calibration set, and ESCALATE disagreement to the human.

We have a calibration set: the gold-set's ``is_holding`` is the COARSE "is this
a real, keepable rule?" label — the axis we already proved is reliable across
models (92%). This runs the panel's KEEP question (3 independent judges) on
every gold-set item that has an is_holding label, then reports, FOR EACH
POLICY, the auto-decision precision (vs is_holding) and coverage (how many it
decides vs escalates):

  - unanimous : auto-decide only on 3/3 agreement, else escalate
  - majority  : auto-decide on 2/3, else escalate

Pick the policy whose auto-error stays under your tolerance while covering the
most items. Read-only. Local-only (claude_session needs the CLI).

FU-5 (#133) adds two things on top of the original live calibration:

  - ``--source captured``: ZERO-COST measurement that reads the stored panel
    rounds (FU-1) joined with the chair's rulings (FU-2) and reports, PER
    ROUND, the split-rate + auto-precision the panel actually delivered against
    the GROWING chair gold-set. This is how we see the active-learning loop
    working: as the rubric improves (FU-4 → chair adopts v2), precision should
    hold while the split-rate falls.
  - anon-stability: the share of gold-set items whose verdict survives masking
    the case names (#81.7 probe) — an echo-chamber health metric. A falling
    rate means the panel is recognizing cases, not reasoning.

Usage::

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/halacha_panel_calibrate.py                    # live calibration
    .venv/bin/python ../scripts/halacha_panel_calibrate.py --source captured  # per-round trend (free)
"""
from __future__ import annotations

import argparse
import asyncio

import httpx

from legal_mcp.services import db

# reuse the exact panel judges + KEEP question (single source of truth)
from halacha_panel_approve import (  # noqa: E402
    KEEP_SYSTEM,
    _bool,
    _keep_user,
    judge_claude,
    judge_deepseek,
    judge_gemini,
)

# reuse FU-4's stats core so "what failed" is computed in ONE place (no drift)
from halacha_rubric_distill import _JUDGES, analyze_pairs  # noqa: E402

# Voting policies compared by the live calibration, in report order.
_POLICIES = ("unanimous", "majority")

# Row cap for captured mode when --limit is not given.
_CAPTURED_DEFAULT_LIMIT = 5000


def summarize_calibration(pairs: list[dict]) -> dict:
    """Pure: from captured (panel ⋈ chair) pairs, the calibration health metrics
    the panel ACTUALLY delivered against the chair's ground-truth (#133/FU-5).

    split_rate     — fraction the panel escalated to the chair
                     (split/incomplete); the cost of caution.
    auto_precision — of the auto-decisions, the fraction the chair later
                     agreed with.

    As the rubric improves (FU-4 → chair adopts), precision should hold while
    split_rate falls. Reuses FU-4's analyze_pairs so the failure buckets are
    defined once.

    Only pairs whose ``chair_keep`` is not None are counted. Both rates are
    None when their denominator is zero.
    """
    labeled = [p for p in pairs if p.get("chair_keep") is not None]
    a = analyze_pairs(labeled)
    n = a["n_pairs"]
    escalated = a["n_splits_resolved"]
    auto = n - escalated
    wrong = a["n_false_keep"] + a["n_false_drop"]
    return {
        "n": n,
        "auto_decided": auto,
        "escalated": escalated,
        "split_rate": round(escalated / n, 3) if n else None,
        "auto_precision": round((auto - wrong) / auto, 3) if auto else None,
        "false_keep": a["n_false_keep"],
        "false_drop": a["n_false_drop"],
        "judge_disagree": {j: a["judge_stats"][j]["disagree_rate"] for j in _JUDGES},
    }


def bucket_by_round(pairs: list[dict]) -> list[tuple[str, dict]]:
    """Pure: split the pairs by the panel round's CALENDAR DAY and summarize
    each bucket — the per-round trend on the GROWING chair gold-set.

    round_ts is the captured round's ISO timestamp; the day prefix groups a run
    together. Pairs with a missing/empty round_ts land in the "unknown" bucket.
    Buckets are returned sorted by day key.
    """
    buckets: dict[str, list[dict]] = {}
    for p in pairs:
        ts = p.get("round_ts")
        # str() keeps ISO strings untouched and also tolerates a datetime
        # object, whose str() form starts with the same YYYY-MM-DD prefix.
        day = str(ts)[:10] if ts else "unknown"
        buckets.setdefault(day, []).append(p)
    return [(day, summarize_calibration(buckets[day])) for day in sorted(buckets)]


async def _anon_stability(batch: str) -> dict:
    """Echo-chamber health metric (#133/FU-5).

    The anon probe (#81.7) re-judges gold-set items with case names masked;
    ``anon_stable`` is True when the masked verdict still matches the
    consensus. A FALLING stable-rate means the panel is recognizing cases
    rather than reasoning — the echo-chamber symptom the loop must watch.

    Reads stored columns (populated by goldset_panel_label for the 'default'
    batch); returns counts so callers can report or skip. ``stable_rate`` is
    None when no item in the batch has been probed.
    """
    items = await db.goldset_list(batch)
    probed = [it for it in items if it.get("anon_stable") is not None]
    stable = sum(1 for it in probed if it["anon_stable"])
    return {
        "probed": len(probed),
        "stable": stable,
        "stable_rate": round(stable / len(probed), 3) if probed else None,
    }


async def _votes(client: httpx.AsyncClient, h: dict) -> list[bool]:
    """Ask the three judges the KEEP question about item ``h`` concurrently.

    Returns only the votes that ``_bool`` could parse (None results are
    dropped), so the list may hold fewer than three entries.
    """
    user = _keep_user(h)
    c, ds, gm = await asyncio.gather(
        judge_claude(KEEP_SYSTEM, user),
        judge_deepseek(client, KEEP_SYSTEM, user),
        judge_gemini(client, KEEP_SYSTEM, user),
    )
    parsed = (_bool(c, "keep"), _bool(ds, "keep"), _bool(gm, "keep"))
    return [v for v in parsed if v is not None]


def _decide(votes: list[bool], policy: str) -> bool | None:
    """Auto-decision (True=keep / False=drop) or None=escalate.

    Fewer than two usable votes always escalates. ``"unanimous"`` decides only
    when all three judges voted and agree; any other policy value is treated as
    majority, which escalates on a tie.
    """
    if len(votes) < 2:
        return None
    yes = sum(votes)
    no = len(votes) - yes
    if policy == "unanimous":
        if len(votes) == 3 and yes == 3:
            return True
        if len(votes) == 3 and no == 3:
            return False
        return None
    # majority
    if yes > no:
        return True
    if no > yes:
        return False
    return None  # tie


def _fmt(x) -> str:
    """Render a table cell: "—" for None, a one-decimal percentage for floats,
    ``str(x)`` for anything else."""
    if x is None:
        return "—"
    return f"{x:.1%}" if isinstance(x, float) else str(x)


def _policy_stats(rows: list[dict], policy: str) -> dict:
    """Pure: tally one policy's auto-decisions over live-calibration rows.

    Each row carries ``truth`` (bool) and ``votes`` (list[bool]). The three
    rates are None, rendered as "—" by ``_fmt``, when their denominator is
    zero, so an empty run is not reported as 0.0%.
    """
    correct = false_keep = false_drop = 0
    for r in rows:
        d = _decide(r["votes"], policy)
        if d is None:
            continue
        if d == r["truth"]:
            correct += 1
        elif d:
            false_keep += 1  # auto-KEEP but the human label says NOT holding
        else:
            false_drop += 1  # auto-DROP but the human label says holding
    wrong = false_keep + false_drop
    auto = correct + wrong
    total = len(rows)
    escalated = total - auto
    return {
        "auto": auto,
        "escalated": escalated,
        "correct": correct,
        "wrong": wrong,
        "false_keep": false_keep,
        "false_drop": false_drop,
        "precision": correct / auto if auto else None,
        "coverage": auto / total if total else None,
        "split": escalated / total if total else None,  # FU-5: escalation = split rate
    }


async def _run_captured(args: argparse.Namespace) -> int:
    """FU-5 — measure the panel's REAL performance against the growing chair
    gold-set, from CAPTURED rounds (FU-1) ⋈ chair rulings (FU-2).

    Zero-cost: no re-voting, no LLM — it reports what the panel actually
    delivered, per round. Prints the per-day table, the overall row, the
    per-judge disagreement and the anon-stability line; returns exit code 0.
    """
    # A non-positive --limit means "not set": fall back to the default cap.
    limit = args.limit if args.limit and args.limit > 0 else _CAPTURED_DEFAULT_LIMIT
    pairs = await db.panel_rounds_vs_chair(limit=limit)
    overall = summarize_calibration(pairs)
    print(f"captured calibration on {overall['n']} (panel ⋈ chair) pairs\n", flush=True)
    if not overall["n"]:
        print("no chair-resolved pairs yet — seeds accrue as the chair reviews "
              "panel-judged halachot (FU-2). Nothing to measure.", flush=True)
        return 0

    print("=" * 70)
    print(f"{'round-day':<12}{'pairs':>7}{'auto':>6}{'split%':>9}{'precision':>11}"
          f"{'fKEEP':>7}{'fDROP':>7}")
    print("-" * 70)
    for day, s in bucket_by_round(pairs):
        print(f"{day:<12}{s['n']:>7}{s['auto_decided']:>6}{_fmt(s['split_rate']):>9}"
              f"{_fmt(s['auto_precision']):>11}{s['false_keep']:>7}{s['false_drop']:>7}")
    print("-" * 70)
    print(f"{'OVERALL':<12}{overall['n']:>7}{overall['auto_decided']:>6}"
          f"{_fmt(overall['split_rate']):>9}{_fmt(overall['auto_precision']):>11}"
          f"{overall['false_keep']:>7}{overall['false_drop']:>7}")

    print("\nper-judge disagreement with the chair (lower = better aligned):")
    for j in _JUDGES:
        print(f" {j:<10} {_fmt(overall['judge_disagree'][j])}")

    # Honour --batch (its help text covers anon-stability); the default value
    # keeps the previous hard-coded 'default' behaviour and output.
    batch = getattr(args, "batch", None) or "default"
    anon = await _anon_stability(batch)
    print(f"\nanon-stability (echo-chamber health, batch {batch!r}): "
          f"{_fmt(anon['stable_rate'])} over {anon['probed']} probed "
          f"({'falling = memorization risk' if anon['probed'] else 'not populated — run goldset_panel_label'})")
    return 0


async def main(args: argparse.Namespace) -> int:
    """Entry point: dispatch to captured mode, or run the live calibration.

    Live mode re-votes every gold-set item of ``args.batch`` that has an
    ``is_holding`` label, in chunks of ``args.concurrency``, then prints the
    per-policy table, the false-KEEP / false-DROP breakdown and the
    anon-stability line. Returns exit code 0.
    """
    if args.source == "captured":
        return await _run_captured(args)

    items = [it for it in await db.goldset_list(args.batch)
             if it.get("is_holding") is not None]
    if args.limit and args.limit > 0:  # non-positive = no limit
        items = items[: args.limit]
    print(f"calibrating panel KEEP vs is_holding on {len(items)} gold-set items\n", flush=True)

    # Clamp so --concurrency 0 (or negative) cannot break range() below.
    step = max(1, args.concurrency)
    rows: list[dict] = []
    async with httpx.AsyncClient() as client:

        async def one(it) -> None:
            v = await _votes(client, it)
            rows.append({"truth": bool(it["is_holding"]), "votes": v})

        # Coroutines are created per chunk, so a failure in one chunk leaves
        # no never-awaited coroutines behind; the chunk size is the
        # concurrency bound.
        for start in range(0, len(items), step):
            await asyncio.gather(*(one(it) for it in items[start : start + step]))
            print(f" …{len(rows)}/{len(items)}", flush=True)

    stats = {policy: _policy_stats(rows, policy) for policy in _POLICIES}

    # Column widths below sum to 74; the rules and the row cells match them.
    print("\n" + "=" * 74)
    print(f"{'policy':<11}{'auto':>6}{'escalate':>10}{'correct':>9}{'wrong':>7}"
          f"{'precision':>11}{'coverage':>10}{'split':>10}")
    print("-" * 74)
    for policy in _POLICIES:
        s = stats[policy]
        print(f"{policy:<11}{s['auto']:>6}{s['escalated']:>10}{s['correct']:>9}{s['wrong']:>7}"
              f"{_fmt(s['precision']):>11}{_fmt(s['coverage']):>10}{_fmt(s['split']):>10}")

    # where do the WRONG auto-decisions fall? (false-keep is the costly one)
    print("\n=== costly errors: panel auto-KEEPS but human says NOT-holding (per policy) ===")
    for policy in _POLICIES:
        fk = stats[policy]["false_keep"]
        fd = stats[policy]["false_drop"]
        print(f" {policy:<11} false-KEEP (bad rule approved): {fk} false-DROP (good rule rejected): {fd}")

    # FU-5: echo-chamber health — does masking case names flip the verdict?
    anon = await _anon_stability(args.batch)
    print(f"\nanon-stability (echo-chamber health): {_fmt(anon['stable_rate'])} "
          f"over {anon['probed']} probed"
          + ("" if anon["probed"] else " — not populated; run goldset_panel_label"))
    return 0


if __name__ == "__main__":
    ap = argparse.ArgumentParser(
        description="Calibrate / measure the halacha panel (Trust-or-Escalate; FU-5).")
    ap.add_argument("--source", choices=("live", "captured"), default="live",
                    help="live: re-vote the gold-set now (needs CLI+keys). "
                         "captured: zero-cost — measure stored rounds (FU-1) vs chair rulings (FU-2), per round.")
    ap.add_argument("--batch", default="default",
                    help="gold-set batch for live mode + anon-stability")
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--concurrency", type=int, default=6)
    raise SystemExit(asyncio.run(main(ap.parse_args())))