# legal-ai/scripts/halacha_panel_calibrate.py

#!/usr/bin/env python3
"""Calibrate the approval-panel voting policy on the gold-set (Trust-or-Escalate).

The literature (Trust or Escalate, ICLR 2025; PoLL; selective prediction) says:
don't guess the aggregation policy — calibrate it to a target risk α on a
calibration set, and ESCALATE disagreement to the human. We have a calibration
set: the gold-set's ``is_holding`` is the COARSE "is this a real, keepable rule?"
label — the axis we already proved is reliable across models (92%).

This runs the panel's KEEP question (3 independent judges) on every gold-set item
that has an is_holding label, then reports, FOR EACH POLICY, the auto-decision
precision (vs is_holding) and coverage (how many it decides vs escalates):

  - unanimous : auto-decide only on 3/3 agreement, else escalate
  - majority  : auto-decide on 2/3, else escalate

Pick the policy whose auto-error stays under your tolerance while covering the
most items. Read-only. Local-only (claude_session needs the CLI).

FU-5 (#133) adds two things on top of the original live calibration:
  - ``--source captured``: ZERO-COST measurement that reads the stored panel
    rounds (FU-1) joined with the chair's rulings (FU-2) and reports, PER ROUND,
    the split-rate + auto-precision the panel actually delivered against the
    GROWING chair gold-set. This is how we see the active-learning loop working:
    as the rubric improves (FU-4 → chair adopts v2), precision should hold while
    the split-rate falls.
  - anon-stability: the share of gold-set items whose verdict survives masking
    the case names (#81.7 probe) — an echo-chamber health metric. A falling rate
    means the panel is recognizing cases, not reasoning.

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/halacha_panel_calibrate.py                 # live calibration
    .venv/bin/python ../scripts/halacha_panel_calibrate.py --source captured  # per-round trend (free)
"""
from __future__ import annotations

import argparse
import asyncio

import httpx

from legal_mcp.services import db
# reuse the exact panel judges + KEEP question (single source of truth)
from halacha_panel_approve import (  # noqa: E402
    KEEP_SYSTEM, _bool, _keep_user, judge_claude, judge_deepseek, judge_gemini,
)
# reuse FU-4's stats core so "what failed" is computed in ONE place (no drift)
from halacha_rubric_distill import _JUDGES, analyze_pairs  # noqa: E402


def summarize_calibration(pairs: list[dict]) -> dict:
    """Pure: health metrics of the captured panel, scored against the chair (#133/FU-5).

    Only pairs carrying a chair ruling (``chair_keep`` is not None) are scored.
    The counting itself is delegated to FU-4's ``analyze_pairs`` so the failure
    buckets are defined in one place; this function only derives rates from
    the counts it returns.

    Returned keys:
      n              — labeled pairs analysed (``n_pairs``)
      escalated      — pairs the panel sent to the chair (``n_splits_resolved``)
      auto_decided   — ``n`` minus ``escalated``
      split_rate     — escalated / n, rounded to 3 places; None when n is 0
      auto_precision — share of auto-decisions that were neither a false-keep
                       nor a false-drop, rounded to 3 places; None when
                       nothing was auto-decided
      false_keep / false_drop — the two error counts, passed through
      judge_disagree — per-judge ``disagree_rate`` for every name in ``_JUDGES``

    As the rubric improves, precision is expected to hold while split_rate
    falls.
    """
    with_ruling = [pair for pair in pairs if pair.get("chair_keep") is not None]
    stats = analyze_pairs(with_ruling)

    total = stats["n_pairs"]
    n_escalated = stats["n_splits_resolved"]
    n_auto = total - n_escalated
    n_errors = stats["n_false_keep"] + stats["n_false_drop"]

    # Rates are undefined (None) rather than 0 when their denominator is empty.
    split_rate = None
    if total:
        split_rate = round(n_escalated / total, 3)
    auto_precision = None
    if n_auto:
        auto_precision = round((n_auto - n_errors) / n_auto, 3)

    summary = {
        "n": total,
        "auto_decided": n_auto,
        "escalated": n_escalated,
        "split_rate": split_rate,
        "auto_precision": auto_precision,
        "false_keep": stats["n_false_keep"],
        "false_drop": stats["n_false_drop"],
    }
    per_judge = {}
    for judge in _JUDGES:
        per_judge[judge] = stats["judge_stats"][judge]["disagree_rate"]
    summary["judge_disagree"] = per_judge
    return summary


def bucket_by_round(pairs: list[dict]) -> list[tuple[str, dict]]:
    """Pure: group pairs by the calendar day of their panel round and summarize each day.

    The day key is the first 10 characters of ``round_ts`` (the captured
    round's ISO timestamp); pairs with a missing or empty ``round_ts`` are
    grouped under ``"unknown"``.

    Returns ``(day, summarize_calibration(day_pairs))`` tuples sorted by the
    day key — the per-round trend on the growing chair gold-set.
    """
    by_day: dict[str, list[dict]] = {}
    for pair in pairs:
        stamp = pair.get("round_ts") or ""
        day_key = stamp[:10] or "unknown"
        if day_key not in by_day:
            by_day[day_key] = []
        by_day[day_key].append(pair)

    trend = []
    for day_key in sorted(by_day):
        trend.append((day_key, summarize_calibration(by_day[day_key])))
    return trend


async def _anon_stability(batch: str) -> dict:
    """Echo-chamber health metric (#133/FU-5), read from stored gold-set columns.

    The anon probe (#81.7) re-judges gold-set items with the case names masked;
    ``anon_stable`` is True when the masked verdict still matches the
    consensus. A falling stable-rate means the panel is recognizing cases
    rather than reasoning. The column is populated by goldset_panel_label for
    the 'default' batch.

    Items whose ``anon_stable`` is None count as not probed. Returns
    ``probed`` and ``stable`` counts plus ``stable_rate`` (stable / probed,
    rounded to 3 places; None when nothing was probed) so callers can report
    or skip.
    """
    n_probed = 0
    n_stable = 0
    for item in await db.goldset_list(batch):
        if item.get("anon_stable") is None:
            continue
        n_probed += 1
        if item["anon_stable"]:
            n_stable += 1

    rate = round(n_stable / n_probed, 3) if n_probed else None
    return {"probed": n_probed, "stable": n_stable, "stable_rate": rate}


async def _votes(client, h) -> list[bool]:
    """Put the KEEP question for item ``h`` to the three judges concurrently.

    ``client`` is handed to the deepseek and gemini judges; the claude judge
    takes only the prompts. Each reply is reduced with ``_bool(reply, "keep")``
    and a None result is treated as "no vote", so the returned list may hold
    fewer than three entries. Order is claude, deepseek, gemini.
    """
    prompt = _keep_user(h)
    replies = await asyncio.gather(
        judge_claude(KEEP_SYSTEM, prompt),
        judge_deepseek(client, KEEP_SYSTEM, prompt),
        judge_gemini(client, KEEP_SYSTEM, prompt),
    )
    ballots = []
    for reply in replies:
        vote = _bool(reply, "keep")
        if vote is not None:
            ballots.append(vote)
    return ballots


def _decide(votes: list[bool], policy: str) -> bool | None:
    """Auto-decision (True=keep / False=drop) or None=escalate."""
    if len(votes) < 2:
        return None
    yes, no = sum(votes), len(votes) - sum(votes)
    if policy == "unanimous":
        if len(votes) == 3 and yes == 3:
            return True
        if len(votes) == 3 and no == 3:
            return False
        return None
    # majority
    if yes > no:
        return True
    if no > yes:
        return False
    return None  # tie


def _fmt(x) -> str:
    return "—" if x is None else (f"{x:.1%}" if isinstance(x, float) else str(x))


async def _run_captured(args: argparse.Namespace) -> int:
    """FU-5 — report what the panel actually delivered, per round, from
    CAPTURED rounds (FU-1) ⋈ chair rulings (FU-2).

    Zero-cost: nothing is re-voted and no LLM is called. Reads up to
    ``args.limit`` pairs (5000 when the limit is 0), prints one table row per
    round-day plus an OVERALL row, then the per-judge disagreement rates and
    the anon-stability of the 'default' batch. Returns 0, including when
    there is nothing to measure yet.
    """
    def table_row(label: str, stats: dict) -> str:
        # One table line; the per-day rows and the OVERALL row share this layout.
        return (f"{label:<12}{stats['n']:>7}{stats['auto_decided']:>6}"
                f"{_fmt(stats['split_rate']):>9}{_fmt(stats['auto_precision']):>11}"
                f"{stats['false_keep']:>7}{stats['false_drop']:>7}")

    pairs = await db.panel_rounds_vs_chair(limit=args.limit or 5000)
    overall = summarize_calibration(pairs)
    print(f"captured calibration on {overall['n']} (panel ⋈ chair) pairs\n", flush=True)
    if not overall["n"]:
        print("no chair-resolved pairs yet — seeds accrue as the chair reviews "
              "panel-judged halachot (FU-2). Nothing to measure.", flush=True)
        return 0

    heavy_rule = "=" * 70
    light_rule = "-" * 70
    print(heavy_rule)
    print(f"{'round-day':<12}{'pairs':>7}{'auto':>6}{'split%':>9}{'precision':>11}"
          f"{'fKEEP':>7}{'fDROP':>7}")
    print(light_rule)
    for day, day_stats in bucket_by_round(pairs):
        print(table_row(day, day_stats))
    print(light_rule)
    print(table_row("OVERALL", overall))

    print("\nper-judge disagreement with the chair (lower = better aligned):")
    judge_rates = overall["judge_disagree"]
    for judge in _JUDGES:
        print(f"  {judge:<10} {_fmt(judge_rates[judge])}")

    anon = await _anon_stability("default")
    if anon["probed"]:
        note = "falling = memorization risk"
    else:
        note = "not populated — run goldset_panel_label"
    print(f"\nanon-stability (echo-chamber health, batch 'default'): "
          f"{_fmt(anon['stable_rate'])} over {anon['probed']} probed "
          f"({note})")
    return 0


def _score_policy(rows: list[dict], policy: str) -> dict:
    """Pure: tally one voting policy's auto-decisions against the gold-set truth.

    Each row is ``{"truth": bool, "votes": list[bool]}``. Rows the policy
    escalates (``_decide`` returns None) are not counted.

    Returns counts: ``auto`` (rows auto-decided), ``correct`` (decision equals
    truth), ``false_keep`` (auto-KEEP on a not-holding item) and ``false_drop``
    (auto-DROP on a holding item).
    """
    tally = {"auto": 0, "correct": 0, "false_keep": 0, "false_drop": 0}
    for row in rows:
        decision = _decide(row["votes"], policy)
        if decision is None:
            continue
        tally["auto"] += 1
        if decision == row["truth"]:
            tally["correct"] += 1
        elif decision:
            tally["false_keep"] += 1
        else:
            tally["false_drop"] += 1
    return tally


async def main(args: argparse.Namespace) -> int:
    """Run the calibration selected by ``args.source`` and return the exit code.

    ``captured`` delegates to ``_run_captured``. Otherwise (live) every item of
    gold-set batch ``args.batch`` that has an ``is_holding`` label (capped at
    ``args.limit`` when non-zero) is put to the three judges, and for each
    policy (unanimous / majority) the auto-decision counts, precision, coverage
    and split rate are printed, followed by the false-KEEP / false-DROP
    breakdown and the batch's anon-stability. Always returns 0.
    """
    if args.source == "captured":
        return await _run_captured(args)

    items = [it for it in await db.goldset_list(args.batch) if it.get("is_holding") is not None]
    if args.limit:
        items = items[: args.limit]
    print(f"calibrating panel KEEP vs is_holding on {len(items)} gold-set items\n", flush=True)

    # A concurrency of 0 (or below) would make range() raise on a zero step;
    # fall back to one item at a time instead of crashing.
    batch_size = max(1, args.concurrency)
    rows = []
    async with httpx.AsyncClient() as client:
        async def one(it):
            v = await _votes(client, it)
            rows.append({"truth": bool(it["is_holding"]), "votes": v})

        # Items are judged in batches of ``batch_size``, which is what bounds
        # the number of in-flight judge calls. Coroutines are created per
        # batch so a failing batch leaves no never-awaited coroutines behind.
        for start in range(0, len(items), batch_size):
            await asyncio.gather(*(one(it) for it in items[start : start + batch_size]))
            print(f"  …{len(rows)}/{len(items)}", flush=True)

    policies = ("unanimous", "majority")
    scores = {policy: _score_policy(rows, policy) for policy in policies}
    total = len(rows)

    header = (f"{'policy':<11}{'auto':>6}{'escalate':>10}{'correct':>9}{'wrong':>7}"
              f"{'precision':>11}{'coverage':>10}{'split':>10}")
    # Both rules follow the header width so the table stays aligned.
    print("\n" + "=" * len(header))
    print(header)
    print("-" * len(header))
    for policy in policies:
        score = scores[policy]
        auto = score["auto"]
        wrong = score["false_keep"] + score["false_drop"]
        esc = total - auto
        # A rate with an empty denominator is undefined: show "—", not 0.0%.
        prec = score["correct"] / auto if auto else None
        cov = auto / total if total else None
        split = esc / total if total else None  # FU-5: escalation = split rate
        print(f"{policy:<11}{auto:>6}{esc:>10}{score['correct']:>9}{wrong:>7}"
              f"{_fmt(prec):>11}{_fmt(cov):>10}{_fmt(split):>10}")

    # where do the WRONG auto-decisions fall? (false-keep is the costly one)
    print("\n=== costly errors: panel auto-KEEPS but human says NOT-holding (per policy) ===")
    for policy in policies:
        fk = scores[policy]["false_keep"]
        fd = scores[policy]["false_drop"]
        print(f"  {policy:<11} false-KEEP (bad rule approved): {fk}   false-DROP (good rule rejected): {fd}")

    # FU-5: echo-chamber health — does masking case names flip the verdict?
    anon = await _anon_stability(args.batch)
    print(f"\nanon-stability (echo-chamber health): {_fmt(anon['stable_rate'])} "
          f"over {anon['probed']} probed"
          + ("" if anon["probed"] else " — not populated; run goldset_panel_label"))
    return 0


if __name__ == "__main__":
    # CLI wiring: parse the flags, run the async entry point, and exit with
    # the code it returns.
    parser = argparse.ArgumentParser(
        description="Calibrate / measure the halacha panel (Trust-or-Escalate; FU-5).",
    )
    parser.add_argument(
        "--source",
        choices=("live", "captured"),
        default="live",
        help=(
            "live: re-vote the gold-set now (needs CLI+keys). "
            "captured: zero-cost — measure stored rounds (FU-1) vs chair rulings (FU-2), per round."
        ),
    )
    parser.add_argument(
        "--batch",
        default="default",
        help="gold-set batch for live mode + anon-stability",
    )
    parser.add_argument("--limit", type=int, default=0)
    parser.add_argument("--concurrency", type=int, default=6)

    cli_args = parser.parse_args()
    exit_code = asyncio.run(main(cli_args))
    raise SystemExit(exit_code)