Merge pull request 'feat(halacha): #81.8 — כיול שער-האישור-האוטומטי על ה-gold-set (משמרים 0.80, מתועד)' (#191) from worktree-halacha-autoapprove-calibration into main

2026-06-11 16:29:45 +00:00
parent 7e1a0c879a 4e06662208
commit 276bb4ae93
3 changed files with 161 additions and 6 deletions
--- a/mcp-server/src/legal_mcp/config.py
+++ b/mcp-server/src/legal_mcp/config.py
@@ -138,12 +138,26 @@ BM25_HYBRID_ENABLED = (
 )
 # Halacha extraction — auto-approve threshold. Halachot with extractor
-# confidence >= this value are inserted with review_status='approved'
+# confidence >= this value AND no quality_flags are inserted with
-# instead of 'pending_review' (so they immediately appear in
+# review_status='approved' (so they appear immediately in
-# search_precedent_library). Set to a value > 1.0 to disable auto-approval.
+# search_precedent_library). Set > 1.0 to disable auto-approval.
-# 0.80 baseline: 89% of historical extractions land here, manual spot-check
+#
-# of 10 random samples confirmed quality. Tunable via env if drift is
+# CALIBRATION (#81.8, 2026-06-11) against the 100-item human-labeled gold-set
-# observed (e.g. raise to 0.90 if false-positives appear).
+# (db.goldset_calibrate, ground_truth='chair'; 93 keep / 7 drop):
 #   conf>=0.80 -> precision 0.98, recall 0.53   <- current (errs safe)
 #   conf>=0.75 -> precision 0.96, recall 0.81
 #   conf>=0.70 -> precision 0.94, recall 0.94
 # 0.80 clears the >=0.90 precision target with margin, so we KEEP it — it errs
 # toward the chair (low recall = more items reviewed, never the reverse).
 # Two findings shape the policy:
 #  (a) self-confidence alone is well-calibrated for PRECISION; the rule-based
 #      validators do NOT discriminate keep/drop on the gold-set (P~0.1), so a
 #      "confidence x validators" combined score would only hurt — not adopted.
 #  (b) the real COVERAGE lever is the tri-model panel (halacha_panel_approve):
 #      unanimous-3/3 -> precision 0.988 at 95% coverage, dominating any single
 #      confidence threshold. Lowering this gate to ~0.75 is a governance
 #      tradeoff (more unreviewed auto-approvals, INV-G10) on thin evidence
 #      (7 negatives) -> deferred to chair/panel (TaskMaster #121), not changed here.
 # Read once at import time; a non-numeric env value raises ValueError here.
 HALACHA_AUTO_APPROVE_THRESHOLD = float(os.getenv("HALACHA_AUTO_APPROVE_THRESHOLD", "0.80"))
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -5025,6 +5025,68 @@ async def goldset_score(batch: str = "default") -> dict:
    }
 async def goldset_calibrate(
    batch: str = "default", ground_truth: str = "chair",
    thresholds: tuple[float, ...] = (0.70, 0.75, 0.80, 0.85, 0.90, 0.95),
 ) -> dict:
    """Score auto-approve policies against the gold-set labels (#81.8).

    Every policy is judged against each row's ``is_holding`` label:

      - confidence gate: for each threshold ``T`` the rows with
        ``confidence >= T`` count as approved. ``precision`` is the share of
        approved rows labeled as holdings; ``recall`` is the share of labeled
        holdings that got approved. This is the sweep used to calibrate
        ``HALACHA_AUTO_APPROVE_THRESHOLD``.
      - panel vote: approve on the stored ``ai_/ds_/gm_is_holding`` votes,
        either by strict majority (needs at least 2 non-null votes) or by
        unanimity (needs all 3). ``coverage`` is the share of labeled rows
        that had enough votes to be decided.

    ``ground_truth='chair'`` keeps only rows whose ``tagged_by`` is
    ``'chair'``; any other value keeps every row that has a label. The panel
    votes feed the panel-policy rows, so scoring them against labels the panel
    itself produced would be circular — the chair's labels are the independent
    reference.

    ``n`` / ``positives`` / ``negatives`` count only labeled rows that also
    have a confidence value, while panel coverage is taken over all labeled
    rows.

    Returns a structured dict (INV-LRN3). The function only awaits
    ``goldset_list`` and performs no writes of its own.
    """
    items = await goldset_list(batch)
    chair_only = ground_truth == "chair"
    labeled = [
        item for item in items
        if (not chair_only or item.get("tagged_by") == "chair")
        and item.get("is_holding") is not None
    ]
    scored = [item for item in labeled if item.get("confidence") is not None]
    true_keeps = sum(bool(item["is_holding"]) for item in scored)

    def _sweep_point(cutoff: float) -> dict:
        # Rows at or above the cutoff are the ones the gate would auto-approve.
        passed = [item for item in scored if float(item["confidence"]) >= cutoff]
        hits = sum(bool(item["is_holding"]) for item in passed)
        return {
            "threshold": cutoff,
            "approved": len(passed),
            "precision": round(hits / len(passed) if passed else 0.0, 3),
            "recall": round(hits / true_keeps if true_keeps else 0.0, 3),
        }

    vote_keys = ("ai_is_holding", "ds_is_holding", "gm_is_holding")

    def _panel(min_votes: int, approves) -> dict:
        # Rows with fewer than ``min_votes`` recorded votes stay undecided and
        # only lower coverage; they never count as approved or rejected.
        decided = approved = hits = 0
        for item in labeled:
            ballots = [item[key] for key in vote_keys if item.get(key) is not None]
            if len(ballots) < min_votes:
                continue
            verdict = approves(ballots)
            decided += 1
            if verdict:
                approved += 1
                hits += int(bool(item["is_holding"]))
        return {
            "approved": approved,
            "precision": round(hits / approved, 3) if approved else 0.0,
            "coverage": round(decided / len(labeled), 3) if labeled else 0.0,
        }

    def _strict_majority(ballots: list) -> bool:
        # More keep-votes than drop-votes; a tie does not approve.
        return sum(ballots) > len(ballots) - sum(ballots)

    return {
        "batch": batch, "ground_truth": ground_truth,
        "n": len(scored), "positives": true_keeps,
        "negatives": len(scored) - true_keeps,
        "confidence_sweep": [_sweep_point(cutoff) for cutoff in thresholds],
        "current_threshold": config.HALACHA_AUTO_APPROVE_THRESHOLD,
        "panel_majority_2of3": _panel(2, _strict_majority),
        "panel_unanimous_3of3": _panel(3, all),
    }
 async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]:
    """Return all corroboration rows for one halacha, ordered by match_score DESC."""
    pool = await get_pool()
--- a/mcp-server/tests/test_goldset_calibrate.py
+++ b/mcp-server/tests/test_goldset_calibrate.py
@@ -0,0 +1,79 @@
 """Tests for #81.8 — db.goldset_calibrate (auto-approve gate calibration).
 Verifies the confidence-threshold sweep and the panel-policy precision/coverage
 against synthetic gold-set rows. Fully OFFLINE — monkeypatches db.goldset_list,
 no Postgres.
 """
 from __future__ import annotations
 import asyncio
 import pytest
 from legal_mcp.services import db
 def _item(tag, keep, conf, votes):
    c, d, g = votes
    return {
        "tagged_by": tag, "is_holding": keep, "confidence": conf,
        "ai_is_holding": c, "ds_is_holding": d, "gm_is_holding": g,
    }
 # Synthetic gold-set. Rows A-D carry the chair's label; row E was labeled by the
 # panel, so ground_truth='chair' must leave it out of the scoring.
 ITEMS = [
    _item(tag, keep, conf, votes)
    for tag, keep, conf, votes in (
        ("chair", True, 0.90, (True, True, True)),      # A unanimous keep
        ("chair", True, 0.80, (True, True, False)),     # B majority keep
        ("chair", False, 0.60, (False, False, False)),  # C unanimous drop
        ("chair", True, 0.75, (True, True, True)),      # D unanimous keep
        ("panel:opus+deepseek+gemini", False, 0.99, (False, False, False)),  # E excluded
    )
 ]
@pytest.fixture()
def patched(monkeypatch: pytest.MonkeyPatch):
    """Replace ``db.goldset_list`` with an async stub that serves ITEMS."""

    async def _goldset_stub(batch="default"):
        # Shallow copy: each call gets its own list over the same row dicts.
        return ITEMS.copy()

    monkeypatch.setattr(db, "goldset_list", _goldset_stub)
 def _run(coro):
    loop = asyncio.new_event_loop()
    try:
        return loop.run_until_complete(coro)
    finally:
        loop.close()
 def test_ground_truth_chair_excludes_panel_rows(patched):
    """Only the four chair-labeled rows are counted: three keeps and one drop."""
    report = _run(db.goldset_calibrate("default", ground_truth="chair"))
    counts = (report["n"], report["positives"], report["negatives"])
    assert counts == (4, 3, 1)
 def test_confidence_sweep_precision_recall(patched):
    """Check approved count, precision and recall at the 0.80 and 0.70 gates."""
    report = _run(db.goldset_calibrate("default", ground_truth="chair"))
    by_threshold = {}
    for point in report["confidence_sweep"]:
        by_threshold[round(point["threshold"], 2)] = point

    # T=0.80 lets through A (0.90) and B (0.80); both are keeps, so precision
    # is 1.0 and recall is 2 of the 3 keeps.
    at_080 = by_threshold[0.80]
    assert at_080["approved"] == 2
    assert at_080["precision"] == 1.0
    assert at_080["recall"] == pytest.approx(0.667, abs=0.01)

    # T=0.70 lets through A, B and D (C sits at 0.60), i.e. every keep.
    at_070 = by_threshold[0.70]
    assert at_070["approved"] == 3
    assert at_070["recall"] == 1.0
 def test_panel_policies(patched):
    """Check approved count, precision and coverage for both panel policies."""
    report = _run(db.goldset_calibrate("default", ground_truth="chair"))

    # Majority: A, B and D are voted keep, C is voted drop. Three approvals,
    # all true keeps, and every row had enough votes to be decided.
    majority = report["panel_majority_2of3"]
    assert majority["approved"] == 3
    assert majority["precision"] == 1.0
    assert majority["coverage"] == 1.0

    # Unanimous: only A and D qualify (B is T,T,F); all rows carry three votes,
    # so coverage stays at 1.0.
    unanimous = report["panel_unanimous_3of3"]
    assert unanimous["approved"] == 2
    assert unanimous["precision"] == 1.0
    assert unanimous["coverage"] == 1.0
 def test_current_threshold_surfaced(patched):
    """The report echoes the configured auto-approve threshold."""
    report = _run(db.goldset_calibrate("default"))
    expected = db.config.HALACHA_AUTO_APPROVE_THRESHOLD
    assert report["current_threshold"] == expected