feat(halacha): #81.8 — calibrate auto-approve gate on the gold-set (keep 0.80, documented)

כיול סף-האישור-האוטומטי מול ה-100 תוויות-היו"ר (93 keep / 7 drop), אמת אנושית (לא הקונצנזוס — מונע מעגליות): conf≥0.80 → P=0.98 R=0.53 ← נוכחי (errs safe) conf≥0.75 → P=0.96 R=0.81 conf≥0.70 → P=0.94 R=0.94 panel unanimous-3/3 → P=0.988 cov=95% · majority-2/3 → P=0.948 cov=100% הכרעה: **לשמר 0.80** — עומד ביעד precision≥0.90 עם מרווח, וטועה לכיוון היו"ר (recall נמוך = יותר סקירה, לא פחות). שני ממצאים: (א) self-confidence מכויל היטב ל-precision; הוולידטורים ה-rule-based לא-מבחינים על ה-gold-set (P≈0.1) → "confidence × validators" רק יזיק, לא אומץ (תשובה ל-#81.8). (ב) מנוף-הכיסוי האמיתי = הפאנל התלת-מודלי (unanimous 0.988/95%), לא סף-confidence נמוך. הורדת השער ל-0.75 = tradeoff governance (יותר auto-approve לא-מסוקר, INV-G10) על ראיה דקה (7 שליליים) → נדחה ליו"ר/פאנל (#121), לא שונה כאן. - db.goldset_calibrate(): sweep-confidence + panel-policy precision/coverage מול הזהב, read-only, משוחזר (INV-LRN3). ground_truth='chair' default (אנטי-מעגליות). - config: הערת HALACHA_AUTO_APPROVE_THRESHOLD מעודכנת לממצא-הכיול (במקום spot-check-of-10). invariants: INV-G10 (לא הורדנו את השער הלא-מסוקר) · INV-LRN2/LRN3 (כיול מתועד במקור, מובנה). tests: 4 offline (sweep/policies/anti-circularity/threshold-surfaced). אומת חי: משחזר את המספרים. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 16:29:24 +00:00
parent 7e1a0c879a
commit 4e06662208
3 changed files with 161 additions and 6 deletions
--- a/mcp-server/src/legal_mcp/config.py
+++ b/mcp-server/src/legal_mcp/config.py
@@ -138,12 +138,26 @@ BM25_HYBRID_ENABLED = (
 )

 # Halacha extraction — auto-approve threshold. Halachot with extractor
-# confidence >= this value are inserted with review_status='approved'
-# instead of 'pending_review' (so they immediately appear in
-# search_precedent_library). Set to a value > 1.0 to disable auto-approval.
-# 0.80 baseline: 89% of historical extractions land here, manual spot-check
-# of 10 random samples confirmed quality. Tunable via env if drift is
-# observed (e.g. raise to 0.90 if false-positives appear).
+# confidence >= this value AND no quality_flags are inserted with
+# review_status='approved' (so they appear immediately in
+# search_precedent_library). Set to a value > 1.0 to disable auto-approval.
+#
+# CALIBRATION (#81.8, 2026-06-11) against the 100-item human-labeled gold-set
+# (db.goldset_calibrate, ground_truth='chair'; 93 keep / 7 drop):
+#   conf>=0.80 -> precision 0.98, recall 0.53   <- current (errs safe)
+#   conf>=0.75 -> precision 0.96, recall 0.81
+#   conf>=0.70 -> precision 0.94, recall 0.94
+# 0.80 clears the >=0.90 precision target with margin, so we KEEP it — it errs
+# toward chair review (low recall = more items reviewed, never the reverse).
+# Two findings shape the policy:
+#  (a) self-confidence alone is well-calibrated for PRECISION; the rule-based
+#      validators do NOT discriminate keep/drop on the gold-set (P~0.1), so a
+#      "confidence x validators" combined score would only hurt — not adopted.
+#  (b) the real COVERAGE lever is the tri-model panel (halacha_panel_approve):
+#      unanimous-3/3 -> precision 0.988 at 95% coverage, dominating any single
+#      confidence threshold. Lowering this gate to ~0.75 is a governance
+#      tradeoff (more unreviewed auto-approvals, INV-G10) on thin evidence
+#      (7 negatives) -> deferred to chair/panel (TaskMaster #121), not changed here.
 HALACHA_AUTO_APPROVE_THRESHOLD = float(
    os.environ.get("HALACHA_AUTO_APPROVE_THRESHOLD", "0.80")
 )
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -5025,6 +5025,68 @@ async def goldset_score(batch: str = "default") -> dict:
    }


+async def goldset_calibrate(
+    batch: str = "default", ground_truth: str = "chair",
+    thresholds: tuple[float, ...] = (0.70, 0.75, 0.80, 0.85, 0.90, 0.95),
+) -> dict:
+    """Calibrate the halacha auto-approve gate against the gold-set (#81.8).
+
+    Against the gold-set ``is_holding`` labels, measures:
+      - confidence-threshold gate: for each T, precision (P[is_holding|conf≥T])
+        and recall (share of true keeps approved) — calibrates
+        ``HALACHA_AUTO_APPROVE_THRESHOLD``;
+      - panel policies: precision + coverage of auto-approving on the stored
+        tri-model votes (majority 2/3, unanimous 3/3).
+
+    ``ground_truth='chair'`` scores against HUMAN labels only — the panel votes
+    are an input to the panel-policy rows, so scoring them against the consensus
+    they produced would be circular; the human labels are the independent truth.
+    Returns a structured dict (INV-LRN3); no DB writes (read-only).
+    """
+    items = await goldset_list(batch)
+    if ground_truth == "chair":  # any other value falls through to "all labeled rows"
+        rows = [r for r in items if r.get("tagged_by") == "chair" and r.get("is_holding") is not None]
+    else:
+        rows = [r for r in items if r.get("is_holding") is not None]
+    conf_rows = [r for r in rows if r.get("confidence") is not None]  # sweep population
+    pos = sum(1 for r in conf_rows if r["is_holding"])  # true keeps = recall denominator
+
+    def _gate(t: float) -> dict:
+        appr = [r for r in conf_rows if float(r["confidence"]) >= t]  # inclusive at t
+        tp = sum(1 for r in appr if r["is_holding"])
+        prec = tp / len(appr) if appr else 0.0  # 0.0 (not None) when nothing clears t
+        rec = tp / pos if pos else 0.0  # 0.0 when there are no true keeps
+        return {"threshold": t, "approved": len(appr),
+                "precision": round(prec, 3), "recall": round(rec, 3)}
+
+    def _votes(r: dict) -> list[bool]:
+        return [r[k] for k in ("ai_is_holding", "ds_is_holding", "gm_is_holding")
+                if r.get(k) is not None]  # missing (None) votes are dropped
+
+    def _policy(unanimous: bool) -> dict:
+        decided = approved = tp = 0
+        for r in rows:  # all labeled rows, including those without a confidence
+            v = _votes(r)
+            if len(v) < (3 if unanimous else 2):  # too few votes: row is skipped
+                continue
+            keep = all(v) if unanimous else (sum(v) > len(v) - sum(v))  # tie => no
+            decided += 1  # feeds "coverage"; counted whether or not approved
+            if keep:
+                approved += 1
+                tp += int(bool(r["is_holding"]))
+        return {"approved": approved, "precision": round(tp / approved, 3) if approved else 0.0,
+                "coverage": round(decided / len(rows), 3) if rows else 0.0}
+
+    return {  # n/positives/negatives count conf_rows only, not rows
+        "batch": batch, "ground_truth": ground_truth,
+        "n": len(conf_rows), "positives": pos, "negatives": len(conf_rows) - pos,
+        "confidence_sweep": [_gate(t) for t in thresholds],
+        "current_threshold": config.HALACHA_AUTO_APPROVE_THRESHOLD,
+        "panel_majority_2of3": _policy(unanimous=False),
+        "panel_unanimous_3of3": _policy(unanimous=True),
+    }
+
+
 async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]:
    """Return all corroboration rows for one halacha, ordered by match_score DESC."""
    pool = await get_pool()