All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
כיול סף-האישור-האוטומטי מול ה-100 תוויות-היו"ר (93 keep / 7 drop), אמת אנושית (לא
הקונצנזוס — מונע מעגליות):
conf≥0.80 → P=0.98 R=0.53 ← נוכחי (errs safe)
conf≥0.75 → P=0.96 R=0.81
conf≥0.70 → P=0.94 R=0.94
panel unanimous-3/3 → P=0.988 cov=95% · majority-2/3 → P=0.948 cov=100%
הכרעה: **לשמר 0.80** — עומד ביעד precision≥0.90 עם מרווח, וטועה לכיוון היו"ר
(recall נמוך = יותר סקירה, לא פחות). שני ממצאים:
(א) self-confidence מכויל היטב ל-precision; הוולידטורים ה-rule-based לא-מבחינים
על ה-gold-set (P≈0.1) → "confidence × validators" רק יזיק, לא אומץ (תשובה ל-#81.8).
(ב) מנוף-הכיסוי האמיתי = הפאנל התלת-מודלי (unanimous 0.988/95%), לא סף-confidence נמוך.
הורדת השער ל-0.75 = tradeoff governance (יותר auto-approve לא-מסוקר, INV-G10) על
ראיה דקה (7 שליליים) → נדחה ליו"ר/פאנל (#121), לא שונה כאן.
- db.goldset_calibrate(): sweep-confidence + panel-policy precision/coverage מול הזהב,
read-only, משוחזר (INV-LRN3). ground_truth='chair' default (אנטי-מעגליות).
- config: הערת HALACHA_AUTO_APPROVE_THRESHOLD מעודכנת לממצא-הכיול (במקום spot-check-of-10).
invariants: INV-G10 (לא הורדנו את השער הלא-מסוקר) · INV-LRN2/LRN3 (כיול מתועד במקור, מובנה).
tests: 4 offline (sweep/policies/anti-circularity/threshold-surfaced). אומת חי: משחזר את המספרים.
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
80 lines
2.8 KiB
Python
80 lines
2.8 KiB
Python
"""Tests for #81.8 — db.goldset_calibrate (auto-approve gate calibration).

Verifies the confidence-threshold sweep and the panel-policy precision/coverage
against synthetic gold-set rows. Fully OFFLINE — monkeypatches db.goldset_list,
no Postgres.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
|
|
import pytest
|
|
|
|
from legal_mcp.services import db
|
|
|
|
|
|
def _item(tag, keep, conf, votes):
|
|
c, d, g = votes
|
|
return {
|
|
"tagged_by": tag, "is_holding": keep, "confidence": conf,
|
|
"ai_is_holding": c, "ds_is_holding": d, "gm_is_holding": g,
|
|
}
|
|
|
|
|
|
# Synthetic gold set shared by every test in this module.
# A,B,C,D are chair-labeled; E is panel-labeled (excluded under ground_truth='chair').
ITEMS = [
    _item("chair", True, 0.90, (True, True, True)),  # A unanimous keep
    _item("chair", True, 0.80, (True, True, False)),  # B majority keep
    _item("chair", False, 0.60, (False, False, False)),  # C unanimous drop
    _item("chair", True, 0.75, (True, True, True)),  # D unanimous keep
    _item("panel:opus+deepseek+gemini", False, 0.99, (False, False, False)),  # E excluded
]
|
|
|
|
|
|
@pytest.fixture()
def patched(monkeypatch: pytest.MonkeyPatch) -> None:
    """Swap ``db.goldset_list`` for an offline stub that serves ``ITEMS``.

    The stub is a coroutine function that accepts an optional ``batch``
    argument, ignores it, and returns a new list copy of ``ITEMS`` on each
    call. The replacement is installed through ``monkeypatch.setattr``.
    """

    async def _fake(batch="default"):
        # Hand out a copy so a caller mutating the list cannot affect ITEMS.
        return list(ITEMS)

    monkeypatch.setattr(db, "goldset_list", _fake)
|
|
|
|
|
|
def _run(coro):
|
|
loop = asyncio.new_event_loop()
|
|
try:
|
|
return loop.run_until_complete(coro)
|
|
finally:
|
|
loop.close()
|
|
|
|
|
|
def test_ground_truth_chair_excludes_panel_rows(patched):
    """With ground_truth='chair' only the four chair-labeled rows are counted."""
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))
    # Separate asserts so a failure reports which count is off.
    assert r["n"] == 4
    assert r["positives"] == 3
    assert r["negatives"] == 1
|
|
|
|
|
|
def test_confidence_sweep_precision_recall(patched):
    """Check approved count, precision and recall at thresholds 0.80 and 0.70."""
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))
    # Index the sweep rows by threshold rounded to two decimals for lookup.
    sweep = {round(g["threshold"], 2): g for g in r["confidence_sweep"]}

    # T=0.80 approves A(0.90)+B(0.80) → both keep → P=1.0, recall 2/3
    at_080 = sweep[0.80]
    assert at_080["approved"] == 2
    assert at_080["precision"] == 1.0
    assert at_080["recall"] == pytest.approx(0.667, abs=0.01)

    # T=0.70 approves A,B,D (C's 0.60 excluded) → all keep → P=1.0, recall 1.0
    at_070 = sweep[0.70]
    assert at_070["approved"] == 3
    assert at_070["recall"] == 1.0
|
|
|
|
|
|
def test_panel_policies(patched):
    """Check approved/precision/coverage for the majority and unanimous policies."""
    r = _run(db.goldset_calibrate("default", ground_truth="chair"))

    # majority: A,B,D keep / C drop → approves 3, all keep, full coverage
    maj = r["panel_majority_2of3"]
    assert maj["approved"] == 3
    assert maj["precision"] == 1.0
    assert maj["coverage"] == 1.0

    # unanimous: A,D approved (B is T,T,F → not unanimous), all decided → P=1.0
    un = r["panel_unanimous_3of3"]
    assert un["approved"] == 2
    assert un["precision"] == 1.0
    assert un["coverage"] == 1.0
|
|
|
|
|
|
def test_current_threshold_surfaced(patched):
    """The report's current_threshold equals the configured auto-approve threshold."""
    r = _run(db.goldset_calibrate("default"))
    assert r["current_threshold"] == db.config.HALACHA_AUTO_APPROVE_THRESHOLD
|