All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
job תקופתי שסוגר את לולאת-הלמידה: מצליב את סבבי-הפאנל (FU-1, הצבעות+
נימוקים) מול הכרעות-היו"ר (FU-2 seeds), מזהה כשלים שיטתיים, ומציע
KEEP_SYSTEM v2 + exemplars מופשטים — כדוח-diff לעיון-היו"ר. לעולם לא
auto-applied.
- db.panel_rounds_vs_chair() — read-only LATERAL join: לכל הלכה עם seed
chair-live (FU-2, אמת אנושית) + סבב-פאנל אחרון (FU-1) → הצבעות+נימוקי-
3-השופטים מול keep/drop של היו"ר. הסיגנל היחיד = הכרעת-יו"ר, לא
הצבעות-הפאנל (anti-echo-chamber, INV-LRN1).
- scripts/halacha_rubric_distill.py:
• analyze_pairs() — ליבה דטרמיניסטית טהורה (offline-testable): false-keep
(פאנל שמר, יו"ר דחה), false-drop, פיצולים-שהוכרעו, שיעור-מחלוקת-עם-
היו"ר לכל שופט; בוחר ראיות-מחלוקת מכוסות.
• הצעת-LLM מקומית (claude_session, tools="", אפס עלות): מזהה דפוסי-כשל
ומציע נוסח-רובריקה v2 + exemplars מופשטים (INV-LRN5 — בלי מהות-תיק).
• כותב data/learning/rubric-proposal-<ts>.md עם diff(KEEP_SYSTEM→v2);
אף שורת-קוד לא משתנה. אימוץ = עריכה ידנית דרך PR (INV-LRN1).
• <12 זוגות → "אין מספיק נתונים" (מצב נוכחי: seeds עדיין מצטברים).
• --no-llm (סטטיסטיקה בלבד) / --limit N.
- tests/test_rubric_distill.py — 8 בדיקות offline על analyze_pairs.
- SCRIPTS.md עודכן. smoke read-only עבר (0 זוגות → insufficient-data).
תואם הדפוס הקיים (style_lesson_panel/halacha_panel_audit): פאנל מציע,
הטמעה נשארת שער-יו"ר ידני. Invariants: INV-LRN1 (propose-only) ·
INV-LRN5 (טוהר-רובריקה) · INV-G10 · anti-echo-chamber. בלי שער/UI חדש.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
88 lines
3.4 KiB
Python
88 lines
3.4 KiB
Python
"""Tests for #133 / FU-4 — rubric distillation from chair decisions.

Covers the PURE deterministic core (analyze_pairs): given (panel ⋈ chair) pairs
it must correctly classify the systematic-failure buckets — false-keep (panel
auto-kept, chair dropped), false-drop, chair-resolved splits — and the per-judge
disagreement-with-chair rate. Fully OFFLINE (no DB, no LLM). The LLM proposal
and the report rendering are exercised by the integration smoke run.

The invariant this locks down: the only label compared against is the chair's
human ruling — never the panel's own votes (echo-chamber guard, INV-LRN1).
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# the script lives in ../scripts relative to mcp-server/
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
|
|
import halacha_rubric_distill as rd # noqa: E402
|
|
|
|
|
|
def _pair(chair, verdict, action, c, ds, gm, rule="כלל כלשהו"):
|
|
return {
|
|
"chair_keep": chair, "verdict": verdict, "applied_action": action,
|
|
"rule_statement": rule,
|
|
"claude_vote": c, "claude_reason": "rc",
|
|
"deepseek_vote": ds, "deepseek_reason": "rd",
|
|
"gemini_vote": gm, "gemini_reason": "rg",
|
|
}
|
|
|
|
|
|
def test_false_keep_detected():
    """A panel auto-approval that the chair dropped is one false-keep (the costly error)."""
    row = _pair(
        chair=False,
        verdict="unanimous_yes",
        action="approved",
        c=True,
        ds=True,
        gm=True,
    )
    result = rd.analyze_pairs([row])
    assert result["n_false_keep"] == 1
    assert result["n_false_drop"] == 0
|
|
|
|
|
|
def test_nli_cleared_counts_as_keep():
    """An ``nli_cleared`` applied action that the chair dropped counts as a false-keep."""
    row = _pair(
        chair=False,
        verdict="unanimous_yes",
        action="nli_cleared",
        c=True,
        ds=True,
        gm=True,
    )
    result = rd.analyze_pairs([row])
    assert result["n_false_keep"] == 1
|
|
|
|
|
|
def test_false_drop_detected():
    """A panel rejection that the chair kept is one false-drop and no false-keep."""
    row = _pair(
        chair=True,
        verdict="unanimous_no",
        action="rejected",
        c=False,
        ds=False,
        gm=False,
    )
    result = rd.analyze_pairs([row])
    assert result["n_false_drop"] == 1
    assert result["n_false_keep"] == 0
|
|
|
|
|
|
def test_split_resolved_counted_not_a_false_decision():
    """A split escalates to the chair, so there is no auto-decision.

    It is therefore neither a false-keep nor a false-drop, but it IS a
    resolved split (learning signal).
    """
    row = _pair(
        chair=True,
        verdict="split",
        action="chair",
        c=True,
        ds=False,
        gm=None,
    )
    result = rd.analyze_pairs([row])
    assert result["n_splits_resolved"] == 1
    assert result["n_false_keep"] == 0
    assert result["n_false_drop"] == 0
|
|
|
|
|
|
def test_judge_disagreement_rate_vs_chair():
    """Disagreement is measured against the chair, never against the other judges."""
    # Row 1: claude agrees, deepseek disagrees, gemini abstains (None).
    abstaining = _pair(chair=True, verdict="split", action="chair", c=True, ds=False, gm=None)
    # Row 2: claude agrees, deepseek disagrees, gemini agrees.
    voting = _pair(chair=True, verdict="split", action="chair", c=True, ds=False, gm=True)

    per_judge = rd.analyze_pairs([abstaining, voting])["judge_stats"]

    assert per_judge["claude"]["disagree_rate"] == 0.0
    assert per_judge["deepseek"]["disagree_rate"] == 1.0
    # gemini voted once (agree) and abstained once → rate 0.0 over 1 vote
    gemini = per_judge["gemini"]
    assert gemini["voted"] == 1
    assert gemini["disagree_rate"] == 0.0
|
|
|
|
|
|
def test_none_chair_label_ignored():
    """A row whose chair label is None is counted in the total but adds no judge signal."""
    unlabeled = _pair(
        chair=None,
        verdict="split",
        action="chair",
        c=True,
        ds=False,
        gm=None,
    )
    result = rd.analyze_pairs([unlabeled])
    # The row still shows up in the overall pair count …
    assert result["n_pairs"] == 1
    # … but no vote is tallied for it.
    assert result["judge_stats"]["claude"]["voted"] == 0
|
|
|
|
|
|
def test_evidence_capped():
    """All false-keeps are counted, but the evidence list length equals ``rd.MAX_EVIDENCE``."""
    row_count = 40
    # Build distinct dict objects (not one dict repeated) for every row.
    rows = []
    for _ in range(row_count):
        rows.append(
            _pair(
                chair=False,
                verdict="unanimous_yes",
                action="approved",
                c=True,
                ds=True,
                gm=True,
            )
        )
    result = rd.analyze_pairs(rows)
    assert result["n_false_keep"] == 40
    assert len(result["evidence"]) == rd.MAX_EVIDENCE
|
|
|
|
|
|
def test_min_pairs_threshold_is_sane():
    """The minimum-pairs threshold constant must be at least one."""
    threshold = rd.MIN_PAIRS
    assert threshold >= 1
|