feat(halacha): #81.7 — gold-set labeled by tri-model consensus (Opus+DeepSeek+Gemini)
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s
מבטל את ה-man-in-the-loop בתיוג ה-gold-set (הנחיית-יו"ר 2026-06-11): במקום תיוג ידני של חיים/דפנה, אמת-המידה נקבעת בקונצנזוס שלוש שושלות-מודל עצמאיות — אותו פאנל שמערכת האישור החיה כבר משתמשת בו (halacha_panel_approve), עם 92% הסכמה חוצת-מודלים על הציר הגס.

למה לא מעגלי: הוולידטורים הנמדדים ב-#81.8 (compute_quality_flags / is_fact_dependent / is_quote_truncated / is_thin_restatement) הם היוריסטיקות **rule-based** — משפחת-שיטה שונה מה-LLM-judges.

שני שומרי-יושר:
(1) פיצול-קולות (אין רוב 2/3) לא כותב לייבל — הפריט נשאר NULL ומוסלם ליו"ר (INV-G10);
(2) מבחן-אנונימיזציה — שיפוט-מחדש עם מזהה-התיק ממוסך, flip בקונצנזוס = שינון ולא הנמקה (arXiv:2505.02172).

- db.py: עמודות per-lineage (ds_*/gm_*; ai_*=claude קיים) + consensus/agreement/anon + goldset_set_panel_label() שכותב רוב-2/3 ל-is_holding/correct_type (tagged_by='panel:…', לא דורס tagged_by='chair'). goldset_score נשאר ללא שינוי — קורא is_holding (G2, אין מסלול ניקוד מקביל). עדכון הערת-הסכמה (בוטלה דרישת "MUST be human").
- scripts/goldset_panel_label.py: 3 שופטים (מיובאים מ-halacha_panel_approve, מקור-אמת יחיד) + prompt עשיר (מיובא מ-goldset_ai_recommend) + Fleiss κ + מבחן-אנונימיזציה. דוח→data/audit/.
- SCRIPTS.md: סקריפט חדש; goldset_ai_recommend/independent_judge מסומנים single-model נבלעים.

invariants: G2 (שופטים+prompt מיובאים, אין כפילות; ניקוד יחיד) · INV-G10 (פיצול→יו"ר) · INV-LRN2/LRN3 (איכות-במקור, לכידה מובנית).
מקור: PoLL · Trust-or-Escalate (ICLR 2025) · arXiv:2505.02172.
tests: 18 offline (consensus/type/Fleiss-κ/anonymize). live labeling = צעד תפעולי אחרי deploy.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
104
mcp-server/tests/test_goldset_panel_consensus.py
Normal file
104
mcp-server/tests/test_goldset_panel_consensus.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set.
|
||||
|
||||
Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py
|
||||
(consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully
|
||||
OFFLINE — no DB, no model calls.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# the script lives in ../scripts relative to mcp-server/
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
|
||||
import goldset_panel_label as g # noqa: E402
|
||||
|
||||
|
||||
# ── consensus() ───────────────────────────────────────────────────────────────
|
||||
|
||||
# Each entry pairs a list of three judge votes (True / False / None for a
# missing vote) with the (decision, agreement-tag) tuple expected back from
# g.consensus().
_CONSENSUS_CASES = [
    ([True, True, True], (True, "3/3")),
    ([False, False, False], (False, "3/3")),
    ([True, True, False], (True, "2/3")),
    ([False, False, True], (False, "2/3")),
    ([True, False, None], (None, "split")),  # 1-1 of the two valid → chair
    ([True, None, None], (None, "incomplete")),  # only one judge → chair
    ([None, None, None], (None, "incomplete")),
]


@pytest.mark.parametrize(("votes", "expected"), _CONSENSUS_CASES)
def test_consensus(votes, expected):
    """Check that g.consensus() maps each vote list to its expected tuple.

    The table covers unanimous votes, 2/3 majorities, a 1-1 split with one
    missing vote, and lists with fewer than two valid votes.
    """
    outcome = g.consensus(votes)
    assert outcome == expected
|
||||
|
||||
|
||||
def test_split_writes_no_label():
    """A 1-1 split with one missing vote yields no decision.

    The undecided result carries the "split" tag (escalates to chair, G10).
    """
    result = g.consensus([True, False, None])
    decided, tag = result
    assert decided is None
    assert tag == "split"
|
||||
|
||||
|
||||
# ── consensus_type() ──────────────────────────────────────────────────────────
|
||||
|
||||
def test_consensus_type_holding_majority():
    """Two 'holding' votes beat one 'application' vote when decided=True."""
    judge_types = ("holding", "holding", "application")
    per = [{"type": label} for label in judge_types]
    chosen = g.consensus_type(per, decided=True)
    assert chosen == "holding"
|
||||
|
||||
|
||||
def test_consensus_type_constrained_to_is_holding():
    """With decided=False the result must be 'application' or 'obiter'.

    A 'holding' type vote is inconsistent with an is_holding=False consensus
    and must not be returned.
    """
    judge_types = ("holding", "application", "obiter")
    per = [{"type": label} for label in judge_types]
    allowed = {"application", "obiter"}
    out = g.consensus_type(per, decided=False)
    assert out in allowed
|
||||
|
||||
|
||||
def test_consensus_type_undecided_is_blank():
    """With no consensus decision (decided=None) the type is the empty string."""
    judge_types = ("holding", "application", "obiter")
    per = [{"type": label} for label in judge_types]
    chosen = g.consensus_type(per, decided=None)
    assert chosen == ""
|
||||
|
||||
|
||||
# ── fleiss_kappa() ────────────────────────────────────────────────────────────
|
||||
|
||||
def test_fleiss_kappa_perfect_agreement():
    """Unanimous ratings on every item give a kappa of 1.0."""
    # every item rated 3/0 or 0/3 → κ == 1.0
    unanimous_yes = (3, 0)
    unanimous_no = (0, 3)
    rows = [unanimous_yes, unanimous_yes, unanimous_no, unanimous_no]
    kappa = g.fleiss_kappa(rows)
    assert kappa == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_fleiss_kappa_disagreement_is_low():
    """Items split 2-1 in opposite directions score below zero."""
    rows = [(2, 1), (1, 2)]
    k = g.fleiss_kappa(rows)
    assert k is not None
    assert k < 0.0  # worse than chance
|
||||
|
||||
|
||||
def test_fleiss_kappa_ragged_returns_none():
    """Rows with different rater totals make fleiss_kappa() return None."""
    # mixed rater counts (3 then 2) is not well-defined → None
    ragged_rows = [(3, 0), (1, 1)]
    kappa = g.fleiss_kappa(ragged_rows)
    assert kappa is None
|
||||
|
||||
|
||||
def test_fleiss_kappa_empty_returns_none():
    """An empty rating table makes fleiss_kappa() return None."""
    no_rows = []
    kappa = g.fleiss_kappa(no_rows)
    assert kappa is None
|
||||
|
||||
|
||||
# ── anonymize() ───────────────────────────────────────────────────────────────
|
||||
|
||||
def test_anonymize_masks_case_number_and_name():
    """The case number and case name are both removed from the output.

    The output must also contain the g._FAKE_CASE placeholder.
    """
    case_number = "8125-09-24"
    case_name = 'פלוני בע"מ'
    text = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
    masked = g.anonymize(text, case_number=case_number, case_name=case_name)
    assert case_number not in masked
    assert case_name not in masked
    assert g._FAKE_CASE in masked
|
||||
|
||||
|
||||
def test_anonymize_no_identifiers_is_noop():
    """With both identifiers set to None the text comes back unchanged."""
    text = "כלל משפטי כללי ללא מזהים."
    masked = g.anonymize(text, case_number=None, case_name=None)
    assert masked == text
|
||||
|
||||
|
||||
def test_anonymize_preserves_legal_substance():
    """Masking removes the case number but keeps the rule text intact."""
    case_number = "9001-01-20"
    rule_text = "מיצוי הליכים הוא תנאי-סף"
    text = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
    masked = g.anonymize(text, case_number=case_number, case_name=None)
    assert rule_text in masked
    assert case_number not in masked
|
||||
Reference in New Issue
Block a user