All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 5s
ריצת-הפאנל החיה חשפה Fleiss κ=-0.07 למרות 97.5% הסכמה-גסה (28/40 פה-אחד, 11/40 רוב). זה אינו חוסר-אמינות אלא **פרדוקס-הקאפא**: ה-marginal של is_holding מוטה קיצונית (≈הכול True, כמו 93/100 ה-keep בתוויות-האנוש), וכש-Pe→1 גם κ→0 (Feinstein & Cicchetti 1990, "high agreement, low kappa").

- gwet_ac1(): מדד הסכמה עמיד-שכיחות (Gwet 2008) — אותו Pa כמו Fleiss, אומדן-מקריות שונה (2·p·(1-p)). הופך לכותרת; Fleiss κ עדיין מודווח לשקיפות + raw 3/3.
- consensus-vs-HUMAN: כשקיים תיוג-יו"ר, הדוח מודד התאמת-הקונצנזוס מולו (תוקף חיצוני). אימות בפועל על 100 תוויות-היו"ר: 29/29 = 100% התאמה.

invariants: ללא שינוי בהתנהגות-הכתיבה; מטריקה בלבד.
tests: 21 (3 חדשות, כולל מקרה-פרדוקס מפורש).
מקור: Gwet 2008 (AC1) · Feinstein & Cicchetti 1990.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
130 lines
5.4 KiB
Python
130 lines
5.4 KiB
Python
"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set.
|
|
|
|
Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py
|
|
(consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully
|
|
OFFLINE — no DB, no model calls.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# the script lives in ../scripts relative to mcp-server/
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
|
|
import goldset_panel_label as g # noqa: E402
|
|
|
|
|
|
# ── consensus() ───────────────────────────────────────────────────────────────
|
|
|
|
@pytest.mark.parametrize(
    "votes,expected",
    [
        # unanimous panels
        ([True, True, True], (True, "3/3")),
        ([False, False, False], (False, "3/3")),
        # two-to-one majorities
        ([True, True, False], (True, "2/3")),
        ([False, False, True], (False, "2/3")),
        # the two valid votes disagree 1-1 -> no decision, goes to the chair
        ([True, False, None], (None, "split")),
        # a single valid judge is not enough -> goes to the chair
        ([True, None, None], (None, "incomplete")),
        # no valid votes at all
        ([None, None, None], (None, "incomplete")),
    ],
)
def test_consensus(votes, expected):
    """consensus() turns a list of three votes into a (decision, tag) pair."""
    outcome = g.consensus(votes)
    assert outcome == expected
|
|
|
|
|
|
def test_split_writes_no_label():
    """A real 1-1 split produces no decision; it is escalated to the chair (G10)."""
    verdict, label = g.consensus([True, False, None])
    # Checked separately so a failure report names the part that is wrong.
    assert verdict is None
    assert label == "split"
|
|
|
|
|
|
# ── consensus_type() ──────────────────────────────────────────────────────────
|
|
|
|
def test_consensus_type_holding_majority():
    """Two 'holding' votes out of three win when the consensus is is_holding=True."""
    ballots = [{"type": kind} for kind in ("holding", "holding", "application")]
    winner = g.consensus_type(ballots, decided=True)
    assert winner == "holding"
|
|
|
|
|
|
def test_consensus_type_constrained_to_is_holding():
    """With an is_holding=False consensus, the chosen type must be one of
    application/obiter; a contradictory 'holding' vote must not win."""
    eligible = {"application", "obiter"}
    ballots = [{"type": kind} for kind in ("holding", "application", "obiter")]
    chosen = g.consensus_type(ballots, decided=False)
    assert chosen in eligible
|
|
|
|
|
|
def test_consensus_type_undecided_is_blank():
    """No is_holding decision (decided=None) yields an empty type string."""
    ballots = [{"type": kind} for kind in ("holding", "application", "obiter")]
    chosen = g.consensus_type(ballots, decided=None)
    assert chosen == ""
|
|
|
|
|
|
# ── fleiss_kappa() ────────────────────────────────────────────────────────────
|
|
|
|
def test_fleiss_kappa_perfect_agreement():
    """Every item is unanimous (3/0 or 0/3), so kappa must be 1.0."""
    unanimous_yes = [(3, 0)] * 2
    unanimous_no = [(0, 3)] * 2
    kappa = g.fleiss_kappa(unanimous_yes + unanimous_no)
    assert kappa == pytest.approx(1.0)
|
|
|
|
|
|
def test_fleiss_kappa_disagreement_is_low():
    """Two items, each split 2-1 in opposite directions, score below chance."""
    kappa = g.fleiss_kappa([(2, 1), (1, 2)])
    assert kappa is not None
    assert kappa < 0.0  # worse than chance
|
|
|
|
|
|
def test_fleiss_kappa_ragged_returns_none():
    """Items with different rater counts (3, then 2) are not well-defined -> None."""
    ragged = [(3, 0), (1, 1)]
    result = g.fleiss_kappa(ragged)
    assert result is None
|
|
|
|
|
|
def test_fleiss_kappa_empty_returns_none():
    """An empty rating table has no kappa -> None."""
    result = g.fleiss_kappa([])
    assert result is None
|
|
|
|
|
|
# ── gwet_ac1() ────────────────────────────────────────────────────────────────
|
|
|
|
def test_gwet_ac1_perfect_agreement():
    """Every item is unanimous (3/0 or 0/3), so AC1 must be 1.0."""
    unanimous_yes = [(3, 0)] * 2
    unanimous_no = [(0, 3)] * 2
    score = g.gwet_ac1(unanimous_yes + unanimous_no)
    assert score == pytest.approx(1.0)
|
|
|
|
|
|
def test_gwet_ac1_resolves_the_kappa_paradox():
    """AC1 stays high where Fleiss' kappa collapses under a skewed marginal.

    Fixture: 9 unanimous-yes items plus one 2-1 split, three raters each.
    Mean pairwise observed agreement is (9 * 1 + 1/3) / 10, about 93%, yet
    almost every rating falls in one category, so Fleiss' chance-agreement
    term is nearly as large as the observed agreement and kappa lands near
    zero. AC1 is expected to report near-perfect agreement on the same data.
    """
    rows = [(3, 0)] * 9 + [(2, 1)]
    kappa = g.fleiss_kappa(rows)
    ac1 = g.gwet_ac1(rows)

    # Both metrics return None for inputs they cannot score (see the ragged /
    # empty tests). Guard explicitly so a regression shows up as an assertion
    # failure instead of a TypeError from abs(None) or `None > 0.9`.
    assert kappa is not None
    assert ac1 is not None

    assert abs(kappa) < 0.1  # kappa paradox: near zero
    assert ac1 > 0.9  # AC1: almost perfect, matching the raw agreement
    assert ac1 > kappa  # AC1 is the more faithful of the two under skew
|
|
|
|
|
|
def test_gwet_ac1_ragged_and_empty_return_none():
    """AC1 is None for mixed rater counts and for an empty table."""
    ragged_result = g.gwet_ac1([(3, 0), (1, 1)])
    assert ragged_result is None
    empty_result = g.gwet_ac1([])
    assert empty_result is None
|
|
|
|
|
|
# ── anonymize() ───────────────────────────────────────────────────────────────
|
|
|
|
def test_anonymize_masks_case_number_and_name():
    """The case number and the party name both disappear from the output,
    and the fake-case placeholder shows up in their place."""
    source = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
    masked = g.anonymize(source, case_number="8125-09-24", case_name='פלוני בע"מ')
    # Number first, then name: same order as the checks have always run.
    for identifier in ("8125-09-24", 'פלוני בע"מ'):
        assert identifier not in masked
    assert g._FAKE_CASE in masked
|
|
|
|
|
|
def test_anonymize_no_identifiers_is_noop():
    """With no case number and no case name the text comes back unchanged."""
    source = "כלל משפטי כללי ללא מזהים."
    masked = g.anonymize(source, case_number=None, case_name=None)
    assert masked == source
|
|
|
|
|
|
def test_anonymize_preserves_legal_substance():
    """Only the identifier is replaced; the wording of the rule survives intact."""
    source = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
    masked = g.anonymize(source, case_number="9001-01-20", case_name=None)
    rule_survives = "מיצוי הליכים הוא תנאי-סף" in masked
    assert rule_survives
    assert "9001-01-20" not in masked
|