Merge pull request 'feat(halacha): #81.7 — תיוג gold-set בקונצנזוס תלת-מודלי (Opus+DeepSeek+Gemini), κ + אנונימיזציה' (#188) from worktree-goldset-tri-model-consensus into main
This commit was merged in pull request #188.
This commit is contained in:
104
mcp-server/tests/test_goldset_panel_consensus.py
Normal file
104
mcp-server/tests/test_goldset_panel_consensus.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set.
|
||||
|
||||
Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py
|
||||
(consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully
|
||||
OFFLINE — no DB, no model calls.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# the script lives in ../scripts relative to mcp-server/
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
|
||||
import goldset_panel_label as g # noqa: E402
|
||||
|
||||
|
||||
# ── consensus() ───────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.mark.parametrize(
    "votes,expected",
    [
        # unanimous panels
        ([True, True, True], (True, "3/3")),
        ([False, False, False], (False, "3/3")),
        # two-to-one majorities
        ([True, True, False], (True, "2/3")),
        ([False, False, True], (False, "2/3")),
        # 1-1 between the two valid votes -> escalates to the chair
        ([True, False, None], (None, "split")),
        # only a single judge answered -> escalates to the chair
        ([True, None, None], (None, "incomplete")),
        ([None, None, None], (None, "incomplete")),
    ],
)
def test_consensus(votes, expected):
    """Check that consensus() yields the expected (decision, tag) pair."""
    outcome = g.consensus(votes)
    assert outcome == expected
|
||||
|
||||
|
||||
def test_split_writes_no_label():
    """A real 1-1 split must produce no decision; it escalates to the chair (G10)."""
    panel_votes = [True, False, None]
    verdict, label = g.consensus(panel_votes)
    assert verdict is None
    assert label == "split"
|
||||
|
||||
|
||||
# ── consensus_type() ──────────────────────────────────────────────────────────
|
||||
|
||||
def test_consensus_type_holding_majority():
    """Two 'holding' votes out of three win when the decision is True."""
    judge_types = ("holding", "holding", "application")
    per = [{"type": kind} for kind in judge_types]
    chosen = g.consensus_type(per, decided=True)
    assert chosen == "holding"
|
||||
|
||||
|
||||
def test_consensus_type_constrained_to_is_holding():
    """With a consensus of is_holding=False only application/obiter qualify.

    A 'holding' type vote that contradicts the decision must be ignored.
    """
    eligible = {"application", "obiter"}
    per = [{"type": kind} for kind in ("holding", "application", "obiter")]
    out = g.consensus_type(per, decided=False)
    assert out in eligible
|
||||
|
||||
|
||||
def test_consensus_type_undecided_is_blank():
    """With no decision (decided=None) the consensus type is the empty string."""
    per = [{"type": kind} for kind in ("holding", "application", "obiter")]
    chosen = g.consensus_type(per, decided=None)
    assert chosen == ""
|
||||
|
||||
|
||||
# ── fleiss_kappa() ────────────────────────────────────────────────────────────
|
||||
|
||||
def test_fleiss_kappa_perfect_agreement():
    """Items rated only 3/0 or 0/3 give a kappa of 1.0."""
    all_yes = (3, 0)
    all_no = (0, 3)
    rows = [all_yes, all_yes, all_no, all_no]
    kappa = g.fleiss_kappa(rows)
    assert kappa == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_fleiss_kappa_disagreement_is_low():
    """Two items rated 2/1 and 1/2 must score below zero (worse than chance)."""
    rows = [(2, 1), (1, 2)]
    kappa = g.fleiss_kappa(rows)
    assert kappa is not None
    assert kappa < 0.0
|
||||
|
||||
|
||||
def test_fleiss_kappa_ragged_returns_none():
    """Mixed rater counts (three, then two) are not well-defined, so expect None."""
    ragged_rows = [(3, 0), (1, 1)]
    kappa = g.fleiss_kappa(ragged_rows)
    assert kappa is None
|
||||
|
||||
|
||||
def test_fleiss_kappa_empty_returns_none():
    """An empty rating table gives None."""
    kappa = g.fleiss_kappa([])
    assert kappa is None
|
||||
|
||||
|
||||
# ── anonymize() ───────────────────────────────────────────────────────────────
|
||||
|
||||
def test_anonymize_masks_case_number_and_name():
    """Both the case number and the party name are removed; the fake case id appears."""
    case_no = "8125-09-24"
    party = 'פלוני בע"מ'
    text = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
    out = g.anonymize(text, case_number=case_no, case_name=party)
    # neither real identifier may survive masking
    for identifier in (case_no, party):
        assert identifier not in out
    assert g._FAKE_CASE in out
|
||||
|
||||
|
||||
def test_anonymize_no_identifiers_is_noop():
    """With no case number and no case name the text comes back unchanged."""
    text = "כלל משפטי כללי ללא מזהים."
    masked = g.anonymize(text, case_number=None, case_name=None)
    assert masked == text
|
||||
|
||||
|
||||
def test_anonymize_preserves_legal_substance():
    """Only the identifier is swapped out; the rule text itself stays intact."""
    case_no = "9001-01-20"
    text = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
    out = g.anonymize(text, case_number=case_no, case_name=None)
    assert case_no not in out
    assert "מיצוי הליכים הוא תנאי-סף" in out
|
||||
Reference in New Issue
Block a user