"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set. Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py (consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully OFFLINE — no DB, no model calls. """ from __future__ import annotations import sys from pathlib import Path import pytest # the script lives in ../scripts relative to mcp-server/ sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts")) import goldset_panel_label as g # noqa: E402 # ── consensus() ─────────────────────────────────────────────────────────────── @pytest.mark.parametrize("votes,expected", [ ([True, True, True], (True, "3/3")), ([False, False, False], (False, "3/3")), ([True, True, False], (True, "2/3")), ([False, False, True], (False, "2/3")), ([True, False, None], (None, "split")), # 1-1 of the two valid → chair ([True, None, None], (None, "incomplete")), # only one judge → chair ([None, None, None], (None, "incomplete")), ]) def test_consensus(votes, expected): assert g.consensus(votes) == expected def test_split_writes_no_label(): """A genuine 1-1 split must NOT yield a decision (escalates to chair, G10).""" decided, tag = g.consensus([True, False, None]) assert decided is None and tag == "split" # ── consensus_type() ────────────────────────────────────────────────────────── def test_consensus_type_holding_majority(): per = [{"type": "holding"}, {"type": "holding"}, {"type": "application"}] assert g.consensus_type(per, decided=True) == "holding" def test_consensus_type_constrained_to_is_holding(): """When the consensus is is_holding=False, only application/obiter types are eligible — an inconsistent 'holding' vote is ignored.""" per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}] out = g.consensus_type(per, decided=False) assert out in {"application", "obiter"} def test_consensus_type_undecided_is_blank(): per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}] assert g.consensus_type(per, decided=None) == "" # ── fleiss_kappa() ──────────────────────────────────────────────────────────── def test_fleiss_kappa_perfect_agreement(): # every item rated 3/0 or 0/3 → κ == 1.0 rows = [(3, 0), (3, 0), (0, 3), (0, 3)] assert g.fleiss_kappa(rows) == pytest.approx(1.0) def test_fleiss_kappa_disagreement_is_low(): rows = [(2, 1), (1, 2)] k = g.fleiss_kappa(rows) assert k is not None and k < 0.0 # worse than chance def test_fleiss_kappa_ragged_returns_none(): # mixed rater counts (3 then 2) is not well-defined → None assert g.fleiss_kappa([(3, 0), (1, 1)]) is None def test_fleiss_kappa_empty_returns_none(): assert g.fleiss_kappa([]) is None # ── anonymize() ─────────────────────────────────────────────────────────────── def test_anonymize_masks_case_number_and_name(): text = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..." out = g.anonymize(text, case_number="8125-09-24", case_name='פלוני בע"מ') assert "8125-09-24" not in out assert 'פלוני בע"מ' not in out assert g._FAKE_CASE in out def test_anonymize_no_identifiers_is_noop(): text = "כלל משפטי כללי ללא מזהים." assert g.anonymize(text, case_number=None, case_name=None) == text def test_anonymize_preserves_legal_substance(): """Masking swaps only the identifier — the rule text is untouched.""" text = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)" out = g.anonymize(text, case_number="9001-01-20", case_name=None) assert "מיצוי הליכים הוא תנאי-סף" in out assert "9001-01-20" not in out