legal-ai/mcp-server/tests/test_goldset_panel_consensus.py

"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set.

Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py
(consensus vote, type consensus, Fleiss' kappa, Gwet's AC1, anonymization
masking). Fully OFFLINE — no DB, no model calls.
"""

from __future__ import annotations

import sys
from pathlib import Path

import pytest

# the script lives in ../scripts relative to mcp-server/
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
import goldset_panel_label as g  # noqa: E402


# ── consensus() ───────────────────────────────────────────────────────────────

@pytest.mark.parametrize(
    ("votes", "expected"),
    [
        # unanimous panels
        ([True, True, True], (True, "3/3")),
        ([False, False, False], (False, "3/3")),
        # two-against-one majorities
        ([True, True, False], (True, "2/3")),
        ([False, False, True], (False, "2/3")),
        # two valid votes tied 1-1 → chair
        ([True, False, None], (None, "split")),
        # fewer than two valid votes → chair
        ([True, None, None], (None, "incomplete")),
        ([None, None, None], (None, "incomplete")),
    ],
)
def test_consensus(votes, expected):
    """Each vote pattern must map to the expected ``(decision, tag)`` pair."""
    outcome = g.consensus(votes)
    assert outcome == expected


def test_split_writes_no_label():
    """One abstention plus a 1-1 disagreement must leave the decision unset
    and be tagged "split" (escalates to chair, G10)."""
    verdict, label = g.consensus([True, False, None])
    assert verdict is None
    assert label == "split"


# ── consensus_type() ──────────────────────────────────────────────────────────

def test_consensus_type_holding_majority():
    """Two 'holding' votes against one 'application' must give 'holding'
    when the is_holding consensus is True."""
    judge_types = ("holding", "holding", "application")
    per_judge = [{"type": kind} for kind in judge_types]
    result = g.consensus_type(per_judge, decided=True)
    assert result == "holding"


def test_consensus_type_constrained_to_is_holding():
    """With an is_holding=False consensus the chosen type must be one of
    application/obiter — the inconsistent 'holding' vote must not win."""
    allowed = {"application", "obiter"}
    per_judge = [{"type": kind} for kind in ("holding", "application", "obiter")]
    result = g.consensus_type(per_judge, decided=False)
    assert result in allowed


def test_consensus_type_undecided_is_blank():
    """When no is_holding decision exists (decided=None) the type is ''."""
    per_judge = [{"type": kind} for kind in ("holding", "application", "obiter")]
    result = g.consensus_type(per_judge, decided=None)
    assert result == ""


# ── fleiss_kappa() ────────────────────────────────────────────────────────────

def test_fleiss_kappa_perfect_agreement():
    """All-unanimous items (two rated 3/0, two rated 0/3) must give κ ≈ 1.0."""
    unanimous_first = [(3, 0)] * 2
    unanimous_second = [(0, 3)] * 2
    kappa = g.fleiss_kappa(unanimous_first + unanimous_second)
    assert kappa == pytest.approx(1.0)


def test_fleiss_kappa_disagreement_is_low():
    """Two non-unanimous items with opposite majorities must score below
    zero, i.e. worse than chance."""
    kappa = g.fleiss_kappa([(2, 1), (1, 2)])
    assert kappa is not None
    assert kappa < 0.0


def test_fleiss_kappa_ragged_returns_none():
    """Rows with differing rater totals (3 then 2) are not well-defined and
    must return None."""
    ragged_rows = [(3, 0), (1, 1)]
    result = g.fleiss_kappa(ragged_rows)
    assert result is None


def test_fleiss_kappa_empty_returns_none():
    """An empty list of rows must return None."""
    result = g.fleiss_kappa([])
    assert result is None


# ── gwet_ac1() ────────────────────────────────────────────────────────────────

def test_gwet_ac1_perfect_agreement():
    """All-unanimous items (two rated 3/0, two rated 0/3) must give AC1 ≈ 1.0."""
    unanimous_first = [(3, 0)] * 2
    unanimous_second = [(0, 3)] * 2
    ac1 = g.gwet_ac1(unanimous_first + unanimous_second)
    assert ac1 == pytest.approx(1.0)


def test_gwet_ac1_resolves_the_kappa_paradox():
    """The headline reason AC1 exists here: under a heavily skewed marginal
    (almost every item is_holding=True) Fleiss κ collapses to ~0 despite very
    high observed agreement, while AC1 correctly reports near-perfect.
    9 unanimous-yes items + 1 split → 93% observed agreement.

    Under the textbook formulas this input gives κ ≈ -0.03 and AC1 ≈ 0.93;
    the assertions below only pin the qualitative relationship.
    """
    rows = [(3, 0)] * 9 + [(2, 1)]
    kappa = g.fleiss_kappa(rows)
    ac1 = g.gwet_ac1(rows)
    # Both functions can return None (see the ragged/empty tests). Guard
    # explicitly so a None result fails as a readable assertion instead of a
    # TypeError raised from abs() or a None-vs-float comparison.
    assert kappa is not None, "fleiss_kappa returned None for uniform 3-rater rows"
    assert ac1 is not None, "gwet_ac1 returned None for uniform 3-rater rows"
    assert abs(kappa) < 0.1            # κ paradox: near zero
    assert ac1 > 0.9                   # AC1: almost-perfect, matching reality
    assert ac1 > kappa                 # AC1 strictly more faithful under skew


def test_gwet_ac1_ragged_and_empty_return_none():
    """Rows with differing rater totals, and an empty row list, must both
    return None."""
    ragged_rows = [(3, 0), (1, 1)]
    empty_rows = []
    for rows in (ragged_rows, empty_rows):
        assert g.gwet_ac1(rows) is None


# ── anonymize() ───────────────────────────────────────────────────────────────

def test_anonymize_masks_case_number_and_name():
    """Neither the case number nor the case name may survive masking, and
    ``g._FAKE_CASE`` must appear in the output."""
    case_no = "8125-09-24"
    party = 'פלוני בע"מ'
    source = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
    masked = g.anonymize(source, case_number=case_no, case_name=party)
    for identifier in (case_no, party):
        assert identifier not in masked
    assert g._FAKE_CASE in masked


def test_anonymize_no_identifiers_is_noop():
    """With no case number and no case name, the text comes back unchanged."""
    source = "כלל משפטי כללי ללא מזהים."
    masked = g.anonymize(source, case_number=None, case_name=None)
    assert masked == source


def test_anonymize_preserves_legal_substance():
    """Only the identifier is masked; the rule text itself must remain."""
    case_no = "9001-01-20"
    source = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
    masked = g.anonymize(source, case_number=case_no, case_name=None)
    assert "מיצוי הליכים הוא תנאי-סף" in masked
    assert case_no not in masked