From 5f93c7492f5820130f1419f8d2ea87205f56a3c2 Mon Sep 17 00:00:00 2001
From: Chaim
Date: Thu, 11 Jun 2026 16:13:24 +0000
Subject: [PATCH] =?UTF-8?q?fix(halacha):=20#81.7=20=E2=80=94=20report=20Gw?=
 =?UTF-8?q?et=20AC1=20+=20consensus-vs-human=20(=CE=BA=20paradox=20under?=
 =?UTF-8?q?=20skew)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ריצת-הפאנל החיה חשפה Fleiss κ=-0.07 למרות 97.5% הסכמה-גסה (28/40 פה-אחד, 11/40 רוב).
זה אינו חוסר-אמינות אלא **פרדוקס-הקאפא**: ה-marginal של is_holding מוטה קיצונית (≈הכול True, כמו 93/100 ה-keep בתוויות-האנוש), וכש-Pe→1 גם κ→0 (Feinstein & Cicchetti 1990, "high agreement, low kappa").

- gwet_ac1(): מדד הסכמה עמיד-שכיחות (Gwet 2008) — אותו Pa כמו Fleiss, אומדן-מקריות שונה (2·p·(1-p)). הופך לכותרת; Fleiss κ עדיין מודווח לשקיפות + raw 3/3.
- consensus-vs-HUMAN: כשקיים תיוג-יו"ר, הדוח מודד התאמת-הקונצנזוס מולו (תוקף חיצוני). אימות בפועל על 100 תוויות-היו"ר: 29/29 = 100% התאמה.

invariants: ללא שינוי בהתנהגות-הכתיבה; מטריקה בלבד.
tests: 21 (3 חדשות, כולל מקרה-פרדוקס מפורש).
מקור: Gwet 2008 (AC1) · Feinstein & Cicchetti 1990.

Co-Authored-By: Claude Opus 4.8
---
 .../tests/test_goldset_panel_consensus.py     | 25 ++++++++
 scripts/goldset_panel_label.py                | 61 +++++++++++++++++--
 2 files changed, 80 insertions(+), 6 deletions(-)

diff --git a/mcp-server/tests/test_goldset_panel_consensus.py b/mcp-server/tests/test_goldset_panel_consensus.py
index 58c37d9..1f67403 100644
--- a/mcp-server/tests/test_goldset_panel_consensus.py
+++ b/mcp-server/tests/test_goldset_panel_consensus.py
@@ -81,6 +81,31 @@ def test_fleiss_kappa_empty_returns_none():
     assert g.fleiss_kappa([]) is None
 
 
+# ── gwet_ac1() ────────────────────────────────────────────────────────────────
+
+def test_gwet_ac1_perfect_agreement():
+    rows = [(3, 0), (3, 0), (0, 3), (0, 3)]
+    assert g.gwet_ac1(rows) == pytest.approx(1.0)
+
+
+def test_gwet_ac1_resolves_the_kappa_paradox():
+    """The headline reason AC1 exists here: under a heavily skewed marginal
+    (almost every item is_holding=True) Fleiss κ collapses to ~0 despite very
+    high observed agreement, while AC1 correctly reports near-perfect.
+    9 unanimous-yes items + 1 split → 93% observed agreement."""
+    rows = [(3, 0)] * 9 + [(2, 1)]
+    kappa = g.fleiss_kappa(rows)
+    ac1 = g.gwet_ac1(rows)
+    assert abs(kappa) < 0.1  # κ paradox: near zero
+    assert ac1 > 0.9  # AC1: almost-perfect, matching reality
+    assert ac1 > kappa  # AC1 strictly more faithful under skew
+
+
+def test_gwet_ac1_ragged_and_empty_return_none():
+    assert g.gwet_ac1([(3, 0), (1, 1)]) is None
+    assert g.gwet_ac1([]) is None
+
+
 # ── anonymize() ───────────────────────────────────────────────────────────────
 
 def test_anonymize_masks_case_number_and_name():
diff --git a/scripts/goldset_panel_label.py b/scripts/goldset_panel_label.py
index ce2b761..5e5b090 100644
--- a/scripts/goldset_panel_label.py
+++ b/scripts/goldset_panel_label.py
@@ -119,6 +119,32 @@ def fleiss_kappa(rows: list[tuple[int, int]]) -> float | None:
     return (Pbar - Pe) / (1 - Pe)
 
 
+def gwet_ac1(rows: list[tuple[int, int]]) -> float | None:
+    """Gwet's AC1 for binary ratings given as (yes_count, no_count) per item.
+
+    Reported ALONGSIDE Fleiss' κ because κ suffers the "kappa paradox": under a
+    skewed marginal (here almost every halacha is is_holding=True) κ collapses
+    toward 0 even at very high observed agreement, since its chance term Pe is
+    also near 1 (Gwet 2008; Feinstein & Cicchetti 1990). AC1 keeps Fleiss' Pa
+    and only swaps the chance term for 2·p·(1-p), which stays ≤ 0.5.
+
+    Returns None when no item has ratings, raters < 2, or counts are ragged."""
+    rows = [(y, no) for (y, no) in rows if (y + no) > 0]
+    N = len(rows)
+    if N == 0:
+        return None
+    n = rows[0][0] + rows[0][1]
+    if n < 2 or any((y + no) != n for (y, no) in rows):
+        return None
+    Pa = sum((y * (y - 1) + no * (no - 1)) / (n * (n - 1)) for (y, no) in rows) / N
+    p_yes = sum(y for (y, _) in rows) / (N * n)
+    p_no = 1.0 - p_yes
+    Pe = 2 * p_yes * p_no  # q=2 categories: (1/(q-1))·Σ π_k(1-π_k) = 2·p·(1-p)
+    # Binary Pe = 2·p·(1-p) never exceeds 0.5, so the denominator below is
+    # always ≥ 0.5 — no division-by-zero guard is needed here.
+    return (Pa - Pe) / (1 - Pe)
+
+
 # ── anonymization probe (pure — unit-tested) ──────────────────────────────────
 
 _FAKE_CASE = "12345-67-89"
@@ -186,10 +212,11 @@ async def main(args: argparse.Namespace) -> int:
     tags: Counter = Counter()
     kappa_rows: list[tuple[int, int]] = []
     anon_checked = anon_stable = 0
+    chair_overlap = chair_agree = 0  # consensus vs existing human label (external validity)
 
     async with httpx.AsyncClient() as client:
         async def run(i: int, it: dict) -> None:
-            nonlocal anon_checked, anon_stable
+            nonlocal anon_checked, anon_stable, chair_overlap, chair_agree
             async with sem:
                 user = _prompt(it)
                 per, decided, tag = await panel_pass(client, user)
@@ -217,6 +244,11 @@ async def main(args: argparse.Namespace) -> int:
                 if len(nv) == 3:
                     y = sum(1 for m in nv if m["is_holding"])
                     kappa_rows.append((y, 3 - y))
+                # external validity: where a human already labeled, does consensus match?
+                if (it.get("tagged_by") == "chair" and it.get("is_holding") is not None
+                        and decided is not None):
+                    chair_overlap += 1
+                    chair_agree += int(decided == it["is_holding"])
                 mark = {"3/3": "✓✓✓", "2/3": "✓✓", "split": "⚖", "incomplete": "…"}[tag]
                 astr = "" if anon_st is None else (" anon✓" if anon_st else " anon✗FLIP")
                 print(f"[{i}/{len(todo)}] {it.get('case_number')}: {mark} {tag} "
@@ -227,7 +259,15 @@ async def main(args: argparse.Namespace) -> int:
             await asyncio.gather(*tasks[j : j + args.concurrency])
 
     kappa = fleiss_kappa(kappa_rows)
+    ac1 = gwet_ac1(kappa_rows)
+    raw_agree = (sum(1 for (y, n) in kappa_rows if y == 0 or n == 0) / len(kappa_rows)
+                 if kappa_rows else None)  # share of items with unanimous 3/3
     decided_n = tags["3/3"] + tags["2/3"]
+
+    def interp(k: float) -> str:
+        return ("almost-perfect" if k >= 0.8 else "substantial" if k >= 0.6
+                else "moderate" if k >= 0.4 else "fair/poor")
+
     print("\n" + "=" * 60)
     print(f"PANEL LABELING — gold-set '{args.batch}'")
     print("=" * 60)
@@ -236,10 +276,17 @@ async def main(args: argparse.Namespace) -> int:
     print(f" ⚖ split→chair : {tags['split']}")
     print(f" … incomplete : {tags['incomplete']}")
     print(f" DECIDED (labels written): {decided_n}/{len(todo)}")
-    if kappa is not None:
-        interp = ("almost-perfect" if kappa >= 0.8 else "substantial" if kappa >= 0.6
-                  else "moderate" if kappa >= 0.4 else "fair/poor")
-        print(f" Fleiss κ (3 raters, is_holding, n={len(kappa_rows)}): {kappa:.3f} ({interp})")
+    if kappa_rows:
+        # AC1 (prevalence-robust) is the headline: the is_holding marginal is heavily
+        # skewed (≈all True), where Fleiss κ hits the "kappa paradox" (near-zero κ).
+        kappa_s = "n/a" if kappa is None else f"{kappa:.3f} ({interp(kappa)})"  # κ may be None
+        print(f" inter-model agreement (n={len(kappa_rows)}, 3 raters, is_holding):")
+        print(f" Gwet AC1 : {ac1:.3f} ({interp(ac1)}) ← headline (skew-robust)")
+        print(f" Fleiss κ : {kappa_s} [paradox under skew — see code]")
+        print(f" raw 3/3 : {raw_agree:.1%} unanimous")
+    if chair_overlap:
+        print(f" consensus vs HUMAN labels (external validity): "
+              f"{chair_agree}/{chair_overlap} = {chair_agree / chair_overlap:.1%}")
     if anon_checked:
         rate = anon_stable / anon_checked
         print(f" anonymization stability: {anon_stable}/{anon_checked} = {rate:.1%} "
@@ -250,7 +297,9 @@ async def main(args: argparse.Namespace) -> int:
     report.parent.mkdir(parents=True, exist_ok=True)
     report.write_text(json.dumps({
         "batch": args.batch, "labeled": len(todo), "agreement": dict(tags),
-        "decided": decided_n, "fleiss_kappa": kappa,
+        "decided": decided_n, "fleiss_kappa": kappa, "gwet_ac1": ac1,
+        "raw_unanimous": raw_agree,
+        "consensus_vs_human": {"agree": chair_agree, "overlap": chair_overlap},
         "anon_checked": anon_checked, "anon_stable": anon_stable,
     }, ensure_ascii=False, indent=2))
     print(f"\nreport → {report}")
-- 
2.49.1