Merge pull request 'fix(halacha): #81.7 — Gwet AC1 + consensus-vs-human (פרדוקס-הקאפא תחת הטיה)' (#189) from worktree-goldset-ac1-agreement into main
This commit was merged in pull request #189.
This commit is contained in:
@@ -81,6 +81,31 @@ def test_fleiss_kappa_empty_returns_none():
|
||||
assert g.fleiss_kappa([]) is None
|
||||
|
||||
|
||||
# ── gwet_ac1() ────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_gwet_ac1_perfect_agreement():
    """Every item rated unanimously (two all-yes, two all-no) gives AC1 of 1."""
    unanimous = [(3, 0)] * 2 + [(0, 3)] * 2
    score = g.gwet_ac1(unanimous)
    assert score == pytest.approx(1.0)
|
||||
|
||||
|
||||
def test_gwet_ac1_resolves_the_kappa_paradox():
    """Contrast AC1 with Fleiss kappa on a heavily skewed marginal.

    Nine items are unanimous-yes and one is a 2-vs-1 split, so observed
    agreement is about 93%. With nearly every rating in one category Fleiss
    kappa lands near zero, whereas AC1 stays above 0.9.
    """
    unanimous_yes = [(3, 0) for _ in range(9)]
    one_split = [(2, 1)]
    skewed = unanimous_yes + one_split

    fleiss = g.fleiss_kappa(skewed)
    gwet = g.gwet_ac1(skewed)

    # kappa paradox: the coefficient is near zero despite high agreement
    assert abs(fleiss) < 0.1
    # AC1 reports almost-perfect agreement, matching the raw rate
    assert gwet > 0.9
    # and AC1 is strictly the larger of the two under this skew
    assert gwet > fleiss
|
||||
|
||||
|
||||
def test_gwet_ac1_ragged_and_empty_return_none():
    """Unequal per-item rater counts, or no rows at all, yield None."""
    ragged = [(3, 0), (1, 1)]  # 3 raters on one item, 2 on the other
    empty = []
    for unusable in (ragged, empty):
        assert g.gwet_ac1(unusable) is None
|
||||
|
||||
|
||||
# ── anonymize() ───────────────────────────────────────────────────────────────
|
||||
|
||||
def test_anonymize_masks_case_number_and_name():
|
||||
|
||||
@@ -119,6 +119,32 @@ def fleiss_kappa(rows: list[tuple[int, int]]) -> float | None:
|
||||
return (Pbar - Pe) / (1 - Pe)
|
||||
|
||||
|
||||
def gwet_ac1(rows: list[tuple[int, int]]) -> float | None:
|
||||
"""Gwet's AC1 for binary ratings (yes_count, no_count) per item.
|
||||
|
||||
Reported ALONGSIDE Fleiss' κ because κ suffers the "kappa paradox": under a
|
||||
skewed marginal (here almost every halacha is is_holding=True) κ collapses
|
||||
toward 0 even at very high observed agreement, since chance-agreement Pe is
|
||||
also near 1. AC1 estimates chance agreement under maximum uncertainty, so it
|
||||
is robust to prevalence and tracks the true observed agreement (Gwet 2008;
|
||||
Feinstein & Cicchetti 1990, "high agreement, low kappa"). Same Pa as Fleiss;
|
||||
only the chance term differs."""
|
||||
rows = [(y, n) for (y, n) in rows if (y + n) > 0]
|
||||
N = len(rows)
|
||||
if N == 0:
|
||||
return None
|
||||
n = rows[0][0] + rows[0][1]
|
||||
if n < 2 or any((y + n_) != n for (y, n_) in rows):
|
||||
return None
|
||||
Pa = sum((y * (y - 1) + nn * (nn - 1)) / (n * (n - 1)) for (y, nn) in rows) / N
|
||||
p_yes = sum(y for (y, _) in rows) / (N * n)
|
||||
p_no = 1.0 - p_yes
|
||||
Pe = 2 * p_yes * p_no # q=2 categories: (1/(q-1))·Σ π_k(1-π_k) = 2·p·(1-p)
|
||||
if Pe >= 1.0:
|
||||
return 1.0
|
||||
return (Pa - Pe) / (1 - Pe)
|
||||
|
||||
|
||||
# ── anonymization probe (pure — unit-tested) ──────────────────────────────────
|
||||
|
||||
_FAKE_CASE = "12345-67-89"
|
||||
@@ -186,10 +212,11 @@ async def main(args: argparse.Namespace) -> int:
|
||||
tags: Counter = Counter()
|
||||
kappa_rows: list[tuple[int, int]] = []
|
||||
anon_checked = anon_stable = 0
|
||||
chair_overlap = chair_agree = 0 # consensus vs existing human label (external validity)
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
async def run(i: int, it: dict) -> None:
|
||||
nonlocal anon_checked, anon_stable
|
||||
nonlocal anon_checked, anon_stable, chair_overlap, chair_agree
|
||||
async with sem:
|
||||
user = _prompt(it)
|
||||
per, decided, tag = await panel_pass(client, user)
|
||||
@@ -217,6 +244,11 @@ async def main(args: argparse.Namespace) -> int:
|
||||
if len(nv) == 3:
|
||||
y = sum(1 for m in nv if m["is_holding"])
|
||||
kappa_rows.append((y, 3 - y))
|
||||
# external validity: where a human already labeled, does consensus match?
|
||||
if (it.get("tagged_by") == "chair" and it.get("is_holding") is not None
|
||||
and decided is not None):
|
||||
chair_overlap += 1
|
||||
chair_agree += int(decided == it["is_holding"])
|
||||
mark = {"3/3": "✓✓✓", "2/3": "✓✓", "split": "⚖", "incomplete": "…"}[tag]
|
||||
astr = "" if anon_st is None else (" anon✓" if anon_st else " anon✗FLIP")
|
||||
print(f"[{i}/{len(todo)}] {it.get('case_number')}: {mark} {tag} "
|
||||
@@ -227,7 +259,15 @@ async def main(args: argparse.Namespace) -> int:
|
||||
await asyncio.gather(*tasks[j : j + args.concurrency])
|
||||
|
||||
kappa = fleiss_kappa(kappa_rows)
|
||||
ac1 = gwet_ac1(kappa_rows)
|
||||
raw_agree = (sum(1 for (y, n) in kappa_rows if y == 0 or n == 0) / len(kappa_rows)
|
||||
if kappa_rows else None) # share of items with unanimous 3/3
|
||||
decided_n = tags["3/3"] + tags["2/3"]
|
||||
|
||||
def interp(k: float) -> str:
    """Translate an agreement coefficient into its qualitative band label."""
    # Bands are checked from the highest floor down; first match wins.
    bands = (
        (0.8, "almost-perfect"),
        (0.6, "substantial"),
        (0.4, "moderate"),
    )
    for floor, label in bands:
        if k >= floor:
            return label
    return "fair/poor"
|
||||
|
||||
print("\n" + "=" * 60)
|
||||
print(f"PANEL LABELING — gold-set '{args.batch}'")
|
||||
print("=" * 60)
|
||||
@@ -236,10 +276,17 @@ async def main(args: argparse.Namespace) -> int:
|
||||
print(f" ⚖ split→chair : {tags['split']}")
|
||||
print(f" … incomplete : {tags['incomplete']}")
|
||||
print(f" DECIDED (labels written): {decided_n}/{len(todo)}")
|
||||
if kappa is not None:
|
||||
interp = ("almost-perfect" if kappa >= 0.8 else "substantial" if kappa >= 0.6
|
||||
else "moderate" if kappa >= 0.4 else "fair/poor")
|
||||
print(f" Fleiss κ (3 raters, is_holding, n={len(kappa_rows)}): {kappa:.3f} ({interp})")
|
||||
if kappa_rows:
|
||||
# Report AC1 as the headline agreement metric: the is_holding marginal is
|
||||
# heavily skewed (≈all True), where Fleiss κ hits the "kappa paradox"
|
||||
# (high agreement, near-zero κ). AC1 is prevalence-robust.
|
||||
print(f" inter-model agreement (n={len(kappa_rows)}, 3 raters, is_holding):")
|
||||
print(f" Gwet AC1 : {ac1:.3f} ({interp(ac1)}) ← headline (skew-robust)")
|
||||
print(f" Fleiss κ : {kappa:.3f} ({interp(kappa)}) [paradox under skew — see code]")
|
||||
print(f" raw 3/3 : {raw_agree:.1%} unanimous")
|
||||
if chair_overlap:
|
||||
print(f" consensus vs HUMAN labels (external validity): "
|
||||
f"{chair_agree}/{chair_overlap} = {chair_agree / chair_overlap:.1%}")
|
||||
if anon_checked:
|
||||
rate = anon_stable / anon_checked
|
||||
print(f" anonymization stability: {anon_stable}/{anon_checked} = {rate:.1%} "
|
||||
@@ -250,7 +297,9 @@ async def main(args: argparse.Namespace) -> int:
|
||||
report.parent.mkdir(parents=True, exist_ok=True)
|
||||
report.write_text(json.dumps({
|
||||
"batch": args.batch, "labeled": len(todo), "agreement": dict(tags),
|
||||
"decided": decided_n, "fleiss_kappa": kappa,
|
||||
"decided": decided_n, "fleiss_kappa": kappa, "gwet_ac1": ac1,
|
||||
"raw_unanimous": raw_agree,
|
||||
"consensus_vs_human": {"agree": chair_agree, "overlap": chair_overlap},
|
||||
"anon_checked": anon_checked, "anon_stable": anon_stable,
|
||||
}, ensure_ascii=False, indent=2))
|
||||
print(f"\nreport → {report}")
|
||||
|
||||
Reference in New Issue
Block a user