diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 7f50f1d..87e6cd5 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -1259,11 +1259,21 @@ CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b) """ SCHEMA_V29_SQL = """ --- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified --- sample of halachot the chair/Dafna labels (is_holding / correct_type / --- quote_complete) so we can measure the extraction validators' precision/recall --- and recalibrate the auto-approve threshold. The tags are the ground truth — --- they MUST be human (no AI pre-fill) to avoid circular bias. +-- halacha_goldset (#81.7/#81.8): an evaluation set. A stratified sample of +-- halachot labeled (is_holding / correct_type / quote_complete) so we can +-- measure the extraction validators' precision/recall and recalibrate the +-- auto-approve threshold. +-- LABELING — tri-model consensus (no man-in-the-loop, chair directive +-- 2026-06-11): the ground-truth label is the MAJORITY of three independent +-- model lineages (Opus / DeepSeek / Gemini), written here with +-- tagged_by='panel:opus+deepseek+gemini'. This is NOT circular: the validators +-- being measured (#81.8, compute_quality_flags / is_fact_dependent / …) are +-- RULE-BASED heuristics, a different method family from the LLM judges. Two +-- guards keep the consensus honest: (1) a SPLIT vote (no 2/3) writes NO label +-- (is_holding stays NULL → escalates to the chair, INV-G10), and (2) the +-- anonymization probe (anon_*) re-judges with case names masked to catch +-- memorization vs genuine reasoning. A human tag (goldset_tag, tagged_by= +-- 'chair') still overrides the panel for any item. 
CREATE TABLE IF NOT EXISTS halacha_goldset ( id UUID PRIMARY KEY DEFAULT gen_random_uuid(), halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE, @@ -1286,6 +1296,22 @@ ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_is_holding BOOLEAN; ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_correct_type TEXT DEFAULT ''; ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_rationale TEXT DEFAULT ''; ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ; + +-- Tri-model consensus labeling (#81.7, chair directive 2026-06-11). ai_* above +-- holds the Opus/claude vote (lineage 1); these hold the other two lineages, +-- plus the derived consensus, the per-item agreement, and the anonymization +-- probe. The consensus (when not split) is also written into is_holding / +-- correct_type with tagged_by='panel:...' so goldset_score reads it unchanged. +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_is_holding BOOLEAN; -- DeepSeek +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_correct_type TEXT DEFAULT ''; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_rationale TEXT DEFAULT ''; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_is_holding BOOLEAN; -- Gemini +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_correct_type TEXT DEFAULT ''; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_rationale TEXT DEFAULT ''; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS panel_agreement TEXT DEFAULT ''; -- 3/3 | 2/3 | split | incomplete +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS anon_is_holding BOOLEAN; -- re-judge, case names masked +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS anon_stable BOOLEAN; -- anon verdict == consensus +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS panel_generated_at TIMESTAMPTZ; """ @@ -4837,6 +4863,9 @@ async def goldset_list(batch: str = "default") -> list[dict]: "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, " " 
async def goldset_set_panel_label(
    goldset_id: UUID, *,
    claude: dict | None, deepseek: dict | None, gemini: dict | None,
    consensus_is_holding: bool | None, consensus_type: str,
    agreement: str,
    anon_is_holding: bool | None = None, anon_stable: bool | None = None,
) -> None:
    """Store a tri-model panel labeling for one gold-set item (#81.7).

    Two UPDATEs run inside a single transaction:

    1. Always: the three per-lineage votes (claude→ai_*, deepseek→ds_*,
       gemini→gm_*), the agreement tag, the anonymization-probe result and
       the ai_generated_at / panel_generated_at timestamps. A judge passed as
       None is stored as a NULL vote with empty type and rationale.
    2. Only on a real majority (``consensus_is_holding`` is not None and
       ``agreement`` is '3/3' or '2/3'): the consensus is copied into the
       ground-truth columns (is_holding / correct_type) with
       tagged_by='panel:opus+deepseek+gemini', so ``goldset_score`` reads it
       unchanged.

    A split/incomplete vote never touches is_holding. Ground truth is only
    overwritten on rows that are untagged or were tagged by a previous panel
    run; any other tagger (tagged_by='chair' or another reviewer) is kept.

    Args:
        goldset_id: primary key of the halacha_goldset row.
        claude, deepseek, gemini: per-model dicts shaped
            ``{"is_holding": bool, "type": str, "rationale": str}``, or None
            when that judge failed.
        consensus_is_holding: majority verdict, or None when there is none.
        consensus_type: rule type to store next to the majority verdict.
        agreement: '3/3' | '2/3' | 'split' | 'incomplete'.
        anon_is_holding: consensus of the masked re-judgement, if it ran.
        anon_stable: whether the masked verdict equals the consensus.
    """
    def _vote(judge):
        # Strict on purpose: bool("false") is True and bool(None) is False, so
        # a loose bool() would turn a malformed judge reply into a real vote.
        value = judge.get("is_holding") if isinstance(judge, dict) else None
        return value if isinstance(value, bool) else None

    def _rule_type(judge):
        return str(judge.get("type") or "") if isinstance(judge, dict) else ""

    def _rationale(judge):
        # Rationales are capped at 300 characters before being stored.
        return str(judge.get("rationale") or "")[:300] if isinstance(judge, dict) else ""

    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            await conn.execute(
                "UPDATE halacha_goldset SET "
                "ai_is_holding=$2, ai_correct_type=$3, ai_rationale=$4, ai_generated_at=now(), "
                "ds_is_holding=$5, ds_correct_type=$6, ds_rationale=$7, "
                "gm_is_holding=$8, gm_correct_type=$9, gm_rationale=$10, "
                "panel_agreement=$11, anon_is_holding=$12, anon_stable=$13, "
                "panel_generated_at=now() WHERE id=$1",
                goldset_id,
                _vote(claude), _rule_type(claude), _rationale(claude),
                _vote(deepseek), _rule_type(deepseek), _rationale(deepseek),
                _vote(gemini), _rule_type(gemini), _rationale(gemini),
                agreement, anon_is_holding, anon_stable,
            )
            # Write the consensus into ground truth ONLY on a real majority.
            # The guard is an allow-list (untagged, or tagged by an earlier
            # panel run) rather than "<> 'chair'", so a label entered under
            # any other tagger name is never clobbered either.
            # NOTE(review): when a forced re-run ends in split/incomplete, an
            # older panel label stays in is_holding while panel_agreement now
            # says otherwise — confirm whether it should be cleared here.
            if consensus_is_holding is not None and agreement in ("3/3", "2/3"):
                await conn.execute(
                    "UPDATE halacha_goldset SET is_holding=$2, correct_type=$3, "
                    "tagged_by='panel:opus+deepseek+gemini', tagged_at=now() "
                    "WHERE id=$1 AND (COALESCE(tagged_by,'') = '' "
                    "OR tagged_by LIKE 'panel:%')",
                    goldset_id, consensus_is_holding, consensus_type,
                )
- A validator flag predicts "NOT a clean holding"; ground truth is + Ground truth is the ``is_holding`` column — set either by the tri-model + panel consensus (tagged_by='panel:…', #81.7) or by a human override + (tagged_by='chair'). Split-vote items stay NULL and are excluded here, so + the score reflects only items the panel (or chair) actually decided. A + validator flag predicts "NOT a clean holding"; ground truth is is_holding == false. truncated_quote is scored against quote_complete.""" items = await goldset_list(batch) labeled = [r for r in items if r.get("is_holding") is not None] diff --git a/mcp-server/tests/test_goldset_panel_consensus.py b/mcp-server/tests/test_goldset_panel_consensus.py new file mode 100644 index 0000000..58c37d9 --- /dev/null +++ b/mcp-server/tests/test_goldset_panel_consensus.py @@ -0,0 +1,104 @@ +"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set. + +Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py +(consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully +OFFLINE — no DB, no model calls. 
# ── consensus() ───────────────────────────────────────────────────────────────

@pytest.mark.parametrize("votes,expected", [
    ([True, True, True], (True, "3/3")),
    ([False, False, False], (False, "3/3")),
    ([True, True, False], (True, "2/3")),
    ([False, False, True], (False, "2/3")),
    # two judges agree while the third failed: still a majority of the panel,
    # and this path DOES write a ground-truth label
    ([True, True, None], (True, "2/3")),
    ([None, False, False], (False, "2/3")),
    ([True, False, None], (None, "split")),        # 1-1 of the two valid → chair
    ([True, None, None], (None, "incomplete")),    # only one judge → chair
    ([None, None, None], (None, "incomplete")),
    ([], (None, "incomplete")),                    # no votes at all
])
def test_consensus(votes, expected):
    assert g.consensus(votes) == expected


def test_split_writes_no_label():
    """A genuine 1-1 split must NOT yield a decision (escalates to chair, G10)."""
    decided, tag = g.consensus([True, False, None])
    assert decided is None and tag == "split"


# ── consensus_type() ──────────────────────────────────────────────────────────

def test_consensus_type_holding_majority():
    per = [{"type": "holding"}, {"type": "holding"}, {"type": "application"}]
    assert g.consensus_type(per, decided=True) == "holding"


def test_consensus_type_constrained_to_is_holding():
    """When the consensus is is_holding=False, only application/obiter types
    are eligible — an inconsistent 'holding' vote is ignored."""
    per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}]
    out = g.consensus_type(per, decided=False)
    assert out in {"application", "obiter"}


def test_consensus_type_undecided_is_blank():
    per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}]
    assert g.consensus_type(per, decided=None) == ""


def test_consensus_type_no_consistent_vote_is_blank():
    """Failed judges and types that contradict the verdict leave no candidate."""
    per = [None, {"type": "obiter"}, {"type": ""}]
    assert g.consensus_type(per, decided=True) == ""


# ── fleiss_kappa() ────────────────────────────────────────────────────────────

def test_fleiss_kappa_perfect_agreement():
    # every item rated 3/0 or 0/3 → κ == 1.0
    rows = [(3, 0), (3, 0), (0, 3), (0, 3)]
    assert g.fleiss_kappa(rows) == pytest.approx(1.0)


def test_fleiss_kappa_disagreement_is_low():
    rows = [(2, 1), (1, 2)]
    k = g.fleiss_kappa(rows)
    assert k is not None and k < 0.0  # worse than chance


def test_fleiss_kappa_ragged_returns_none():
    # mixed rater counts (3 then 2) is not well-defined → None
    assert g.fleiss_kappa([(3, 0), (1, 1)]) is None


def test_fleiss_kappa_empty_returns_none():
    assert g.fleiss_kappa([]) is None


# ── anonymize() ───────────────────────────────────────────────────────────────

def test_anonymize_masks_case_number_and_name():
    text = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
    out = g.anonymize(text, case_number="8125-09-24", case_name='פלוני בע"מ')
    assert "8125-09-24" not in out
    assert 'פלוני בע"מ' not in out
    assert g._FAKE_CASE in out
    assert g._FAKE_NAME in out


def test_anonymize_masks_slash_variant():
    """The same case written with '/' separators is masked as well."""
    out = g.anonymize("(תיק 9001/01/20)", case_number="9001-01-20", case_name=None)
    assert "9001/01/20" not in out
    assert g._FAKE_CASE in out


def test_anonymize_no_identifiers_is_noop():
    text = "כלל משפטי כללי ללא מזהים."
    assert g.anonymize(text, case_number=None, case_name=None) == text


def test_anonymize_preserves_legal_substance():
    """Masking swaps only the identifier — the rule text is untouched."""
    text = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
    out = g.anonymize(text, case_number="9001-01-20", case_name=None)
    assert "מיצוי הליכים הוא תנאי-סף" in out
    assert "9001-01-20" not in out
מאתר כל `case_law` ש-`strip_nevo_preamble(full_text)` עדיין מקצר (דליפה היסטורית), ומבצע: (1) לכידת ה-מיני-רציו ל-`case_law.nevo_ratio` (gold-set ל-#86.3); (2) שכתוב `full_text` החתוך + חישוב-מחדש של `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ללא re-OCR/LLM); (4) **סימון (לא מחיקה)** הלכות ש-`supporting_quote` שלהן בתוך ה-preamble שהוסר → `pending_review` + quality_flag `nevo_preamble_leak`. **שומר-בטיחות:** שורות עם keep%<`--min-keep` (ברירת-מחדל 60) מוחרגות מ-`--apply` כחשד over-strip (אלא אם `--include-suspicious`). **dry-run כברירת-מחדל**; `--apply` כותב backup JSON + manifest CSV ל-`data/audit/` תחילה. idempotent. רץ עם venv של mcp-server. **chair-gated** (לאמת manifest לפני apply) | מיגרציית-נתונים — dry-run בוצע (19 פסקים, 27 הלכות מזוהמות); apply ממתין לאישור | | `nevo_ratio_benchmark.py` | python | **#86.3** — מדידת איכות חילוץ-הלכות מול ה-מיני-רציו של נבו (gold-set מקצועי חינמי). לכל פסק עם `nevo_ratio` (או נגזר מ-`full_text` אם טרם בוצע backfill): LLM-judge מקומי (`claude_session`, אפס עלות) ממפה סמנטית את הלכות-המערכת מול הלכות-נבו ומפיק **recall** (כיסוי הלכות-נבו), **precision** (אחוז הלכותינו הממופות), **granularity** (יחס פירוק — איתות over-extraction ל-#81.5). `--case ` / `--all [--limit N]` / `--model` / `--out`. כותב CSV ל-`data/audit/`. רץ עם venv של mcp-server (דורש Claude CLI מקומי). אומת על בג"ץ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ידני — מדידת-איכות (CI/ad-hoc) | | `halacha_goldset.py` | python | **#81.7** — הארנס gold-set לאיכות חילוץ-הלכות. `export --n N` מייצא מדגם מרובד (לפי precedent×rule_type) ל-CSV עם עמודות-תיוג ריקות (`is_holding`/`correct_type`/`quote_complete`) לתיוג ידני (חיים/דפנה). `score --in ` קורא את ה-CSV המתויג ומודד כל ולידטור (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) מול אמת-המידה האנושית: P/R/F1 + confusion. בסיס ל-#81.8 (כיול סף האישור). מייבא את אותם ולידטורים שה-extractor מריץ. רץ עם venv של mcp-server. 
**הערה:** קיים גם דף-תיוג אינטראקטיבי DB-backed (`/goldset`) — זה ה-CSV-fallback | ידני — export→תיוג→score | -| `goldset_ai_recommend.py` | python | **#81.7 QA** — מייצר **חוות-דעת-AI שנייה** (claude מקומי, אפס עלות) לכל פריט ב-`halacha_goldset`: `is_holding`+`type`+נימוק, נשמר ב-`ai_*` ומוצג בדף לצד התיוג האנושי לזיהוי אי-הסכמות. **עצמאי** מהוולידטורים שנמדדים (אין מעגליות) ו**לא** מוחל אוטומטית. `--force` (חידוש)/`--limit N`. **חובה מקומי** (claude_session). | ידני — לאחר יצירת/הרחבת batch | +| `goldset_panel_label.py` | python | **#81.7 — תיוג ה-gold-set בקונצנזוס תלת-מודלי (ללא man-in-the-loop, הנחיית-יו"ר 2026-06-11).** מריץ את שלושת השופטים העצמאיים (Opus/claude_session · DeepSeek · Gemini, מיובאים מ-`halacha_panel_approve`) עם ה-prompt העשיר (`is_holding`+`type`+נימוק מ-`goldset_ai_recommend`) על כל פריט; **רוב 2/3 נכתב ל-`is_holding`/`correct_type`** עם `tagged_by='panel:opus+deepseek+gemini'` (פיצול→NULL→יו"ר, INV-G10). מודד **Fleiss κ** (3 מעריכים) ומריץ **מבחן-אנונימיזציה** (שמות-תיק ממוסכים→שיפוט-מחדש; flip=שינון). לא מעגלי — הוולידטורים הנמדדים rule-based. כותב per-model+consensus+anon ל-DB ודוח ל-`data/audit/`. **מחליף** תיוג-ידני; `goldset_ai_recommend`/`goldset_independent_judge` נשארים כבדיקות single-model. `--limit`/`--no-anon`/`--force`. **חובה מקומי**. | ידני — לאחר יצירת/הרחבת batch | +| `goldset_ai_recommend.py` | python | **#81.7 QA (single-model, נבלע ב-panel)** — חוות-דעת claude בלבד ל-`ai_*`. כעת לינאז' 1/3 בתוך `goldset_panel_label`; נשאר כבדיקת-claude עצמאית/חידוש נקודתי. `--force`/`--limit`. **חובה מקומי**. | ידני — בדיקה נקודתית | | `goldset_independent_judge.py` | python | **INV-DM7 ולידציה** — שופט-תפקיד **עצמאי שני** ממודל אחר (DeepSeek API ישיר, OpenAI-compatible) ששובר את עיגון-ה-AI: מסווג rule_role **בעיוור** (בלי לראות תיוג-אדם או המלצת-claude) ומחשב מטריצת-הסכמה (deepseek↔אדם מול ai↔אדם) + ציר-גס (כלל-בר-הכללה מול application/obiter). 
**ממצא (2026-06-07):** ai↔אדם=100% (מעוגן), deepseek↔אדם=50% מדויק אך **92% גס** → תת-הסוג holding/interpretive/procedural עמום-מטבעו (לא לשער עליו); הציר-הגס אמין חוצה-מודלים. read-only על הזהב. `--model`/`--limit`/`--concurrency`. מפתח מ-`~/.hermes/profiles/deepseek/.env`. raw→`/tmp/goldset_judge_raw.json`. | ידני — ולידציית אמינות-תוויות | | `halacha_panel_approve.py` | python | **פאנל-אישור הלכות (Trust-or-Escalate, dry-run).** 3 שופטים בלתי-תלויי-לינאז' (Opus/claude_session · DeepSeek · Gemini-2.5-flash) מצביעים על ה**ציר-הגס האמין** (92% חוצה-מודלים): נקיות→"הלכה לשמירה?"; nli_unsupported→"הציטוט תומך בכלל?" (שיפוט-מחדש); פגומות→re-extraction. רק ורדיקט מוסכם פועל אוטומטית, **פיצול מסלים ליו"ר** (INV-G10). `--apply` **מחווט** (clean: רוב 2/3; nli: פה-אחד-entailed מנקה flag) — הפיך, מגבה ל-`data/audit/` קודם. מפתחות: DeepSeek מ-`~/.hermes/...`, Gemini מ-`~/.env`. **חובה מקומי**. dry-run 2026-06-07: 197→103 אוטו (פה-אחד) / ~15 (רוב). | ידני / שלב-אימות-הלכות במסלול-הסופי | | `style_lesson_panel.py` | python | **פאנל-סגנון דו-סוכני (למידה כפולה).** על-גבי דיסטילציית-ה-Opus (draft↔final ב-`draft_final_pairs.analysis`), שני שופטים בלתי-תלויים — DeepSeek + Gemini-2.5-flash — מצביעים לכל לקח על השאלה הגסה "האם זו הנחיית-סגנון מופשטת ובת-הכללה (INV-LRN5 — קול ולא מהות)?". הסכמה 2/2-keep → נכתב כ-`decision_lesson` (`source=panel:deepseek+gemini`); 2/2-drop → לא נכתב; פיצול/substance → מוסלם ליו"ר. `--apply` הפיך, מגבה ל-`data/audit/`. הטמעה ל-SKILL.md/lessons.md נשארת שער-יו"ר ידני (INV-G10). מפתחות כמו פאנל-ההלכות. **חובה מקומי**. `--case ` / `--pair-id `. | שלב-למידה במסלול-הסופי | diff --git a/scripts/goldset_panel_label.py b/scripts/goldset_panel_label.py new file mode 100644 index 0000000..ce2b761 --- /dev/null +++ b/scripts/goldset_panel_label.py @@ -0,0 +1,270 @@ +#!/usr/bin/env python3 +"""#81.7 — label the halacha gold-set by TRI-MODEL CONSENSUS (no man-in-the-loop). 
+ +Chair directive (2026-06-11): replace manual chair/Dafna tagging of the gold-set +with the agreement of three INDEPENDENT model lineages. This is the same panel +the live approval triage uses (``halacha_panel_approve.py``), proven to agree on +the coarse "is this a real, keepable rule?" axis across models (92%): + + - claude (Opus via claude_session — local CLI, zero marginal cost) [Anthropic] + - deepseek (api.deepseek.com) [DeepSeek] + - gemini (gemini-2.5-flash) [Google] + +Why this is NOT circular: the validators measured downstream (#81.8 — +compute_quality_flags / is_fact_dependent / is_quote_truncated / +is_thin_restatement) are RULE-BASED heuristics, a different method family from +the LLM judges. Two honesty guards: + 1. SPLIT vote (no 2/3 agreement) writes NO ground-truth label — the item + stays NULL and escalates to the chair (INV-G10). + 2. ANONYMIZATION probe — every item is re-judged with the case identifier + masked/faked; if the consensus flips, the verdict was keying on the + identifier (memorization), not the legal reasoning. Reported as a + stability rate (arXiv:2505.02172). + +Reuses the model callers from halacha_panel_approve and the rich is_holding+type +prompt from goldset_ai_recommend — single source, no parallel path (G2). 
# ── consensus aggregation (pure — unit-tested) ────────────────────────────────

def consensus(votes: list[bool | None]) -> tuple[bool | None, str]:
    """Majority of the (up to three) is_holding votes.

    ``None`` entries are judges that produced no usable vote and are ignored.

    Returns ``(consensus_bool_or_None, agreement_tag)`` where the tag is:

    - 'incomplete' — fewer than two usable votes; no decision.
    - 'split'      — usable votes tie; no decision.
    - '3/3'        — exactly three usable votes, all equal.
    - '2/3'        — any other majority (2-1, or 2-0 with one judge missing).

    'split' and 'incomplete' return None as the decision (→ chair).
    """
    cast = [vote for vote in votes if vote is not None]
    if len(cast) < 2:
        return None, "incomplete"
    yes = sum(1 for vote in cast if vote)
    no = len(cast) - yes
    if yes == no:
        return None, "split"
    unanimous = len(cast) == 3 and 0 in (yes, no)
    return yes > no, ("3/3" if unanimous else "2/3")


def consensus_type(per_model: list[dict | None], decided: bool | None) -> str:
    """Most common rule type among the votes that agree with the verdict.

    Only types consistent with the is_holding consensus are counted
    (holding/interpretive/procedural ↔ True; application/obiter ↔ False);
    failed judges (None) and inconsistent or empty types are ignored.

    Returns '' when ``decided`` is None or no judge offered a consistent type.
    A tie between consistent types is broken deterministically in favour of
    the type that appears first in ``per_model`` order.
    """
    if decided is None:
        return ""
    allowed = (
        {"holding", "interpretive", "procedural"} if decided
        else {"application", "obiter"}
    )
    # Normalise each vote's type once, then keep only the consistent ones.
    offered = (str(vote.get("type") or "") for vote in per_model if isinstance(vote, dict))
    tally = Counter(rule_type for rule_type in offered if rule_type in allowed)
    if not tally:
        return ""
    # most_common keeps first-encountered order among equal counts, which is
    # what makes the tie-break above deterministic.
    return tally.most_common(1)[0][0]
def fleiss_kappa(rows: list[tuple[int, int]]) -> float | None:
    """Fleiss' kappa for binary ratings given as (yes_count, no_count) per item.

    Items with no ratings at all are dropped. Every remaining item must have
    the same number of raters (at least two); otherwise the statistic is not
    well-defined and None is returned. None is also returned for empty input.
    When every rating falls in a single category the expected agreement is 1
    and the function returns 1.0 by convention instead of dividing by zero.
    """
    rated = [(yes, no) for yes, no in rows if yes + no > 0]
    if not rated:
        return None
    raters = sum(rated[0])
    if raters < 2 or any(yes + no != raters for yes, no in rated):
        return None  # ragged rater counts — not well-defined
    # Observed agreement: mean share of agreeing rater pairs within each item.
    pairs = raters * (raters - 1)
    per_item = [(yes * (yes - 1) + no * (no - 1)) / pairs for yes, no in rated]
    observed = sum(per_item) / len(rated)
    # Expected agreement from the marginal proportion of each category.
    total = len(rated) * raters
    p_yes = sum(yes for yes, _ in rated) / total
    p_no = sum(no for _, no in rated) / total
    expected = p_yes ** 2 + p_no ** 2
    if expected >= 1.0:
        return 1.0  # degenerate (all one category) → perfect by convention
    return (observed - expected) / (1 - expected)


# ── anonymization probe (pure — unit-tested) ──────────────────────────────────

_FAKE_CASE = "12345-67-89"
_FAKE_NAME = "פלוני נ' אלמוני"

_CASE_SEPARATOR = re.compile(r"[-/]")


def anonymize(text: str, case_number: str | None, case_name: str | None) -> str:
    """Mask the case identifier so the model can't key on a memorized case.

    The case number is matched with '-' and '/' treated as interchangeable,
    whichever of the two the stored identifier uses, and only where it is not
    glued to further digits (so a different, longer number is left alone).
    Every match becomes ``_FAKE_CASE``. The case name is replaced literally
    with ``_FAKE_NAME``. All other text is returned untouched.

    A missing identifier, or one with no letter or digit in it, is skipped.
    """
    masked = text
    ident = (case_number or "").strip()
    # An identifier without any letter/digit (e.g. "-") would compile to a
    # pattern that rewrites ordinary punctuation, so it is ignored.
    if any(ch.isalnum() for ch in ident):
        pieces = (re.escape(piece) for piece in _CASE_SEPARATOR.split(ident))
        pattern = r"(?<!\d)" + "[-/]".join(pieces) + r"(?!\d)"
        masked = re.sub(pattern, _FAKE_CASE, masked)
    if case_name:
        masked = masked.replace(case_name, _FAKE_NAME)
    return masked
# ── one panel pass over a single item ─────────────────────────────────────────

def _as_vote(value) -> bool | None:
    """Interpret a judge's is_holding strictly; None when it is not boolean-like.

    A plain ``bool()`` would read the JSON string "false" as True and a JSON
    null as False, silently turning a malformed reply into a real vote.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        lowered = value.strip().lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
    return None


def _parse(d: dict | None) -> dict | None:
    """Normalise one judge's raw reply into ``{is_holding, type, rationale}``.

    Returns None (no vote) when the reply is not a dict or carries no usable
    boolean is_holding. The type is stripped and blanked unless it is one of
    VALID_TYPES; the rationale is capped at 300 characters.
    """
    if not isinstance(d, dict):
        return None
    vote = _as_vote(d.get("is_holding"))
    if vote is None:
        return None
    rule_type = str(d.get("type") or "").strip()
    return {
        "is_holding": vote,
        "type": rule_type if rule_type in VALID_TYPES else "",
        "rationale": str(d.get("rationale") or "")[:300],
    }


async def _judge_claude(user: str) -> dict | None:
    """Ask the local claude session for a verdict; None when the call fails.

    Stays best-effort (a failed judge just casts no vote), but the failure is
    reported on stderr so a broken CLI does not go unnoticed across a run.
    """
    try:
        return await claude_session.query_json(user, system=SYSTEM, effort="low")
    except Exception as exc:  # noqa: BLE001
        print(f"claude judge failed: {exc!r}", file=sys.stderr, flush=True)
        return None


async def panel_pass(client: httpx.AsyncClient, user: str) -> tuple[list[dict | None], bool | None, str]:
    """Run the three judges on one prompt; return (per_model, consensus, tag).

    The judges run concurrently. ``per_model`` is ordered
    [claude, deepseek, gemini], each entry the normalised vote or None.
    """
    raw_claude, raw_deepseek, raw_gemini = await asyncio.gather(
        _judge_claude(user),
        judge_deepseek(client, SYSTEM, user),
        judge_gemini(client, SYSTEM, user),
    )
    per = [_parse(raw_claude), _parse(raw_deepseek), _parse(raw_gemini)]
    decided, tag = consensus([vote["is_holding"] if vote else None for vote in per])
    return per, decided, tag
async def main(args: argparse.Namespace) -> int:
    """Label one gold-set batch by panel consensus and print/write a report.

    Loads the batch via ``db.goldset_list``, skips items that already carry
    ``panel_generated_at`` unless ``--force``, and honours ``--limit``. Each
    item is judged by the panel; when a decision was reached and ``--no-anon``
    is not set, it is judged again with the case identifiers masked. The
    result is stored through ``db.goldset_set_panel_label``. Items are
    processed in batches of ``--concurrency`` (at least 1).

    Afterwards prints the agreement breakdown, Fleiss' κ over the items all
    three judges voted on, and the anonymization stability rate, then writes
    a JSON summary under ``data/audit/``. Returns 0.
    """
    # The key status is decided inside the panel module, not here — so the
    # banner only points there instead of printing a constant "True".
    print(f"keys — deepseek/gemini:(see panel) · claude:local · anon:{not args.no_anon}\n",
          flush=True)
    items = await db.goldset_list(args.batch)
    todo = [it for it in items if args.force or not it.get("panel_generated_at")]
    if args.limit:
        todo = todo[: args.limit]
    print(f"gold-set '{args.batch}': {len(items)} items, {len(todo)} to label by panel", flush=True)

    # range() rejects a zero step, so clamp a nonsensical --concurrency.
    batch_size = max(1, args.concurrency)
    tags: Counter = Counter()
    kappa_rows: list[tuple[int, int]] = []
    anon_checked = anon_stable = 0

    async with httpx.AsyncClient() as client:
        async def run(i: int, it: dict) -> None:
            nonlocal anon_checked, anon_stable
            user = _prompt(it)
            per, decided, tag = await panel_pass(client, user)

            anon_hold = anon_st = None
            if not args.no_anon and decided is not None:
                anon_user = anonymize(user, it.get("case_number"), it.get("case_name"))
                _, anon_decided, _ = await panel_pass(client, anon_user)
                # An undecided masked pass is recorded as "not checked".
                if anon_decided is not None:
                    anon_hold = anon_decided
                    anon_st = (anon_decided == decided)
                    anon_checked += 1
                    anon_stable += int(anon_st)

            ctype = consensus_type(per, decided)
            await db.goldset_set_panel_label(
                UUID(str(it["id"])),
                claude=per[0], deepseek=per[1], gemini=per[2],
                consensus_is_holding=decided, consensus_type=ctype,
                agreement=tag, anon_is_holding=anon_hold, anon_stable=anon_st,
            )
            tags[tag] += 1
            # κ counts only items all three judged
            voted = [m for m in per if m is not None]
            if len(voted) == 3:
                yes = sum(1 for m in voted if m["is_holding"])
                kappa_rows.append((yes, 3 - yes))
            mark = {"3/3": "✓✓✓", "2/3": "✓✓", "split": "⚖", "incomplete": "…"}[tag]
            astr = "" if anon_st is None else (" anon✓" if anon_st else " anon✗FLIP")
            print(f"[{i}/{len(todo)}] {it.get('case_number')}: {mark} {tag} "
                  f"→ {decided}/{ctype}{astr}", flush=True)

        # Coroutines are created per batch, right before they are awaited, so
        # an exception that aborts the run leaves no never-awaited coroutines
        # behind. The batch size itself bounds the concurrency.
        for start in range(0, len(todo), batch_size):
            chunk = todo[start : start + batch_size]
            await asyncio.gather(
                *(run(start + offset, it) for offset, it in enumerate(chunk, 1))
            )

    kappa = fleiss_kappa(kappa_rows)
    decided_n = tags["3/3"] + tags["2/3"]
    print("\n" + "=" * 60)
    print(f"PANEL LABELING — gold-set '{args.batch}'")
    print("=" * 60)
    print(f" 3/3 unanimous : {tags['3/3']}")
    print(f" 2/3 majority : {tags['2/3']}")
    print(f" ⚖ split→chair : {tags['split']}")
    print(f" … incomplete : {tags['incomplete']}")
    print(f" DECIDED (labels written): {decided_n}/{len(todo)}")
    if kappa is not None:
        interp = ("almost-perfect" if kappa >= 0.8 else "substantial" if kappa >= 0.6
                  else "moderate" if kappa >= 0.4 else "fair/poor")
        print(f" Fleiss κ (3 raters, is_holding, n={len(kappa_rows)}): {kappa:.3f} ({interp})")
    if anon_checked:
        rate = anon_stable / anon_checked
        print(f" anonymization stability: {anon_stable}/{anon_checked} = {rate:.1%} "
              f"({'robust' if rate >= 0.9 else 'CHECK memorization'})")

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    report = Path(__file__).resolve().parent.parent / "data" / "audit" / f"goldset-panel-{args.batch}-{ts}.json"
    report.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False can emit non-ASCII text, so the encoding is pinned
    # instead of depending on the platform default.
    report.write_text(json.dumps({
        "batch": args.batch, "labeled": len(todo), "agreement": dict(tags),
        "decided": decided_n, "fleiss_kappa": kappa,
        "anon_checked": anon_checked, "anon_stable": anon_stable,
    }, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\nreport → {report}")
    print("next: .venv/bin/python ../scripts/halacha_goldset.py score "
          "(measures validators vs the consensus labels — #81.8)")
    return 0


if __name__ == "__main__":
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--batch", default="default")
    ap.add_argument("--force", action="store_true", help="re-label even if already paneled")
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--concurrency", type=int, default=4)
    ap.add_argument("--no-anon", action="store_true", help="skip the anonymization probe")
    raise SystemExit(asyncio.run(main(ap.parse_args())))