feat(halacha): #81.7 — gold-set labeled by tri-model consensus (Opus+DeepSeek+Gemini)
All checks were successful
G12 Leak-Guard / leak-guard (pull_request) Successful in 6s

מבטל את ה-man-in-the-loop בתיוג ה-gold-set (הנחיית-יו"ר 2026-06-11): במקום תיוג ידני
של חיים/דפנה, אמת-המידה נקבעת בקונצנזוס שלוש שושלות-מודל עצמאיות — אותו פאנל שמערכת
האישור החיה כבר משתמשת בו (halacha_panel_approve), עם 92% הסכמה חוצת-מודלים על הציר הגס.

למה לא מעגלי: הוולידטורים הנמדדים ב-#81.8 (compute_quality_flags / is_fact_dependent /
is_quote_truncated / is_thin_restatement) הם היוריסטיקות **rule-based** — משפחת-שיטה שונה
מה-LLM-judges. שני שומרי-יושר: (1) פיצול-קולות (אין רוב 2/3) לא כותב לייבל — הפריט נשאר
NULL ומוסלם ליו"ר (INV-G10); (2) מבחן-אנונימיזציה — שיפוט-מחדש עם מזהה-התיק ממוסך, flip
בקונצנזוס = שינון ולא הנמקה (arXiv:2505.02172).

- db.py: עמודות per-lineage (ds_*/gm_*; ai_*=claude קיים) + consensus/agreement/anon +
  goldset_set_panel_label() שכותב רוב-2/3 ל-is_holding/correct_type (tagged_by='panel:…',
  לא דורס tagged_by='chair'). goldset_score נשאר ללא שינוי — קורא is_holding (G2, אין מסלול
  ניקוד מקביל). עדכון הערת-הסכמה (בוטלה דרישת "MUST be human").
- scripts/goldset_panel_label.py: 3 שופטים (מיובאים מ-halacha_panel_approve, מקור-אמת יחיד)
  + prompt עשיר (מיובא מ-goldset_ai_recommend) + Fleiss κ + מבחן-אנונימיזציה. דוח→data/audit/.
- SCRIPTS.md: סקריפט חדש; goldset_ai_recommend/independent_judge מסומנים single-model נבלעים.

invariants: G2 (שופטים+prompt מיובאים, אין כפילות; ניקוד יחיד) · INV-G10 (פיצול→יו"ר) ·
INV-LRN2/LRN3 (איכות-במקור, לכידה מובנית). מקור: PoLL · Trust-or-Escalate (ICLR 2025) · arXiv:2505.02172.
tests: 18 offline (consensus/type/Fleiss-κ/anonymize). live labeling = צעד תפעולי אחרי deploy.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-11 16:03:32 +00:00
parent 369755c350
commit 5b001bbd9d
4 changed files with 474 additions and 8 deletions

View File

@@ -1259,11 +1259,21 @@ CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b)
"""
SCHEMA_V29_SQL = """
-- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified
-- sample of halachot the chair/Dafna labels (is_holding / correct_type /
-- quote_complete) so we can measure the extraction validators' precision/recall
-- and recalibrate the auto-approve threshold. The tags are the ground truth —
-- they MUST be human (no AI pre-fill) to avoid circular bias.
-- halacha_goldset (#81.7/#81.8): an evaluation set. A stratified sample of
-- halachot labeled (is_holding / correct_type / quote_complete) so we can
-- measure the extraction validators' precision/recall and recalibrate the
-- auto-approve threshold.
-- LABELING — tri-model consensus (no man-in-the-loop, chair directive
-- 2026-06-11): the ground-truth label is the MAJORITY of three independent
-- model lineages (Opus / DeepSeek / Gemini), written here with
-- tagged_by='panel:opus+deepseek+gemini'. This is NOT circular: the validators
-- being measured (#81.8, compute_quality_flags / is_fact_dependent / …) are
-- RULE-BASED heuristics, a different method family from the LLM judges. Two
-- guards keep the consensus honest: (1) a SPLIT vote (no 2/3) writes NO label
-- (is_holding stays NULL → escalates to the chair, INV-G10), and (2) the
-- anonymization probe (anon_*) re-judges with case names masked to catch
-- memorization vs genuine reasoning. A human tag (goldset_tag, tagged_by=
-- 'chair') still overrides the panel for any item.
CREATE TABLE IF NOT EXISTS halacha_goldset (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE,
@@ -1286,6 +1296,22 @@ ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_is_holding BOOLEAN;
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_correct_type TEXT DEFAULT '';
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_rationale TEXT DEFAULT '';
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ;
-- Tri-model consensus labeling (#81.7, chair directive 2026-06-11). ai_* above
-- holds the Opus/claude vote (lineage 1); these hold the other two lineages,
-- plus the derived consensus, the per-item agreement, and the anonymization
-- probe. The consensus (when not split) is also written into is_holding /
-- correct_type with tagged_by='panel:...' so goldset_score reads it unchanged.
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_is_holding BOOLEAN; -- DeepSeek
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_correct_type TEXT DEFAULT '';
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_rationale TEXT DEFAULT '';
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_is_holding BOOLEAN; -- Gemini
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_correct_type TEXT DEFAULT '';
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_rationale TEXT DEFAULT '';
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS panel_agreement TEXT DEFAULT ''; -- 3/3 | 2/3 | split | incomplete
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS anon_is_holding BOOLEAN; -- re-judge, case names masked
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS anon_stable BOOLEAN; -- anon verdict == consensus
ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS panel_generated_at TIMESTAMPTZ;
"""
@@ -4837,6 +4863,9 @@ async def goldset_list(batch: str = "default") -> list[dict]:
"SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, "
" g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, "
" g.ai_is_holding, g.ai_correct_type, g.ai_rationale, g.ai_generated_at, "
" g.ds_is_holding, g.ds_correct_type, g.ds_rationale, "
" g.gm_is_holding, g.gm_correct_type, g.gm_rationale, "
" g.panel_agreement, g.anon_is_holding, g.anon_stable, g.panel_generated_at, "
" h.rule_statement, h.supporting_quote, h.reasoning_summary, "
" h.rule_type, h.confidence, h.quality_flags, h.review_status, "
" cl.case_number, cl.case_name, cl.source_type, cl.precedent_level "
@@ -4851,6 +4880,8 @@ async def goldset_list(batch: str = "default") -> list[dict]:
d["tagged_at"] = d["tagged_at"].isoformat()
if d.get("ai_generated_at") is not None:
d["ai_generated_at"] = d["ai_generated_at"].isoformat()
if d.get("panel_generated_at") is not None:
d["panel_generated_at"] = d["panel_generated_at"].isoformat()
if d.get("confidence") is not None:
d["confidence"] = float(d["confidence"])
# authority is DERIVED from the source, never stored (INV-DM7)
@@ -4872,6 +4903,62 @@ async def goldset_set_ai_recommendation(
)
async def goldset_set_panel_label(
    goldset_id: UUID, *,
    claude: dict | None, deepseek: dict | None, gemini: dict | None,
    consensus_is_holding: bool | None, consensus_type: str,
    agreement: str,
    anon_is_holding: bool | None = None, anon_stable: bool | None = None,
) -> None:
    """Persist one tri-model panel labeling for a gold-set row (#81.7).

    Two statements run inside a single transaction:

    1. The per-lineage votes (claude→ai_*, deepseek→ds_*, gemini→gm_*), the
       agreement tag and the anonymization-probe columns are always written.
    2. Only when ``consensus_is_holding`` is not None AND ``agreement`` is
       '3/3' or '2/3', the consensus is copied into the ground-truth columns
       (is_holding / correct_type) with
       tagged_by='panel:opus+deepseek+gemini'. Rows whose tagged_by is
       'chair' are excluded from that second UPDATE, so a human tag wins.

    Each per-model argument is ``{"is_holding": bool, "type": str,
    "rationale": str}`` or None when that judge failed; a None judge is stored
    as (NULL, '', ''). Rationales are truncated to 300 characters.

    NOTE(review): when a re-run ends split/incomplete, the second UPDATE is
    simply skipped — a label written by an earlier panel run is not cleared
    here. Confirm that is intended for ``--force`` re-labeling.
    """
    def _columns(vote):
        # (is_holding, type, rationale) triple for one judge.
        if not isinstance(vote, dict):
            return None, "", ""
        holding = bool(vote["is_holding"]) if "is_holding" in vote else None
        kind = str(vote.get("type") or "")
        why = str(vote.get("rationale") or "")[:300]
        return holding, kind, why

    has_majority = consensus_is_holding is not None and agreement in ("3/3", "2/3")

    pool = await get_pool()
    async with pool.acquire() as conn, conn.transaction():
        await conn.execute(
            "UPDATE halacha_goldset SET "
            "ai_is_holding=$2, ai_correct_type=$3, ai_rationale=$4, ai_generated_at=now(), "
            "ds_is_holding=$5, ds_correct_type=$6, ds_rationale=$7, "
            "gm_is_holding=$8, gm_correct_type=$9, gm_rationale=$10, "
            "panel_agreement=$11, anon_is_holding=$12, anon_stable=$13, "
            "panel_generated_at=now() WHERE id=$1",
            goldset_id,
            *_columns(claude),
            *_columns(deepseek),
            *_columns(gemini),
            agreement, anon_is_holding, anon_stable,
        )
        if not has_majority:
            # Split / incomplete: ground-truth columns are left as they are.
            return
        # Real majority: promote it to ground truth, but never over a chair tag.
        await conn.execute(
            "UPDATE halacha_goldset SET is_holding=$2, correct_type=$3, "
            "tagged_by='panel:opus+deepseek+gemini', tagged_at=now() "
            "WHERE id=$1 AND COALESCE(tagged_by,'') <> 'chair'",
            goldset_id, consensus_is_holding, consensus_type,
        )
async def goldset_tag(
goldset_id: UUID, *, is_holding: bool | None = None,
correct_type: str | None = None, quote_complete: bool | None = None,
@@ -4895,9 +4982,13 @@ async def goldset_tag(
async def goldset_score(batch: str = "default") -> dict:
"""Measure each extraction validator against the human tags (#81.8).
"""Measure each extraction validator against the gold-set labels (#81.8).
A validator flag predicts "NOT a clean holding"; ground truth is
Ground truth is the ``is_holding`` column — set either by the tri-model
panel consensus (tagged_by='panel:…', #81.7) or by a human override
(tagged_by='chair'). Split-vote items stay NULL and are excluded here, so
the score reflects only items the panel (or chair) actually decided. A
validator flag predicts "NOT a clean holding"; ground truth is
is_holding == false. truncated_quote is scored against quote_complete."""
items = await goldset_list(batch)
labeled = [r for r in items if r.get("is_holding") is not None]

View File

@@ -0,0 +1,104 @@
"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set.
Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py
(consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully
OFFLINE — no DB, no model calls.
"""
from __future__ import annotations
import sys
from pathlib import Path
import pytest
# the script lives in ../scripts relative to mcp-server/
sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
import goldset_panel_label as g # noqa: E402
# ── consensus() ───────────────────────────────────────────────────────────────
@pytest.mark.parametrize("votes,expected", [
    ([True, True, True], (True, "3/3")),
    ([False, False, False], (False, "3/3")),
    ([True, True, False], (True, "2/3")),
    ([False, False, True], (False, "2/3")),
    # one judge failed but the two valid votes agree → still a 2-vote majority
    ([True, True, None], (True, "2/3")),
    ([None, False, False], (False, "2/3")),
    ([True, False, None], (None, "split")),  # 1-1 of the two valid → chair
    ([True, None, None], (None, "incomplete")),  # only one judge → chair
    ([None, None, None], (None, "incomplete")),
    ([], (None, "incomplete")),  # no votes at all
])
def test_consensus(votes, expected):
    """consensus() maps each vote pattern to its (decision, agreement-tag) pair.

    Covers unanimous, majority, majority-with-one-failed-judge, split and
    incomplete inputs, including the empty vote list.
    """
    assert g.consensus(votes) == expected
def test_split_writes_no_label():
    """A genuine 1-1 split must NOT yield a decision (escalates to chair, G10)."""
    # Two valid votes that disagree plus one failed judge: no majority exists.
    decided, tag = g.consensus([True, False, None])
    assert decided is None and tag == "split"
# ── consensus_type() ──────────────────────────────────────────────────────────
def test_consensus_type_holding_majority():
    """With decided=True the most frequent holding-side type wins."""
    # 'application' is not a holding-side type, so only the two 'holding' votes count.
    per = [{"type": "holding"}, {"type": "holding"}, {"type": "application"}]
    assert g.consensus_type(per, decided=True) == "holding"
def test_consensus_type_constrained_to_is_holding():
    """When the consensus is is_holding=False, only application/obiter types
    are eligible — an inconsistent 'holding' vote is ignored."""
    per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}]
    out = g.consensus_type(per, decided=False)
    # The two eligible types tie 1-1, so either is an acceptable winner.
    assert out in {"application", "obiter"}
def test_consensus_type_undecided_is_blank():
    """No is_holding consensus (decided=None) yields an empty type."""
    per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}]
    assert g.consensus_type(per, decided=None) == ""
# ── fleiss_kappa() ────────────────────────────────────────────────────────────
def test_fleiss_kappa_perfect_agreement():
    """Unanimous items split evenly across both categories give kappa == 1."""
    # every item rated 3/0 or 0/3 → κ == 1.0
    rows = [(3, 0), (3, 0), (0, 3), (0, 3)]
    assert g.fleiss_kappa(rows) == pytest.approx(1.0)
def test_fleiss_kappa_disagreement_is_low():
    """Systematic 2-1 / 1-2 disagreement scores below chance.

    For these two rows: P_i = 1/3 for each item, so P-bar = 1/3; both
    marginals are 0.5, so P_e = 0.5; kappa = (1/3 - 1/2) / (1 - 1/2) = -1/3.
    The exact value is pinned so a sign-preserving formula regression is
    caught, not just a sign flip.
    """
    rows = [(2, 1), (1, 2)]
    k = g.fleiss_kappa(rows)
    assert k is not None and k < 0.0  # worse than chance
    assert k == pytest.approx(-1 / 3)
def test_fleiss_kappa_ragged_returns_none():
    """Items with differing rater counts are rejected with None."""
    # mixed rater counts (3 then 2) is not well-defined → None
    assert g.fleiss_kappa([(3, 0), (1, 1)]) is None
def test_fleiss_kappa_empty_returns_none():
    """No rated items means there is nothing to compute: None."""
    assert g.fleiss_kappa([]) is None
# ── anonymize() ───────────────────────────────────────────────────────────────
def test_anonymize_masks_case_number_and_name():
    """Both the case number and the case name are removed from the text."""
    text = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
    out = g.anonymize(text, case_number="8125-09-24", case_name='פלוני בע"מ')
    assert "8125-09-24" not in out
    assert 'פלוני בע"מ' not in out
    # The case number is swapped for the fake token rather than deleted.
    assert g._FAKE_CASE in out
def test_anonymize_no_identifiers_is_noop():
    """With neither identifier supplied the text is returned unchanged."""
    text = "כלל משפטי כללי ללא מזהים."
    assert g.anonymize(text, case_number=None, case_name=None) == text
def test_anonymize_preserves_legal_substance():
    """Masking swaps only the identifier — the rule text is untouched."""
    text = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
    out = g.anonymize(text, case_number="9001-01-20", case_name=None)
    # The rule wording survives verbatim; only the case number disappears.
    assert "מיצוי הליכים הוא תנאי-סף" in out
    assert "9001-01-20" not in out

View File

@@ -48,7 +48,8 @@
| `backfill_nevo_preamble.py` | python | **#86.2** — מיגרציית-נתונים: חיתוך preamble/רציו של נבו שדלף לפסיקה שהוטמעה לפני תיקון #86.1. מאתר כל `case_law` ש-`strip_nevo_preamble(full_text)` עדיין מקצר (דליפה היסטורית), ומבצע: (1) לכידת ה-מיני-רציו ל-`case_law.nevo_ratio` (gold-set ל-#86.3); (2) שכתוב `full_text` החתוך + חישוב-מחדש של `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ללא re-OCR/LLM); (4) **סימון (לא מחיקה)** הלכות ש-`supporting_quote` שלהן בתוך ה-preamble שהוסר → `pending_review` + quality_flag `nevo_preamble_leak`. **שומר-בטיחות:** שורות עם keep%<`--min-keep` (ברירת-מחדל 60) מוחרגות מ-`--apply` כחשד over-strip (אלא אם `--include-suspicious`). **dry-run כברירת-מחדל**; `--apply` כותב backup JSON + manifest CSV ל-`data/audit/` תחילה. idempotent. רץ עם venv של mcp-server. **chair-gated** (לאמת manifest לפני apply) | מיגרציית-נתונים — dry-run בוצע (19 פסקים, 27 הלכות מזוהמות); apply ממתין לאישור |
| `nevo_ratio_benchmark.py` | python | **#86.3** — מדידת איכות חילוץ-הלכות מול ה-מיני-רציו של נבו (gold-set מקצועי חינמי). לכל פסק עם `nevo_ratio` (או נגזר מ-`full_text` אם טרם בוצע backfill): LLM-judge מקומי (`claude_session`, אפס עלות) ממפה סמנטית את הלכות-המערכת מול הלכות-נבו ומפיק **recall** (כיסוי הלכות-נבו), **precision** (אחוז הלכותינו הממופות), **granularity** (יחס פירוק — איתות over-extraction ל-#81.5). `--case <num>` / `--all [--limit N]` / `--model` / `--out`. כותב CSV ל-`data/audit/`. רץ עם venv של mcp-server (דורש Claude CLI מקומי). אומת על בג"ץ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ידני — מדידת-איכות (CI/ad-hoc) |
| `halacha_goldset.py` | python | **#81.7** — הארנס gold-set לאיכות חילוץ-הלכות. `export --n N` מייצא מדגם מרובד (לפי precedent×rule_type) ל-CSV עם עמודות-תיוג ריקות (`is_holding`/`correct_type`/`quote_complete`) לתיוג ידני (חיים/דפנה). `score --in <csv>` קורא את ה-CSV המתויג ומודד כל ולידטור (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) מול אמת-המידה האנושית: P/R/F1 + confusion. בסיס ל-#81.8 (כיול סף האישור). מייבא את אותם ולידטורים שה-extractor מריץ. רץ עם venv של mcp-server. **הערה:** קיים גם דף-תיוג אינטראקטיבי DB-backed (`/goldset`) — זה ה-CSV-fallback | ידני — export→תיוג→score |
| `goldset_ai_recommend.py` | python | **#81.7 QA** — מייצר **חוות-דעת-AI שנייה** (claude מקומי, אפס עלות) לכל פריט ב-`halacha_goldset`: `is_holding`+`type`+נימוק, נשמר ב-`ai_*` ומוצג בדף לצד התיוג האנושי לזיהוי אי-הסכמות. **עצמאי** מהוולידטורים שנמדדים (אין מעגליות) ו**לא** מוחל אוטומטית. `--force` (חידוש)/`--limit N`. **חובה מקומי** (claude_session). | ידני — לאחר יצירת/הרחבת batch |
| `goldset_panel_label.py` | python | **#81.7 — תיוג ה-gold-set בקונצנזוס תלת-מודלי (ללא man-in-the-loop, הנחיית-יו"ר 2026-06-11).** מריץ את שלושת השופטים העצמאיים (Opus/claude_session · DeepSeek · Gemini, מיובאים מ-`halacha_panel_approve`) עם ה-prompt העשיר (`is_holding`+`type`+נימוק מ-`goldset_ai_recommend`) על כל פריט; **רוב 2/3 נכתב ל-`is_holding`/`correct_type`** עם `tagged_by='panel:opus+deepseek+gemini'` (פיצול→NULL→יו"ר, INV-G10). מודד **Fleiss κ** (3 מעריכים) ומריץ **מבחן-אנונימיזציה** (שמות-תיק ממוסכים→שיפוט-מחדש; flip=שינון). לא מעגלי — הוולידטורים הנמדדים rule-based. כותב per-model+consensus+anon ל-DB ודוח ל-`data/audit/`. **מחליף** תיוג-ידני; `goldset_ai_recommend`/`goldset_independent_judge` נשארים כבדיקות single-model. `--limit`/`--no-anon`/`--force`. **חובה מקומי**. | ידני — לאחר יצירת/הרחבת batch |
| `goldset_ai_recommend.py` | python | **#81.7 QA (single-model, נבלע ב-panel)** — חוות-דעת claude בלבד ל-`ai_*`. כעת לינאז' 1/3 בתוך `goldset_panel_label`; נשאר כבדיקת-claude עצמאית/חידוש נקודתי. `--force`/`--limit`. **חובה מקומי**. | ידני — בדיקה נקודתית |
| `goldset_independent_judge.py` | python | **INV-DM7 ולידציה** — שופט-תפקיד **עצמאי שני** ממודל אחר (DeepSeek API ישיר, OpenAI-compatible) ששובר את עיגון-ה-AI: מסווג rule_role **בעיוור** (בלי לראות תיוג-אדם או המלצת-claude) ומחשב מטריצת-הסכמה (deepseek↔אדם מול ai↔אדם) + ציר-גס (כלל-בר-הכללה מול application/obiter). **ממצא (2026-06-07):** ai↔אדם=100% (מעוגן), deepseek↔אדם=50% מדויק אך **92% גס** → תת-הסוג holding/interpretive/procedural עמום-מטבעו (לא לשער עליו); הציר-הגס אמין חוצה-מודלים. read-only על הזהב. `--model`/`--limit`/`--concurrency`. מפתח מ-`~/.hermes/profiles/deepseek/.env`. raw→`/tmp/goldset_judge_raw.json`. | ידני — ולידציית אמינות-תוויות |
| `halacha_panel_approve.py` | python | **פאנל-אישור הלכות (Trust-or-Escalate, dry-run).** 3 שופטים בלתי-תלויי-לינאז' (Opus/claude_session · DeepSeek · Gemini-2.5-flash) מצביעים על ה**ציר-הגס האמין** (92% חוצה-מודלים): נקיות→"הלכה לשמירה?"; nli_unsupported→"הציטוט תומך בכלל?" (שיפוט-מחדש); פגומות→re-extraction. רק ורדיקט מוסכם פועל אוטומטית, **פיצול מסלים ליו"ר** (INV-G10). `--apply` **מחווט** (clean: רוב 2/3; nli: פה-אחד-entailed מנקה flag) — הפיך, מגבה ל-`data/audit/` קודם. מפתחות: DeepSeek מ-`~/.hermes/...`, Gemini מ-`~/.env`. **חובה מקומי**. dry-run 2026-06-07: 197→103 אוטו (פה-אחד) / ~15 (רוב). | ידני / שלב-אימות-הלכות במסלול-הסופי |
| `style_lesson_panel.py` | python | **פאנל-סגנון דו-סוכני (למידה כפולה).** על-גבי דיסטילציית-ה-Opus (draft↔final ב-`draft_final_pairs.analysis`), שני שופטים בלתי-תלויים — DeepSeek + Gemini-2.5-flash — מצביעים לכל לקח על השאלה הגסה "האם זו הנחיית-סגנון מופשטת ובת-הכללה (INV-LRN5 — קול ולא מהות)?". הסכמה 2/2-keep → נכתב כ-`decision_lesson` (`source=panel:deepseek+gemini`); 2/2-drop → לא נכתב; פיצול/substance → מוסלם ליו"ר. `--apply` הפיך, מגבה ל-`data/audit/`. הטמעה ל-SKILL.md/lessons.md נשארת שער-יו"ר ידני (INV-G10). מפתחות כמו פאנל-ההלכות. **חובה מקומי**. `--case <num>` / `--pair-id <uuid>`. | שלב-למידה במסלול-הסופי |

View File

@@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""#81.7 — label the halacha gold-set by TRI-MODEL CONSENSUS (no man-in-the-loop).
Chair directive (2026-06-11): replace manual chair/Dafna tagging of the gold-set
with the agreement of three INDEPENDENT model lineages. This is the same panel
the live approval triage uses (``halacha_panel_approve.py``), proven to agree on
the coarse "is this a real, keepable rule?" axis across models (92%):
- claude (Opus via claude_session — local CLI, zero marginal cost) [Anthropic]
- deepseek (api.deepseek.com) [DeepSeek]
- gemini (gemini-2.5-flash) [Google]
Why this is NOT circular: the validators measured downstream (#81.8 —
compute_quality_flags / is_fact_dependent / is_quote_truncated /
is_thin_restatement) are RULE-BASED heuristics, a different method family from
the LLM judges. Two honesty guards:
1. SPLIT vote (no 2/3 agreement) writes NO ground-truth label — the item
stays NULL and escalates to the chair (INV-G10).
2. ANONYMIZATION probe — every item is re-judged with the case identifier
masked/faked; if the consensus flips, the verdict was keying on the
identifier (memorization), not the legal reasoning. Reported as a
stability rate (arXiv:2505.02172).
Reuses the model callers from halacha_panel_approve and the rich is_holding+type
prompt from goldset_ai_recommend — single source, no parallel path (G2).
Run locally (claude_session needs the CLI; DeepSeek/Gemini keys from ~/.env):
cd ~/legal-ai/mcp-server
.venv/bin/python ../scripts/goldset_panel_label.py --limit 8 # smoke
.venv/bin/python ../scripts/goldset_panel_label.py # full, with anon
.venv/bin/python ../scripts/goldset_panel_label.py --no-anon # skip the anon probe
"""
from __future__ import annotations
import argparse
import asyncio
import json
import re
import sys
from collections import Counter
from datetime import datetime, timezone
from pathlib import Path
from uuid import UUID
import httpx
from legal_mcp.services import claude_session, db
# Reuse the model callers (DeepSeek/Gemini HTTP) and the rich gold-set prompt —
# importing them keeps ONE source of truth for each (G2).
sys.path.insert(0, str(Path(__file__).resolve().parent))
from halacha_panel_approve import judge_deepseek, judge_gemini # noqa: E402
from goldset_ai_recommend import SYSTEM, VALID_TYPES, _prompt # noqa: E402
# ── consensus aggregation (pure — unit-tested) ────────────────────────────────
def consensus(votes: list[bool | None]) -> tuple[bool | None, str]:
    """Reduce the judges' is_holding votes to a single panel verdict.

    None entries (failed judges) are dropped first. The result is
    ``(decision, tag)``:

    - fewer than two usable votes → ``(None, 'incomplete')``
    - usable votes tie            → ``(None, 'split')``
    - exactly three usable votes, all equal → ``(decision, '3/3')``
    - any other majority          → ``(decision, '2/3')``

    A decision is only ever returned together with '3/3' or '2/3'.
    """
    cast = [bool(v) for v in votes if v is not None]
    total = len(cast)
    if total < 2:
        return None, "incomplete"

    ayes = cast.count(True)
    nays = total - ayes
    if ayes == nays:
        return None, "split"

    unanimous = total == 3 and 3 in (ayes, nays)
    return ayes > nays, ("3/3" if unanimous else "2/3")
def consensus_type(per_model: list[dict | None], decided: bool | None) -> str:
    """Pick the panel's rule_type, restricted to types that fit the verdict.

    Eligible types are holding/interpretive/procedural when ``decided`` is
    True and application/obiter when it is False; votes outside that set
    (and failed judges) are ignored. The most frequent eligible type is
    returned; on a tie the one voted first wins. Returns '' when ``decided``
    is None or no judge offered an eligible type.
    """
    if decided is None:
        return ""

    if decided:
        allowed = {"holding", "interpretive", "procedural"}
    else:
        allowed = {"application", "obiter"}

    tally: Counter = Counter()
    for vote in per_model:
        if not isinstance(vote, dict):
            continue
        label = str(vote.get("type") or "")
        if label in allowed:
            tally[label] += 1

    if not tally:
        return ""
    winner, _count = tally.most_common(1)[0]
    return winner
def fleiss_kappa(rows: list[tuple[int, int]]) -> float | None:
    """Compute Fleiss' kappa for two-category ratings.

    Each row is ``(yes_count, no_count)`` for one item. Rows with no ratings
    are discarded. Returns None when nothing is left, when the rater count is
    below two, or when items were rated by differing numbers of raters. When
    every rating falls in a single category the chance agreement is 1 and the
    function returns 1.0 by convention.
    """
    rated = [(yes, no) for (yes, no) in rows if (yes + no) > 0]
    items = len(rated)
    if items == 0:
        return None

    raters = rated[0][0] + rated[0][1]
    if raters < 2:
        return None
    for yes, no in rated:
        if (yes + no) != raters:
            return None  # ragged rater counts — not well-defined

    # Observed agreement: mean over items of the within-item pair agreement.
    per_item = [
        (yes * (yes - 1) + no * (no - 1)) / (raters * (raters - 1))
        for (yes, no) in rated
    ]
    observed = sum(per_item) / items

    # Chance agreement from the marginal share of each category.
    share_yes = sum(yes for (yes, _) in rated) / (items * raters)
    share_no = sum(no for (_, no) in rated) / (items * raters)
    expected = share_yes ** 2 + share_no ** 2
    if expected >= 1.0:
        return 1.0  # degenerate (all one category) → perfect by convention

    return (observed - expected) / (1 - expected)
# ── anonymization probe (pure — unit-tested) ──────────────────────────────────
_FAKE_CASE = "12345-67-89"
_FAKE_NAME = "פלוני נ' אלמוני"
def anonymize(text: str, case_number: str | None, case_name: str | None) -> str:
"""Mask the case identifier so the model can't key on a memorized case.
Replaces the literal case_number and case_name (if they appear) with fake
plausible tokens. Legal substance (the rule + quote) is untouched — only the
identifiers that enable memorization are swapped (arXiv:2505.02172)."""
out = text
if case_number:
out = out.replace(case_number, _FAKE_CASE)
# also catch a bare nnnn-nn-nn / nnnn/nn pattern of the same case
out = re.sub(re.escape(case_number).replace(r"\-", r"[-/]"), _FAKE_CASE, out)
if case_name:
out = out.replace(case_name, _FAKE_NAME)
return out
# ── one panel pass over a single item ─────────────────────────────────────────
def _parse(d: dict | None) -> dict | None:
if not isinstance(d, dict) or "is_holding" not in d:
return None
t = str(d.get("type") or "").strip()
return {
"is_holding": bool(d["is_holding"]),
"type": t if t in VALID_TYPES else "",
"rationale": str(d.get("rationale") or "")[:300],
}
async def _judge_claude(user: str) -> dict | None:
    """Ask the claude judge for a verdict on ``user``; None if the call fails.

    Best-effort by design: any exception from ``claude_session.query_json``
    is turned into a None vote so one failing judge cannot abort the run. The
    failure is reported on stderr so an operator can tell a systematically
    broken judge apart from genuine disagreement.
    """
    try:
        return await claude_session.query_json(user, system=SYSTEM, effort="low")
    except Exception as exc:  # noqa: BLE001
        print(f"claude judge failed: {exc!r}", file=sys.stderr, flush=True)
        return None
async def panel_pass(client: httpx.AsyncClient, user: str) -> tuple[list[dict | None], bool | None, str]:
    """Query all three judges concurrently on one prompt.

    Returns ``(per_model, verdict, tag)``: the normalized votes in the fixed
    order claude / deepseek / gemini (None for a judge with no usable answer),
    followed by the is_holding consensus and its agreement tag.
    """
    raw_answers = await asyncio.gather(
        _judge_claude(user),
        judge_deepseek(client, SYSTEM, user),
        judge_gemini(client, SYSTEM, user),
    )
    claude_raw, deepseek_raw, gemini_raw = raw_answers
    per_model = [_parse(claude_raw), _parse(deepseek_raw), _parse(gemini_raw)]

    ballots = []
    for vote in per_model:
        ballots.append(vote["is_holding"] if vote else None)
    verdict, tag = consensus(ballots)
    return per_model, verdict, tag
async def main(args: argparse.Namespace) -> int:
    """Label one gold-set batch by panel consensus and write an audit report.

    For every item not yet paneled (or every item with ``--force``), runs the
    three judges, optionally re-runs them on an anonymized prompt, stores the
    votes and consensus via ``db.goldset_set_panel_label`` and prints a
    per-item line. Afterwards prints the agreement breakdown, Fleiss' kappa
    over the items all three judges rated, and the anonymization stability
    rate, then writes the same summary as JSON under ``data/audit/``.

    Args:
        args: Parsed CLI namespace (batch, force, limit, concurrency, no_anon).

    Returns:
        Process exit code (always 0 when the run completes).
    """
    # The banner only states the run configuration; this script does not
    # inspect API keys itself.
    print(f"judges — claude:local · deepseek/gemini:API (keys: see halacha_panel_approve) · "
          f"anon:{not args.no_anon}\n", flush=True)
    items = await db.goldset_list(args.batch)
    todo = [it for it in items if args.force or not it.get("panel_generated_at")]
    if args.limit:
        todo = todo[: args.limit]
    print(f"gold-set '{args.batch}': {len(items)} items, {len(todo)} to label by panel", flush=True)

    # Clamp to at least one worker: 0 would make the batching range() below
    # raise ValueError (zero step).
    width = max(1, args.concurrency)
    sem = asyncio.Semaphore(width)
    tags: Counter = Counter()
    kappa_rows: list[tuple[int, int]] = []
    anon_checked = anon_stable = 0

    async with httpx.AsyncClient() as client:
        async def run(i: int, it: dict) -> None:
            nonlocal anon_checked, anon_stable
            async with sem:
                user = _prompt(it)
                per, decided, tag = await panel_pass(client, user)

                # Anonymization probe: only meaningful when there is a
                # consensus to compare the re-judged verdict against.
                anon_hold = anon_st = None
                if not args.no_anon and decided is not None:
                    anon_user = anonymize(user, it.get("case_number"), it.get("case_name"))
                    _, anon_decided, _ = await panel_pass(client, anon_user)
                    if anon_decided is not None:
                        anon_hold = anon_decided
                        anon_st = (anon_decided == decided)
                        anon_checked += 1
                        anon_stable += int(anon_st)

                ctype = consensus_type(per, decided)
                await db.goldset_set_panel_label(
                    UUID(str(it["id"])),
                    claude=per[0], deepseek=per[1], gemini=per[2],
                    consensus_is_holding=decided, consensus_type=ctype,
                    agreement=tag, anon_is_holding=anon_hold, anon_stable=anon_st,
                )
                tags[tag] += 1
                # κ counts only items all three judged
                nv = [m for m in per if m is not None]
                if len(nv) == 3:
                    y = sum(1 for m in nv if m["is_holding"])
                    kappa_rows.append((y, 3 - y))
                mark = {"3/3": "✓✓✓", "2/3": "✓✓", "split": "", "incomplete": ""}[tag]
                astr = "" if anon_st is None else (" anon✓" if anon_st else " anon✗FLIP")
                print(f"[{i}/{len(todo)}] {it.get('case_number')}: {mark} {tag} "
                      f"{decided}/{ctype}{astr}", flush=True)

        # Items are processed in batches of `width`; the client must stay
        # open until the last batch has finished.
        tasks = [run(i, it) for i, it in enumerate(todo, 1)]
        for j in range(0, len(tasks), width):
            await asyncio.gather(*tasks[j : j + width])

    kappa = fleiss_kappa(kappa_rows)
    decided_n = tags["3/3"] + tags["2/3"]
    print("\n" + "=" * 60)
    print(f"PANEL LABELING — gold-set '{args.batch}'")
    print("=" * 60)
    print(f" 3/3 unanimous : {tags['3/3']}")
    print(f" 2/3 majority : {tags['2/3']}")
    print(f" ⚖ split→chair : {tags['split']}")
    print(f" … incomplete : {tags['incomplete']}")
    print(f" DECIDED (labels written): {decided_n}/{len(todo)}")
    if kappa is not None:
        interp = ("almost-perfect" if kappa >= 0.8 else "substantial" if kappa >= 0.6
                  else "moderate" if kappa >= 0.4 else "fair/poor")
        print(f" Fleiss κ (3 raters, is_holding, n={len(kappa_rows)}): {kappa:.3f} ({interp})")
    if anon_checked:
        rate = anon_stable / anon_checked
        print(f" anonymization stability: {anon_stable}/{anon_checked} = {rate:.1%} "
              f"({'robust' if rate >= 0.9 else 'CHECK memorization'})")

    ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    report = Path(__file__).resolve().parent.parent / "data" / "audit" / f"goldset-panel-{args.batch}-{ts}.json"
    report.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False may emit non-ASCII text, so the encoding is pinned
    # instead of relying on the platform default.
    report.write_text(json.dumps({
        "batch": args.batch, "labeled": len(todo), "agreement": dict(tags),
        "decided": decided_n, "fleiss_kappa": kappa,
        "anon_checked": anon_checked, "anon_stable": anon_stable,
    }, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"\nreport → {report}")
    print("next: .venv/bin/python ../scripts/halacha_goldset.py score "
          "(measures validators vs the consensus labels — #81.8)")
    return 0
# Script entry point: parse the CLI flags and run main() on a fresh event
# loop, using its return value as the process exit code.
if __name__ == "__main__":
    # RawDescriptionHelpFormatter keeps the module docstring's line breaks
    # (including the usage examples) intact in --help.
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--batch", default="default")
    ap.add_argument("--force", action="store_true", help="re-label even if already paneled")
    # 0 is falsy, so main() treats it as "no limit".
    ap.add_argument("--limit", type=int, default=0)
    ap.add_argument("--concurrency", type=int, default=4)
    ap.add_argument("--no-anon", action="store_true", help="skip the anonymization probe")
    raise SystemExit(asyncio.run(main(ap.parse_args())))