feat(halacha): #81.7 — gold-set labeled by tri-model consensus (Opus+DeepSeek+Gemini)

מבטל את ה-man-in-the-loop בתיוג ה-gold-set (הנחיית-יו"ר 2026-06-11): במקום תיוג ידני של חיים/דפנה, אמת-המידה נקבעת בקונצנזוס שלוש שושלות-מודל עצמאיות — אותו פאנל שמערכת האישור החיה כבר משתמשת בו (halacha_panel_approve), עם 92% הסכמה חוצת-מודלים על הציר הגס. למה לא מעגלי: הוולידטורים הנמדדים ב-#81.8 (compute_quality_flags / is_fact_dependent / is_quote_truncated / is_thin_restatement) הם היוריסטיקות **rule-based** — משפחת-שיטה שונה מה-LLM-judges. שני שומרי-יושר: (1) פיצול-קולות (אין רוב 2/3) לא כותב לייבל — הפריט נשאר NULL ומוסלם ליו"ר (INV-G10); (2) מבחן-אנונימיזציה — שיפוט-מחדש עם מזהה-התיק ממוסך, flip בקונצנזוס = שינון ולא הנמקה (arXiv:2505.02172). - db.py: עמודות per-lineage (ds_*/gm_*; ai_*=claude קיים) + consensus/agreement/anon + goldset_set_panel_label() שכותב רוב-2/3 ל-is_holding/correct_type (tagged_by='panel:…', לא דורס tagged_by='chair'). goldset_score נשאר ללא שינוי — קורא is_holding (G2, אין מסלול ניקוד מקביל). עדכון הערת-הסכמה (בוטלה דרישת "MUST be human"). - scripts/goldset_panel_label.py: 3 שופטים (מיובאים מ-halacha_panel_approve, מקור-אמת יחיד) + prompt עשיר (מיובא מ-goldset_ai_recommend) + Fleiss κ + מבחן-אנונימיזציה. דוח→data/audit/. - SCRIPTS.md: סקריפט חדש; goldset_ai_recommend/independent_judge מסומנים single-model נבלעים. invariants: G2 (שופטים+prompt מיובאים, אין כפילות; ניקוד יחיד) · INV-G10 (פיצול→יו"ר) · INV-LRN2/LRN3 (איכות-במקור, לכידה מובנית). מקור: PoLL · Trust-or-Escalate (ICLR 2025) · arXiv:2505.02172. tests: 18 offline (consensus/type/Fleiss-κ/anonymize). live labeling = צעד תפעולי אחרי deploy. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-11 16:03:32 +00:00
parent 369755c350
commit 5b001bbd9d
4 changed files with 474 additions and 8 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1259,11 +1259,21 @@ CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b)
 """

 SCHEMA_V29_SQL = """
-- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified
-- sample of halachot the chair/Dafna labels (is_holding / correct_type /
-- quote_complete) so we can measure the extraction validators' precision/recall
-- and recalibrate the auto-approve threshold. The tags are the ground truth —
-- they MUST be human (no AI pre-fill) to avoid circular bias.
+-- halacha_goldset (#81.7/#81.8): an evaluation set. A stratified sample of
+-- halachot labeled (is_holding / correct_type / quote_complete) so we can
+-- measure the extraction validators' precision/recall and recalibrate the
+-- auto-approve threshold.
+-- LABELING — tri-model consensus (no man-in-the-loop, chair directive
+-- 2026-06-11): the ground-truth label is the MAJORITY of three independent
+-- model lineages (Opus / DeepSeek / Gemini), written here with
+-- tagged_by='panel:opus+deepseek+gemini'. This is NOT circular: the validators
+-- being measured (#81.8, compute_quality_flags / is_fact_dependent / …) are
+-- RULE-BASED heuristics, a different method family from the LLM judges. Two
+-- guards keep the consensus honest: (1) a SPLIT vote (no 2/3) writes NO label
+-- (is_holding stays NULL → escalates to the chair, INV-G10), and (2) the
+-- anonymization probe (anon_*) re-judges with case names masked to catch
+-- memorization vs genuine reasoning. A human tag (goldset_tag, tagged_by=
+-- 'chair') still overrides the panel for any item.
 CREATE TABLE IF NOT EXISTS halacha_goldset (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE,
@@ -1286,6 +1296,22 @@ ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_is_holding BOOLEAN;
 ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_correct_type TEXT DEFAULT '';
 ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_rationale TEXT DEFAULT '';
 ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ;
+
+-- Tri-model consensus labeling (#81.7, chair directive 2026-06-11). ai_* above
+-- holds the Opus/claude vote (lineage 1); these hold the other two lineages,
+-- plus the derived consensus, the per-item agreement, and the anonymization
+-- probe. The consensus (when not split) is also written into is_holding /
+-- correct_type with tagged_by='panel:...' so goldset_score reads it unchanged.
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_is_holding BOOLEAN;       -- DeepSeek
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_correct_type TEXT DEFAULT '';
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ds_rationale TEXT DEFAULT '';
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_is_holding BOOLEAN;       -- Gemini
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_correct_type TEXT DEFAULT '';
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS gm_rationale TEXT DEFAULT '';
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS panel_agreement TEXT DEFAULT '';  -- 3/3 | 2/3 | split | incomplete
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS anon_is_holding BOOLEAN;     -- re-judge, case names masked
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS anon_stable BOOLEAN;         -- anon verdict == consensus
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS panel_generated_at TIMESTAMPTZ;
 """


@@ -4837,6 +4863,9 @@ async def goldset_list(batch: str = "default") -> list[dict]:
        "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, "
        "       g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, "
        "       g.ai_is_holding, g.ai_correct_type, g.ai_rationale, g.ai_generated_at, "
+        "       g.ds_is_holding, g.ds_correct_type, g.ds_rationale, "
+        "       g.gm_is_holding, g.gm_correct_type, g.gm_rationale, "
+        "       g.panel_agreement, g.anon_is_holding, g.anon_stable, g.panel_generated_at, "
        "       h.rule_statement, h.supporting_quote, h.reasoning_summary, "
        "       h.rule_type, h.confidence, h.quality_flags, h.review_status, "
        "       cl.case_number, cl.case_name, cl.source_type, cl.precedent_level "
@@ -4851,6 +4880,8 @@ async def goldset_list(batch: str = "default") -> list[dict]:
            d["tagged_at"] = d["tagged_at"].isoformat()
        if d.get("ai_generated_at") is not None:
            d["ai_generated_at"] = d["ai_generated_at"].isoformat()
+        if d.get("panel_generated_at") is not None:
+            d["panel_generated_at"] = d["panel_generated_at"].isoformat()
        if d.get("confidence") is not None:
            d["confidence"] = float(d["confidence"])
        # authority is DERIVED from the source, never stored (INV-DM7)
@@ -4872,6 +4903,62 @@ async def goldset_set_ai_recommendation(
    )


+async def goldset_set_panel_label(
+    goldset_id: UUID, *,
+    claude: dict | None, deepseek: dict | None, gemini: dict | None,
+    consensus_is_holding: bool | None, consensus_type: str,
+    agreement: str,
+    anon_is_holding: bool | None = None, anon_stable: bool | None = None,
+) -> None:
+    """Persist one gold-set item's tri-model panel verdict (#81.7).
+
+    In one transaction: (1) always overwrites the per-lineage votes (claude→ai_*,
+    deepseek→ds_*, gemini→gm_*), panel_agreement, anon_* and both *_generated_at
+    stamps; (2) only if ``consensus_is_holding`` is not None and ``agreement`` is
+    "3/3" or "2/3", copies the consensus into is_holding / correct_type with
+    tagged_by='panel:opus+deepseek+gemini' (what ``goldset_score`` reads), except
+    on rows tagged_by='chair'. Otherwise (split/incomplete → chair, INV-G10) those
+    columns are left as-is — NOTE(review): even a label from an earlier panel run.
+
+    Per-model dicts are ``{"is_holding", "type", "rationale"}``, or None for a
+    failed judge (stored as NULL / '' / ''); rationale is cut to 300 chars.
+    """
+    def _h(d):
+        return bool(d["is_holding"]) if isinstance(d, dict) and "is_holding" in d else None
+
+    def _t(d):
+        return str(d.get("type") or "") if isinstance(d, dict) else ""
+
+    def _r(d):
+        return str(d.get("rationale") or "")[:300] if isinstance(d, dict) else ""
+
+    pool = await get_pool()
+    async with pool.acquire() as conn:
+        async with conn.transaction():
+            await conn.execute(
+                "UPDATE halacha_goldset SET "
+                "ai_is_holding=$2, ai_correct_type=$3, ai_rationale=$4, ai_generated_at=now(), "
+                "ds_is_holding=$5, ds_correct_type=$6, ds_rationale=$7, "
+                "gm_is_holding=$8, gm_correct_type=$9, gm_rationale=$10, "
+                "panel_agreement=$11, anon_is_holding=$12, anon_stable=$13, "
+                "panel_generated_at=now() WHERE id=$1",
+                goldset_id,
+                _h(claude), _t(claude), _r(claude),
+                _h(deepseek), _t(deepseek), _r(deepseek),
+                _h(gemini), _t(gemini), _r(gemini),
+                agreement, anon_is_holding, anon_stable,
+            )
+            # Ground truth is written only for a real majority ("3/3" / "2/3"); the
+            # WHERE clause skips rows whose tagged_by is exactly 'chair' (human tag).
+            if consensus_is_holding is not None and agreement in ("3/3", "2/3"):
+                await conn.execute(
+                    "UPDATE halacha_goldset SET is_holding=$2, correct_type=$3, "
+                    "tagged_by='panel:opus+deepseek+gemini', tagged_at=now() "
+                    "WHERE id=$1 AND COALESCE(tagged_by,'') <> 'chair'",
+                    goldset_id, consensus_is_holding, consensus_type,
+                )
+
+
 async def goldset_tag(
    goldset_id: UUID, *, is_holding: bool | None = None,
    correct_type: str | None = None, quote_complete: bool | None = None,
@@ -4895,9 +4982,13 @@ async def goldset_tag(


 async def goldset_score(batch: str = "default") -> dict:
-    """Measure each extraction validator against the human tags (#81.8).
+    """Measure each extraction validator against the gold-set labels (#81.8).

-    A validator flag predicts "NOT a clean holding"; ground truth is
+    Ground truth is the ``is_holding`` column — set either by the tri-model
+    panel consensus (tagged_by='panel:…', #81.7) or by a human override
+    (tagged_by='chair'). Split-vote items stay NULL and are excluded here, so
+    the score reflects only items the panel (or chair) actually decided. A
+    validator flag predicts "NOT a clean holding"; ground truth is
    is_holding == false. truncated_quote is scored against quote_complete."""
    items = await goldset_list(batch)
    labeled = [r for r in items if r.get("is_holding") is not None]
--- a/mcp-server/tests/test_goldset_panel_consensus.py
+++ b/mcp-server/tests/test_goldset_panel_consensus.py
@@ -0,0 +1,104 @@
+"""Tests for #81.7 — tri-model consensus labeling of the halacha gold-set.
+
+Covers the pure aggregation/probe functions in scripts/goldset_panel_label.py
+(consensus vote, type consensus, Fleiss' kappa, anonymization masking). Fully
+OFFLINE — no DB, no model calls.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+import pytest
+
+# the script lives in ../scripts relative to mcp-server/
+sys.path.insert(0, str(Path(__file__).resolve().parents[2] / "scripts"))
+import goldset_panel_label as g  # noqa: E402
+
+
+# ── consensus() ───────────────────────────────────────────────────────────────
+
+@pytest.mark.parametrize("votes,expected", [
+    ([True, True, True], (True, "3/3")),         # unanimous: holding
+    ([False, False, False], (False, "3/3")),     # unanimous: not a holding
+    ([True, True, False], (True, "2/3")),        # majority: holding
+    ([False, False, True], (False, "2/3")),      # majority: not a holding
+    ([True, False, None], (None, "split")),      # 1-1 of the two valid → chair
+    ([True, None, None], (None, "incomplete")),  # only one judge → chair
+    ([None, None, None], (None, "incomplete")),  # no judge produced a vote
+])
+def test_consensus(votes, expected):
+    assert g.consensus(votes) == expected  # a (decision, agreement-tag) pair
+
+
+def test_split_writes_no_label():
+    """A genuine 1-1 split must NOT yield a decision (escalates to chair, INV-G10)."""
+    decided, tag = g.consensus([True, False, None])
+    assert decided is None and tag == "split"
+
+
+# ── consensus_type() ──────────────────────────────────────────────────────────
+
+def test_consensus_type_holding_majority():
+    per = [{"type": "holding"}, {"type": "holding"}, {"type": "application"}]  # 2 of 3 say "holding"
+    assert g.consensus_type(per, decided=True) == "holding"
+
+
+def test_consensus_type_constrained_to_is_holding():
+    """When the consensus is is_holding=False, only application/obiter types
+    are eligible — an inconsistent 'holding' vote is ignored."""
+    per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}]
+    out = g.consensus_type(per, decided=False)
+    assert out in {"application", "obiter"}  # either eligible type passes; the tie-break is not pinned
+
+
+def test_consensus_type_undecided_is_blank():
+    per = [{"type": "holding"}, {"type": "application"}, {"type": "obiter"}]
+    assert g.consensus_type(per, decided=None) == ""  # no is_holding decision → empty type
+
+
+# ── fleiss_kappa() ────────────────────────────────────────────────────────────
+
+def test_fleiss_kappa_perfect_agreement():
+    # one row per item, holding per-category rater counts; every item is 3/0 or 0/3 → κ == 1.0
+    rows = [(3, 0), (3, 0), (0, 3), (0, 3)]
+    assert g.fleiss_kappa(rows) == pytest.approx(1.0)
+
+
+def test_fleiss_kappa_disagreement_is_low():
+    rows = [(2, 1), (1, 2)]  # every item is a 2-1 split, in opposite directions
+    k = g.fleiss_kappa(rows)
+    assert k is not None and k < 0.0  # worse than chance
+
+
+def test_fleiss_kappa_ragged_returns_none():
+    # rows sum to different rater counts (3, then 2) — not well-defined → None
+    assert g.fleiss_kappa([(3, 0), (1, 1)]) is None
+
+
+def test_fleiss_kappa_empty_returns_none():
+    assert g.fleiss_kappa([]) is None  # no rated items → None
+
+
+# ── anonymize() ───────────────────────────────────────────────────────────────
+
+def test_anonymize_masks_case_number_and_name():
+    text = "מקור: החלטת ועדת-ערר (8125-09-24). העוררים פלוני בע\"מ טענו..."
+    out = g.anonymize(text, case_number="8125-09-24", case_name='פלוני בע"מ')
+    assert "8125-09-24" not in out  # case number is gone
+    assert 'פלוני בע"מ' not in out  # party name is gone
+    assert g._FAKE_CASE in out  # the module's placeholder appears in the masked text
+
+
+def test_anonymize_no_identifiers_is_noop():
+    text = "כלל משפטי כללי ללא מזהים."
+    assert g.anonymize(text, case_number=None, case_name=None) == text  # nothing to mask → unchanged
+
+
+def test_anonymize_preserves_legal_substance():
+    """Masking swaps only the identifier — the rule text is untouched."""
+    text = "הכלל: מיצוי הליכים הוא תנאי-סף. (תיק 9001-01-20)"
+    out = g.anonymize(text, case_number="9001-01-20", case_name=None)
+    assert "מיצוי הליכים הוא תנאי-סף" in out  # the rule statement survives verbatim
+    assert "9001-01-20" not in out  # only the case number is removed