# legal-ai/scripts/goldset_independent_judge.py

#!/usr/bin/env python3
"""Independent second-judge for gold-set rule_ROLE — breaks the AI-anchoring loop.

The gold-set human role tags were made WHILE seeing a Claude AI recommendation,
so human↔AI agreement (~100%) is contaminated by anchoring — it is not an
independent measure of role-classification accuracy. This script adds a THIRD,
genuinely independent rater: a DIFFERENT model (DeepSeek, OpenAI-compatible API)
classifies the rule ROLE blind — it sees neither the human tag nor the first AI's
answer. Comparing deepseek↔human against ai↔human tells us whether the labels
are trustworthy or just anchored.

Zero tagging from the chair. Read-only on the gold-set.

    cd ~/legal-ai/mcp-server
    .venv/bin/python ../scripts/goldset_independent_judge.py            # all tagged
    .venv/bin/python ../scripts/goldset_independent_judge.py --limit 10 # smoke
    .venv/bin/python ../scripts/goldset_independent_judge.py --model deepseek-reasoner
"""
from __future__ import annotations

import argparse
import asyncio
import json
import os
import sys
from collections import Counter
from pathlib import Path

import httpx

from legal_mcp.services import db

# The closed set of rule-role labels. A judge reply or a human tag is accepted
# only if it is one of these values.
ROLES = {"holding", "interpretive", "procedural", "application", "obiter"}

# System prompt sent to the judge model (Hebrew). It asks the model to classify
# a rule extracted from Israeli case law by rule TYPE only (explicitly not by
# binding/persuasive weight), gives a one-line definition for each of the five
# roles above, and demands a bare JSON reply of the form {"role": "<value>"}
# with no markdown and no explanation.
SYSTEM = (
    "אתה משפטן בכיר המסווג 'הלכות' שחולצו מפסיקה ישראלית לפי **סוג הכלל** בלבד. "
    "אל תסווג מחייב/משכנע (דרגת-המחייבות אינה רלוונטית). בחר ערך אחד:\n"
    "- holding — עיקרון מהותי שהיה הכרחי להכרעה (ratio; מבחן Wambaugh).\n"
    "- interpretive — פרשנות הוראת-חוק/מונח/תכנית.\n"
    "- procedural — סדר-דין: סמכות/מועדים/זכות-עמידה/מיצוי/נטל.\n"
    "- application — החלה תלוית-עובדות על נסיבות התיק (לרוב לא-הלכה בת-הכללה).\n"
    "- obiter — אמרת-אגב שלא הוכרעה.\n"
    'החזר JSON בלבד: {"role":"<אחד מהחמישה>"}. ללא markdown, ללא הסבר.'
)


def _deepseek_key() -> str:
    for p in (Path.home() / ".hermes/profiles/deepseek/.env", Path.home() / ".env"):
        if p.exists():
            for line in p.read_text().splitlines():
                if line.startswith("DEEPSEEK_API_KEY="):
                    return line.split("=", 1)[1].strip()
    return os.environ.get("DEEPSEEK_API_KEY", "")


def _user_prompt(it: dict) -> str:
    src = "פסק-דין" if it.get("source_type") == "court_ruling" else "החלטת ועדת-ערר"
    return (
        f"מקור: {src}.\n\n"
        f"ניסוח הכלל:\n{it.get('rule_statement') or ''}\n\n"
        f"היגיון:\n{it.get('reasoning_summary') or ''}\n\n"
        f"ציטוט תומך:\n{it.get('supporting_quote') or ''}"
    )


def _parse_role(content: object) -> str | None:
    """Return the role named in the model's JSON reply, or None when the reply is unusable."""
    if not isinstance(content, str) or not content.strip():
        # JSON mode can yield empty/None content (e.g. a truncated reply).
        return None
    try:
        payload = json.loads(content)
    except json.JSONDecodeError:
        return None
    if not isinstance(payload, dict):
        return None
    role = str(payload.get("role", "")).strip().lower()
    return role if role in ROLES else None


async def _judge(client: httpx.AsyncClient, key: str, model: str, it: dict) -> str | None:
    """Ask the DeepSeek model to classify one item's rule role, blind.

    Only ``SYSTEM`` and ``_user_prompt(it)`` are sent, so the model sees neither
    the human tag nor the first AI's answer.

    Args:
        client: Shared HTTP client used for the POST.
        key: DeepSeek API key, sent as a Bearer token.
        model: Model name forwarded to the API.
        it: Gold-set item; read only through ``_user_prompt``.

    Returns:
        One of ``ROLES``, or ``None`` when the call failed or the reply did not
        contain a valid role. Never raises: this is a best-effort judge and a
        failed item is simply left out of the agreement statistics.
    """
    # NOTE(review): per DeepSeek's API docs the reasoning model's max_tokens
    # budget also covers its chain-of-thought, so 60 tokens would truncate the
    # reply before any JSON is produced. The chat model keeps the small budget.
    token_budget = 4096 if "reasoner" in model else 60
    body = {
        "model": model,
        "messages": [
            {"role": "system", "content": SYSTEM},
            {"role": "user", "content": _user_prompt(it)},
        ],
        "temperature": 0,
        "max_tokens": token_budget,
        "response_format": {"type": "json_object"},
    }
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            r = await client.post(
                "https://api.deepseek.com/v1/chat/completions",
                headers={"Authorization": f"Bearer {key}", "Content-Type": "application/json"},
                json=body,
                timeout=90,
            )
            r.raise_for_status()
            content = r.json()["choices"][0]["message"]["content"]
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            # Rate limiting and server-side errors are worth another try.
            retryable = status == 429 or status >= 500
            detail = f"HTTP {status}: {e.response.text[:200]}"
        except httpx.TransportError as e:
            # Timeouts, connection resets, etc.
            retryable = True
            detail = f"{type(e).__name__}: {e}"
        except Exception as e:  # noqa: BLE001 — best-effort: malformed body etc.
            print(f"  ! judge error: {e}", flush=True)
            return None
        else:
            role = _parse_role(content)
            if role is None:
                print(f"  ! judge error: unusable reply {str(content)[:80]!r}", flush=True)
            return role

        if retryable and attempt < attempts:
            # Simple exponential backoff so one transient failure does not
            # drop the item from the agreement statistics.
            await asyncio.sleep(2 ** attempt)
            continue
        print(f"  ! judge error: {detail}", flush=True)
        return None
    return None


def _agree(rows: list[dict], a: str, b: str) -> tuple[int, int, float]:
    """Return (matches, comparable, percent) — percent is 0..100."""
    valid = [r for r in rows if r.get(a) and r.get(b)]
    ok = sum(1 for r in valid if r[a] == r[b])
    return ok, len(valid), (100.0 * ok / len(valid) if valid else 0.0)


async def main(args: argparse.Namespace) -> int:
    """Run the blind DeepSeek judge over human-tagged gold-set items and report agreement.

    Args:
        args: Parsed CLI namespace; reads ``batch``, ``model``, ``limit``
            (0 or negative = no limit) and ``concurrency`` (values below 1 are
            treated as 1).

    Returns:
        Process exit code: 0 on success; 1 when no API key is found, or when
        there were tagged items but not a single one could be judged.

    Side effects:
        Prints the agreement report to stdout and writes every row (judged or
        not) to ``/tmp/goldset_judge_raw.json``.
    """
    key = _deepseek_key()
    if not key:
        print("no DEEPSEEK_API_KEY found", flush=True)
        return 1

    items = await db.goldset_list(args.batch)
    # Only items with a HUMAN role tag (the ground truth we are testing). The
    # tag is lower-cased here exactly as it is when the row is built below, so
    # a tag such as "Holding" is not silently excluded.
    tagged = [it for it in items if (it.get("correct_type") or "").strip().lower() in ROLES]
    if args.limit and args.limit > 0:
        tagged = tagged[: args.limit]
    print(f"independent judge ({args.model}) on {len(tagged)} human-tagged items\n", flush=True)

    # A zero step would crash range() and a negative one would skip every
    # item, so clamp the chunk size to at least one request at a time.
    workers = max(1, args.concurrency)
    rows: list[dict] = []
    async with httpx.AsyncClient() as client:
        async def one(it: dict) -> dict:
            ds = await _judge(client, key, args.model, it)
            return {
                "human": (it.get("correct_type") or "").strip().lower(),
                "ai": (it.get("ai_correct_type") or "").strip().lower(),
                "deepseek": ds,
                "machine": (it.get("rule_type") or "").strip().lower(),
                "source": it.get("source_type"),
            }

        # Each chunk holds at most `workers` requests, which is what bounds
        # concurrency. gather() returns results in input order, so `rows`
        # (and the raw dump) follow the gold-set order deterministically.
        for start in range(0, len(tagged), workers):
            chunk = tagged[start : start + workers]
            rows.extend(await asyncio.gather(*(one(it) for it in chunk)))
            print(f"  …{len(rows)}/{len(tagged)}", flush=True)

    judged = [r for r in rows if r["deepseek"]]
    unjudged = len(rows) - len(judged)
    print(f"\n=== INTER-RATER AGREEMENT on rule_role ({len(judged)} judged) ===")
    if unjudged:
        print(f"  ! {unjudged} item(s) got no valid judgment and are excluded below")
    # All three pairings are computed over the same judged subset so the
    # anchored baseline and the independent figure are directly comparable.
    print("  ai↔human       (anchored baseline):   %d/%d = %.0f%%" % _agree(judged, "ai", "human"))
    print("  deepseek↔human (INDEPENDENT — key):    %d/%d = %.0f%%" % _agree(judged, "deepseek", "human"))
    print("  deepseek↔ai    (cross-model):          %d/%d = %.0f%%" % _agree(judged, "deepseek", "ai"))
    una = [r for r in judged if r["human"] == r["ai"] == r["deepseek"]]
    print(f"  3-way unanimous (human=ai=deepseek):   {len(una)}/{len(judged)} = {len(una)/max(1,len(judged)):.0%}")

    print("\n=== where the INDEPENDENT judge disagrees with the human (the real signal) ===")
    mm = Counter((r["human"], r["deepseek"]) for r in judged if r["human"] != r["deepseek"])
    for (h, d), n in mm.most_common():
        print(f"  human={h} → deepseek={d}: {n}")

    # COARSE axis: is this a generalizable rule at all? (holding/interpretive/
    # procedural collapse to one class) vs the non-generalizable markers
    # (application/obiter). If fine-grained agreement is low but coarse is high,
    # the disagreement is a cosmetic sub-distinction, not a meaningful one.
    generalizable = {"holding", "interpretive", "procedural"}
    non_generalizable = {"application", "obiter"}

    def coarse(v: str | None) -> str | None:
        if v in generalizable:
            return "rule"
        if v in non_generalizable:
            return "nonrule"
        return None

    for r in judged:
        r["human_c"], r["deepseek_c"], r["ai_c"] = coarse(r["human"]), coarse(r["deepseek"]), coarse(r["ai"])
    print("\n=== COARSE agreement (generalizable-rule vs application/obiter) ===")
    print("  deepseek↔human (coarse):   %d/%d = %.0f%%" % _agree(judged, "deepseek_c", "human_c"))
    print("  ai↔human       (coarse):   %d/%d = %.0f%%" % _agree(judged, "ai_c", "human_c"))

    # ensure_ascii=False can emit non-ASCII text, so pin the file encoding
    # instead of relying on the locale default.
    Path("/tmp/goldset_judge_raw.json").write_text(
        json.dumps(rows, ensure_ascii=False, indent=1), encoding="utf-8"
    )
    print("\nraw judgments → /tmp/goldset_judge_raw.json")
    # Items existed but every judge call failed: the report above is empty,
    # so do not signal success.
    if tagged and not judged:
        return 1
    return 0


if __name__ == "__main__":
    # CLI entry point: the module docstring doubles as the --help description
    # (kept verbatim by RawDescriptionHelpFormatter), and main()'s return
    # value becomes the process exit status.
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    option_table = (
        ("--batch", {"default": "default"}),
        ("--model", {"default": "deepseek-chat", "help": "deepseek-chat | deepseek-reasoner"}),
        ("--limit", {"type": int, "default": 0}),
        ("--concurrency", {"type": int, "default": 6}),
    )
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    cli_args = parser.parse_args()
    exit_status = asyncio.run(main(cli_args))
    sys.exit(exit_status)