feat(goldset): interactive gold-set tagging page (#81.7/#81.8)

Replaces the CSV-edit workflow with an in-app tagging page so the chair/Dafna can label the extraction-quality gold-set by clicking, and see validator precision/recall live. Schema (V29): halacha_goldset — a stratified, human-tagged evaluation batch (is_holding / correct_type / quote_complete, NULL until tagged). db.py: - goldset_create_sample (stratified round-robin over case×rule_type, idempotent), - goldset_list (items + halacha content + the machine's own labels), - goldset_tag (partial — one field at a time for keyboard tagging), - goldset_score (ports the script's P/R/F1: each validator scored as a not-a-holding detector against the human tags — the #81.8 input). API: GET /api/goldset, POST /api/goldset/sample, GET /api/goldset/score, PATCH /api/goldset/{id}. web-ui: - lib/api/goldset.ts (hooks), - components/goldset/goldset-panel.tsx — card-per-item, keyboard-first (J/K nav, H/N holding, C/X quote), progress bar, hide-tagged toggle, and a collapsible live score table, - app/goldset/page.tsx + nav link "מדגם-זהב" under ידע ולמידה. Methodology guard kept explicit in UI + docstrings: tags are HUMAN ground truth, no AI pre-fill (circular bias). Populated a 150-item stratified batch. Verified: backend create/list/tag/score against the live DB; tsc --noEmit 0; py_compile ok. (Local Turbopack build blocked by worktree symlink — CI builds clean.) Invariants: G1 (eval set modeled at source in its own table); G2 (reuses the same halacha_quality validators the extractor runs — no parallel scoring logic). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 21:52:05 +00:00
parent 9bd247c421
commit ac279220c4
6 changed files with 632 additions and 1 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1256,6 +1256,27 @@ CREATE INDEX IF NOT EXISTS idx_equiv_halacha_a ON equivalent_halachot(halacha_a)
 CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b);
 """

+# Schema migration V29: creates the halacha_goldset table and its batch index.
+# Every statement uses IF NOT EXISTS, so executing this string again is a no-op.
+# One row is one halacha placed in one named batch (UNIQUE (halacha_id, batch));
+# the three tag columns have no NOT NULL constraint, so a freshly sampled row
+# starts untagged.
+SCHEMA_V29_SQL = """
+-- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified
+-- sample of halachot the chair/Dafna labels (is_holding / correct_type /
+-- quote_complete) so we can measure the extraction validators' precision/recall
+-- and recalibrate the auto-approve threshold. The tags are the ground truth —
+-- they MUST be human (no AI pre-fill) to avoid circular bias.
+CREATE TABLE IF NOT EXISTS halacha_goldset (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+    halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE,
+    batch TEXT NOT NULL DEFAULT 'default',
+    is_holding BOOLEAN,            -- NULL until tagged
+    correct_type TEXT DEFAULT '',  -- binding | interpretive | obiter | application | ''
+    quote_complete BOOLEAN,
+    tagged_by TEXT DEFAULT '',
+    tagged_at TIMESTAMPTZ,
+    created_at TIMESTAMPTZ DEFAULT now(),
+    UNIQUE (halacha_id, batch)
+);
+CREATE INDEX IF NOT EXISTS idx_goldset_batch ON halacha_goldset(batch);
+"""
+

 async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
    async with pool.acquire() as conn:
@@ -1288,7 +1309,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
        await conn.execute(SCHEMA_V26_SQL)
        await conn.execute(SCHEMA_V27_SQL)
        await conn.execute(SCHEMA_V28_SQL)
-    logger.info("Database schema initialized (v1-v28)")
+        await conn.execute(SCHEMA_V29_SQL)
+    logger.info("Database schema initialized (v1-v29)")


 async def init_schema() -> None:
@@ -4270,6 +4292,132 @@ async def _annotate_equivalents(pool, out: list[dict]) -> None:
        d["equivalents"] = by_src.get(str(d["id"]), [])


+# ── Gold-set evaluation (#81.7 / #81.8) ──────────────────────────────────────
+
+async def goldset_create_sample(
+    n: int = 150, batch: str = "default", reset: bool = False,
+) -> dict:
+    """Populate a gold-set tagging batch with a stratified sample of halachot.
+
+    Halachot with a non-empty ``rule_statement`` are grouped into strata keyed
+    by ``(case_law_id, rule_type)`` and drawn round-robin, one item per stratum
+    per pass, until ``n`` items are chosen or every stratum is exhausted.
+
+    Candidates are read in ``id`` order so the same table contents always
+    yield the same sample; re-running against unchanged data therefore hits
+    ``ON CONFLICT DO NOTHING`` for every row and inserts nothing. If halachot
+    were added or removed in between, a re-run may still select (and insert)
+    different rows.
+
+    Args:
+        n: Maximum number of halachot to draw. Values <= 0 draw nothing.
+        batch: Name of the tagging batch to fill.
+        reset: When true, delete the batch's existing rows (and their tags)
+            before sampling.
+
+    Returns:
+        ``{"batch": ..., "inserted": <rows added by this call>,
+        "total": <rows now in the batch>}``.
+    """
+    from collections import defaultdict, deque
+
+    pool = await get_pool()
+    if reset:
+        await pool.execute("DELETE FROM halacha_goldset WHERE batch = $1", batch)
+    # ORDER BY makes stratum order and within-stratum order reproducible;
+    # without it the sample depends on whatever order the server returns rows.
+    rows = await pool.fetch(
+        "SELECT id, case_law_id, rule_type FROM halachot "
+        "WHERE rule_statement <> '' ORDER BY id"
+    )
+    strata: dict = defaultdict(list)
+    for r in rows:
+        strata[(r["case_law_id"], r["rule_type"])].append(r["id"])
+
+    # Round-robin over the strata. An exhausted stratum is dropped from the
+    # rotation instead of being revisited, so the loop needs no iteration cap
+    # and cannot stop short while halachot are still available.
+    pending = deque(strata.values())
+    sample: list = []
+    while pending and len(sample) < n:
+        stratum = pending.popleft()
+        sample.append(stratum.pop())
+        if stratum:
+            pending.append(stratum)
+
+    inserted = 0
+    for hid in sample:
+        res = await pool.execute(
+            "INSERT INTO halacha_goldset (halacha_id, batch) VALUES ($1, $2) "
+            "ON CONFLICT (halacha_id, batch) DO NOTHING", hid, batch,
+        )
+        # The command status ends in " 1" only when the row was really
+        # inserted; a conflict that was skipped reports 0 rows.
+        if res.endswith(" 1"):
+            inserted += 1
+    total = await pool.fetchval(
+        "SELECT count(*) FROM halacha_goldset WHERE batch = $1", batch)
+    return {"batch": batch, "inserted": inserted, "total": total}
+
+
+async def goldset_list(batch: str = "default") -> list[dict]:
+    """Return every gold-set row of ``batch`` as a plain dict.
+
+    Each dict carries the human tag columns from ``halacha_goldset``, the
+    halacha's own content and machine-assigned fields from ``halachot``, and
+    the case number/name from ``case_law`` (LEFT JOIN, so those two may be
+    None). Rows come back ordered by ``created_at`` then ``id``.
+
+    ``tagged_at`` is converted to an ISO-8601 string and ``confidence`` to a
+    float when they are not None; all other values are passed through as
+    returned by the driver.
+    """
+
+    def _plain(record) -> dict:
+        # Convert the two values that get normalised; leave the rest as-is.
+        item = dict(record)
+        stamp = item.get("tagged_at")
+        if stamp is not None:
+            item["tagged_at"] = stamp.isoformat()
+        score = item.get("confidence")
+        if score is not None:
+            item["confidence"] = float(score)
+        return item
+
+    query = (
+        "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, "
+        "       g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, "
+        "       h.rule_statement, h.supporting_quote, h.reasoning_summary, "
+        "       h.rule_type, h.confidence, h.quality_flags, h.review_status, "
+        "       cl.case_number, cl.case_name "
+        "FROM halacha_goldset g JOIN halachot h ON h.id = g.halacha_id "
+        "LEFT JOIN case_law cl ON cl.id = h.case_law_id "
+        "WHERE g.batch = $1 ORDER BY g.created_at, g.id"
+    )
+    pool = await get_pool()
+    records = await pool.fetch(query, batch)
+    return [_plain(record) for record in records]
+
+
+async def goldset_tag(
+    goldset_id: UUID, *, is_holding: bool | None = None,
+    correct_type: str | None = None, quote_complete: bool | None = None,
+    tagged_by: str = "chair",
+) -> dict | None:
+    """Apply a partial human tag to one gold-set row.
+
+    Only the label fields passed as non-None are written, so a field can be
+    set on its own but cannot be reset to NULL through this function.
+    ``tagged_by`` and ``tagged_at`` are refreshed on every call, including a
+    call that supplies no label field.
+
+    Returns:
+        The updated row as a dict, or None when no row has ``goldset_id``.
+    """
+    pool = await get_pool()
+    # $1 is the row id and $2 the tagger; label placeholders follow from $3.
+    args: list = [goldset_id, tagged_by]
+    assignments = ["tagged_by = $2", "tagged_at = now()"]
+    labels = (
+        ("is_holding", is_holding),
+        ("correct_type", correct_type),
+        ("quote_complete", quote_complete),
+    )
+    for column, value in labels:
+        if value is None:
+            continue
+        args.append(value)
+        # The placeholder index is the value's 1-based position in ``args``.
+        assignments.append(f"{column} = ${len(args)}")
+    query = f"UPDATE halacha_goldset SET {', '.join(assignments)} WHERE id = $1 RETURNING *"
+    record = await pool.fetchrow(query, *args)
+    if not record:
+        return None
+    return dict(record)
+
+
+async def goldset_score(batch: str = "default") -> dict:
+    """Measure each extraction validator against the human tags (#81.8).
+
+    Every validator is scored as a detector of "NOT a clean holding": a raised
+    flag is a positive prediction and the ground truth is
+    ``is_holding == false``. Only rows whose ``is_holding`` has been tagged
+    take part.
+
+    ``truncated_quote`` is the exception: it is scored against
+    ``quote_complete`` and only on rows where a human actually set that field.
+    Tagging is partial, so a row can have ``is_holding`` set while
+    ``quote_complete`` is still NULL; counting such a row as "quote complete"
+    would turn missing labels into ground truth and distort precision/recall.
+
+    Returns:
+        ``{"batch", "total", "labeled", "validators"}`` where ``validators``
+        maps each validator name to its precision/recall/f1 (rounded to three
+        places) plus the raw tp/fp/fn/tn counts. ``validators`` is empty when
+        nothing in the batch has been labeled yet.
+    """
+    items = await goldset_list(batch)
+    labeled = [r for r in items if r.get("is_holding") is not None]
+
+    validator_names = (
+        "any_flag", "application", "non_decision",
+        "thin_restatement", "truncated_quote",
+    )
+    # Seed every validator up front so each one is reported (possibly with
+    # all-zero counts) as soon as at least one row is labeled.
+    counters: dict = (
+        {name: {"tp": 0, "fp": 0, "fn": 0, "tn": 0} for name in validator_names}
+        if labeled else {}
+    )
+
+    def tally(name: str, predicted_bad: bool, truly_bad: bool) -> None:
+        # Confusion-matrix cell for one (prediction, truth) pair.
+        if predicted_bad:
+            key = "tp" if truly_bad else "fp"
+        else:
+            key = "fn" if truly_bad else "tn"
+        counters[name][key] += 1
+
+    for r in labeled:
+        rule = r.get("rule_statement") or ""
+        quote = r.get("supporting_quote") or ""
+        rtype = r.get("rule_type") or "binding"
+        quote_label = r["quote_complete"]
+        # The flag computation still receives True when the quote is untagged,
+        # exactly as before; only the truncated_quote ground truth changes.
+        qc = True if quote_label is None else quote_label
+        truly_bad = r["is_holding"] is False
+        flags = halacha_quality.compute_quality_flags(rule, quote, "", qc, rtype)
+        tally("any_flag", bool(flags), truly_bad)
+        tally("application", halacha_quality.FLAG_APPLICATION in flags, truly_bad)
+        tally("non_decision", halacha_quality.FLAG_NON_DECISION in flags, truly_bad)
+        tally("thin_restatement", halacha_quality.FLAG_THIN_RESTATEMENT in flags, truly_bad)
+        if quote_label is not None:
+            tally(
+                "truncated_quote",
+                halacha_quality.is_quote_truncated(quote),
+                quote_label is False,
+            )
+
+    def prf(c: dict) -> dict:
+        # Each ratio falls back to 0.0 when its denominator is zero.
+        p = c["tp"] / (c["tp"] + c["fp"]) if (c["tp"] + c["fp"]) else 0.0
+        rec = c["tp"] / (c["tp"] + c["fn"]) if (c["tp"] + c["fn"]) else 0.0
+        f1 = 2 * p * rec / (p + rec) if (p + rec) else 0.0
+        return {"precision": round(p, 3), "recall": round(rec, 3), "f1": round(f1, 3), **c}
+
+    return {
+        "batch": batch, "total": len(items), "labeled": len(labeled),
+        "validators": {name: prf(c) for name, c in counters.items()},
+    }
+
+
 async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]:
    """Return all corroboration rows for one halacha, ordered by match_score DESC."""
    pool = await get_pool()