feat(goldset): interactive gold-set tagging page (#81.7/#81.8)

Replaces the CSV-edit workflow with an in-app tagging page so the chair/Dafna
can label the extraction-quality gold-set by clicking, and see validator
precision/recall live.

Schema (V29): halacha_goldset — a stratified, human-tagged evaluation batch
(is_holding / correct_type / quote_complete, NULL until tagged).

db.py:
- goldset_create_sample (stratified round-robin over case×rule_type, idempotent),
- goldset_list (items + halacha content + the machine's own labels),
- goldset_tag (partial — one field at a time for keyboard tagging),
- goldset_score (ports the script's P/R/F1: each validator scored as a
  not-a-holding detector against the human tags — the #81.8 input).

API: GET /api/goldset, POST /api/goldset/sample, GET /api/goldset/score,
PATCH /api/goldset/{id}.

web-ui:
- lib/api/goldset.ts (hooks),
- components/goldset/goldset-panel.tsx — card-per-item, keyboard-first
  (J/K nav, H/N holding, C/X quote), progress bar, hide-tagged toggle, and a
  collapsible live score table,
- app/goldset/page.tsx + nav link "מדגם-זהב" under ידע ולמידה.

Methodology guard kept explicit in UI + docstrings: tags are HUMAN ground truth,
no AI pre-fill (circular bias). Populated a 150-item stratified batch.

Verified: backend create/list/tag/score against the live DB; `tsc --noEmit` passes (0);
`py_compile` ok. (The local Turbopack build is blocked by a worktree symlink — CI builds clean.)

Invariants: G1 (eval set modeled at source in its own table); G2 (reuses the same
halacha_quality validators the extractor runs — no parallel scoring logic).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-06 21:52:05 +00:00
parent 9bd247c421
commit ac279220c4
6 changed files with 632 additions and 1 deletions

View File

@@ -1256,6 +1256,27 @@ CREATE INDEX IF NOT EXISTS idx_equiv_halacha_a ON equivalent_halachot(halacha_a)
CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b);
"""
# Schema v29 — gold-set evaluation table (#81.7/#81.8).
# Creates halacha_goldset with one row per (halacha_id, batch) pair (enforced
# by the UNIQUE constraint) plus an index on batch. is_holding, quote_complete
# and tagged_at have no DEFAULT, so they are NULL until a tag is written.
# Both statements use IF NOT EXISTS, so re-executing the script is harmless.
SCHEMA_V29_SQL = """
-- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified
-- sample of halachot the chair/Dafna labels (is_holding / correct_type /
-- quote_complete) so we can measure the extraction validators' precision/recall
-- and recalibrate the auto-approve threshold. The tags are the ground truth —
-- they MUST be human (no AI pre-fill) to avoid circular bias.
CREATE TABLE IF NOT EXISTS halacha_goldset (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE,
batch TEXT NOT NULL DEFAULT 'default',
is_holding BOOLEAN, -- NULL until tagged
correct_type TEXT DEFAULT '', -- binding | interpretive | obiter | application | ''
quote_complete BOOLEAN,
tagged_by TEXT DEFAULT '',
tagged_at TIMESTAMPTZ,
created_at TIMESTAMPTZ DEFAULT now(),
UNIQUE (halacha_id, batch)
);
CREATE INDEX IF NOT EXISTS idx_goldset_batch ON halacha_goldset(batch);
"""
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
async with pool.acquire() as conn:
@@ -1288,7 +1309,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
await conn.execute(SCHEMA_V26_SQL)
await conn.execute(SCHEMA_V27_SQL)
await conn.execute(SCHEMA_V28_SQL)
logger.info("Database schema initialized (v1-v28)")
await conn.execute(SCHEMA_V29_SQL)
logger.info("Database schema initialized (v1-v29)")
async def init_schema() -> None:
@@ -4270,6 +4292,132 @@ async def _annotate_equivalents(pool, out: list[dict]) -> None:
d["equivalents"] = by_src.get(str(d["id"]), [])
# ── Gold-set evaluation (#81.7 / #81.8) ──────────────────────────────────────
def _goldset_round_robin(buckets: list[list], n: int) -> list:
    """Draw up to ``n`` items by visiting ``buckets`` in turn, one item per visit.

    Each visit pops the last element of the bucket. Exhausted buckets are
    dropped after every pass, so the loop terminates on its own and never
    stops short while items remain. ``buckets`` is consumed in place.
    """
    active = [b for b in buckets if b]
    picked: list = []
    while active and len(picked) < n:
        for bucket in active:
            if len(picked) >= n:
                break
            picked.append(bucket.pop())
        active = [b for b in active if b]
    return picked


async def goldset_create_sample(
    n: int = 150, batch: str = "default", reset: bool = False,
) -> dict:
    """Fill a tagging batch with a stratified sample of halachot.

    Halachot with a non-empty ``rule_statement`` are grouped by
    ``(case_law_id, rule_type)`` and drawn round-robin across the groups until
    ``n`` items are picked or every group is exhausted.

    Args:
        n: Maximum number of halachot to draw in this call.
        batch: Name of the gold-set batch to insert into.
        reset: When true, delete the batch's existing rows (tags included)
            before sampling.

    Returns:
        ``{"batch", "inserted", "total"}`` — ``inserted`` counts the rows this
        call actually added, ``total`` the rows now in the batch.

    Rows already present for ``(halacha_id, batch)`` are left untouched
    (``ON CONFLICT DO NOTHING``). The optional delete and the inserts run in
    one transaction, so a failure cannot leave the batch wiped but unfilled.

    NOTE(review): the SELECT has no ORDER BY, so which halachot are drawn
    depends on the order rows come back; a re-run without ``reset`` may add
    different items and push ``total`` above ``n``.
    """
    from collections import defaultdict

    pool = await get_pool()
    async with pool.acquire() as conn:
        async with conn.transaction():
            if reset:
                await conn.execute(
                    "DELETE FROM halacha_goldset WHERE batch = $1", batch)
            rows = await conn.fetch(
                "SELECT id, case_law_id, rule_type FROM halachot WHERE rule_statement <> ''"
            )
            buckets: dict = defaultdict(list)
            for r in rows:
                buckets[(r["case_law_id"], r["rule_type"])].append(r["id"])
            sample = _goldset_round_robin(list(buckets.values()), n)
            inserted = 0
            for hid in sample:
                res = await conn.execute(
                    "INSERT INTO halacha_goldset (halacha_id, batch) VALUES ($1, $2) "
                    "ON CONFLICT (halacha_id, batch) DO NOTHING", hid, batch,
                )
                # Status is "INSERT 0 1" when the row was added, "INSERT 0 0"
                # when the conflict clause skipped it.
                if res.endswith(" 1"):
                    inserted += 1
            total = await conn.fetchval(
                "SELECT count(*) FROM halacha_goldset WHERE batch = $1", batch)
    return {"batch": batch, "inserted": inserted, "total": total}
async def goldset_list(batch: str = "default") -> list[dict]:
    """Return the items of ``batch`` with halacha content and machine labels.

    Each gold-set row is joined to its halacha and, when one exists, to the
    halacha's case_law row. Items are ordered by ``created_at`` then ``id``.
    In the returned dicts ``tagged_at`` is an ISO-8601 string and
    ``confidence`` a float whenever those values are not NULL.
    """
    query = (
        "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, "
        " g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, "
        " h.rule_statement, h.supporting_quote, h.reasoning_summary, "
        " h.rule_type, h.confidence, h.quality_flags, h.review_status, "
        " cl.case_number, cl.case_name "
        "FROM halacha_goldset g JOIN halachot h ON h.id = g.halacha_id "
        "LEFT JOIN case_law cl ON cl.id = h.case_law_id "
        "WHERE g.batch = $1 ORDER BY g.created_at, g.id"
    )
    pool = await get_pool()
    records = await pool.fetch(query, batch)

    def _to_plain(record) -> dict:
        # Convert the two column values that get normalised for the caller.
        item = dict(record)
        tagged_at = item.get("tagged_at")
        if tagged_at is not None:
            item["tagged_at"] = tagged_at.isoformat()
        confidence = item.get("confidence")
        if confidence is not None:
            item["confidence"] = float(confidence)
        return item

    return [_to_plain(record) for record in records]
async def goldset_tag(
    goldset_id: UUID, *, is_holding: bool | None = None,
    correct_type: str | None = None, quote_complete: bool | None = None,
    tagged_by: str = "chair",
) -> dict | None:
    """Persist a partial human tag on one gold-set row.

    Only the tag fields passed as non-``None`` are written; ``tagged_by`` and
    ``tagged_at`` are stamped on every call.

    Returns:
        The updated row as a dict, or ``None`` when no row has ``goldset_id``.
    """
    pool = await get_pool()
    # $1 is the row id (WHERE clause) and $2 the tagger; optional fields take
    # the next free placeholder in the order they are appended.
    assignments = ["tagged_by = $2", "tagged_at = now()"]
    args: list = [goldset_id, tagged_by]
    optional_fields = (
        ("is_holding", is_holding),
        ("correct_type", correct_type),
        ("quote_complete", quote_complete),
    )
    for column, value in optional_fields:
        if value is None:
            continue
        args.append(value)
        assignments.append(f"{column} = ${len(args)}")
    sql = f"UPDATE halacha_goldset SET {', '.join(assignments)} WHERE id = $1 RETURNING *"
    updated = await pool.fetchrow(sql, *args)
    return dict(updated) if updated else None
async def goldset_score(batch: str = "default") -> dict:
    """Measure each extraction validator against the human tags (#81.8).

    A validator flag predicts "NOT a clean holding"; ground truth is
    ``is_holding == false``. Only items whose ``is_holding`` has been tagged
    are scored. ``truncated_quote`` is scored against ``quote_complete`` and
    only on items where ``quote_complete`` was actually tagged — an untagged
    value is not ground truth and must not count as a "complete" quote.

    Returns:
        ``{"batch", "total", "labeled", "validators"}`` where ``validators``
        maps each validator name to its precision / recall / f1 (rounded to
        three decimals) plus the raw tp / fp / fn / tn counts. ``validators``
        is empty when no item is labeled.
    """
    items = await goldset_list(batch)
    labeled = [r for r in items if r.get("is_holding") is not None]

    validator_names = (
        "any_flag", "application", "non_decision", "thin_restatement",
        "truncated_quote",
    )
    # Seed every validator up front so all keys are reported whenever at least
    # one item is labeled, even if no quote_complete tag exists yet.
    counters: dict = (
        {name: {"tp": 0, "fp": 0, "fn": 0, "tn": 0} for name in validator_names}
        if labeled else {}
    )

    def tally(name: str, predicted_bad: bool, truly_bad: bool) -> None:
        if predicted_bad:
            key = "tp" if truly_bad else "fp"
        else:
            key = "fn" if truly_bad else "tn"
        counters[name][key] += 1

    for r in labeled:
        rule = r.get("rule_statement") or ""
        quote = r.get("supporting_quote") or ""
        rtype = r.get("rule_type") or "binding"
        human_qc = r["quote_complete"]
        qc = human_qc if human_qc is not None else True
        truly_bad = r["is_holding"] is False
        # NOTE(review): the 4th positional argument receives the human
        # quote_complete tag (True when untagged) — confirm this matches the
        # validator's parameter.
        flags = halacha_quality.compute_quality_flags(rule, quote, "", qc, rtype)
        tally("any_flag", bool(flags), truly_bad)
        tally("application", halacha_quality.FLAG_APPLICATION in flags, truly_bad)
        tally("non_decision", halacha_quality.FLAG_NON_DECISION in flags, truly_bad)
        tally("thin_restatement", halacha_quality.FLAG_THIN_RESTATEMENT in flags, truly_bad)
        if human_qc is not None:
            tally("truncated_quote", halacha_quality.is_quote_truncated(quote), human_qc is False)

    def prf(c: dict) -> dict:
        # Precision / recall / F1 with 0.0 for any undefined (zero-denominator) ratio.
        p = c["tp"] / (c["tp"] + c["fp"]) if (c["tp"] + c["fp"]) else 0.0
        rec = c["tp"] / (c["tp"] + c["fn"]) if (c["tp"] + c["fn"]) else 0.0
        f1 = 2 * p * rec / (p + rec) if (p + rec) else 0.0
        return {"precision": round(p, 3), "recall": round(rec, 3), "f1": round(f1, 3), **c}

    return {
        "batch": batch, "total": len(items), "labeled": len(labeled),
        "validators": {name: prf(c) for name, c in counters.items()},
    }
async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]:
"""Return all corroboration rows for one halacha, ordered by match_score DESC."""
pool = await get_pool()