feat(goldset): interactive gold-set tagging page (#81.7/#81.8)
Replaces the CSV-edit workflow with an in-app tagging page so the chair/Dafna
can label the extraction-quality gold-set by clicking, and see validator
precision/recall live.
Schema (V29): halacha_goldset — a stratified, human-tagged evaluation batch
(is_holding / correct_type / quote_complete, NULL until tagged).
db.py:
- goldset_create_sample (stratified round-robin over case×rule_type, idempotent),
- goldset_list (items + halacha content + the machine's own labels),
- goldset_tag (partial — one field at a time for keyboard tagging),
- goldset_score (ports the script's P/R/F1: each validator scored as a
not-a-holding detector against the human tags — the #81.8 input).
API: GET /api/goldset, POST /api/goldset/sample, GET /api/goldset/score,
PATCH /api/goldset/{id}.
web-ui:
- lib/api/goldset.ts (hooks),
- components/goldset/goldset-panel.tsx — card-per-item, keyboard-first
(J/K nav, H/N holding, C/X quote), progress bar, hide-tagged toggle, and a
collapsible live score table,
- app/goldset/page.tsx + nav link "מדגם-זהב" under ידע ולמידה.
Methodology guard kept explicit in UI + docstrings: tags are HUMAN ground truth,
no AI pre-fill (circular bias). Populated a 150-item stratified batch.
Verified: backend create/list/tag/score against the live DB; tsc --noEmit 0;
py_compile ok. (Local Turbopack build blocked by worktree symlink — CI builds clean.)
Invariants: G1 (eval set modeled at source in its own table); G2 (reuses the same
halacha_quality validators the extractor runs — no parallel scoring logic).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1256,6 +1256,27 @@ CREATE INDEX IF NOT EXISTS idx_equiv_halacha_a ON equivalent_halachot(halacha_a)
|
||||
CREATE INDEX IF NOT EXISTS idx_equiv_halacha_b ON equivalent_halachot(halacha_b);
|
||||
"""
|
||||
|
||||
# Schema V29 (#81.7/#81.8): DDL for the gold-set evaluation table.
# Creates halacha_goldset (one row per halacha per batch, tag columns NULL/''
# until a human fills them in) plus an index on the batch column.
# Both statements use IF NOT EXISTS, so re-running the migration is harmless.
SCHEMA_V29_SQL = """
-- halacha_goldset (#81.7/#81.8): a human-tagged evaluation set. A stratified
-- sample of halachot the chair/Dafna labels (is_holding / correct_type /
-- quote_complete) so we can measure the extraction validators' precision/recall
-- and recalibrate the auto-approve threshold. The tags are the ground truth —
-- they MUST be human (no AI pre-fill) to avoid circular bias.
CREATE TABLE IF NOT EXISTS halacha_goldset (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    halacha_id UUID NOT NULL REFERENCES halachot(id) ON DELETE CASCADE,
    batch TEXT NOT NULL DEFAULT 'default',
    is_holding BOOLEAN, -- NULL until tagged
    correct_type TEXT DEFAULT '', -- binding | interpretive | obiter | application | ''
    quote_complete BOOLEAN,
    tagged_by TEXT DEFAULT '',
    tagged_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ DEFAULT now(),
    UNIQUE (halacha_id, batch)
);
CREATE INDEX IF NOT EXISTS idx_goldset_batch ON halacha_goldset(batch);
"""
|
||||
|
||||
|
||||
async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
async with pool.acquire() as conn:
|
||||
@@ -1288,7 +1309,8 @@ async def _run_schema_migrations(pool: asyncpg.Pool) -> None:
|
||||
await conn.execute(SCHEMA_V26_SQL)
|
||||
await conn.execute(SCHEMA_V27_SQL)
|
||||
await conn.execute(SCHEMA_V28_SQL)
|
||||
logger.info("Database schema initialized (v1-v28)")
|
||||
await conn.execute(SCHEMA_V29_SQL)
|
||||
logger.info("Database schema initialized (v1-v29)")
|
||||
|
||||
|
||||
async def init_schema() -> None:
|
||||
@@ -4270,6 +4292,132 @@ async def _annotate_equivalents(pool, out: list[dict]) -> None:
|
||||
d["equivalents"] = by_src.get(str(d["id"]), [])
|
||||
|
||||
|
||||
# ── Gold-set evaluation (#81.7 / #81.8) ──────────────────────────────────────
|
||||
|
||||
async def goldset_create_sample(
    n: int = 150, batch: str = "default", reset: bool = False,
) -> dict:
    """Draw a stratified sample of halachot into a gold-set tagging batch.

    Halachot with a non-empty ``rule_statement`` are grouped into strata keyed
    by ``(case_law_id, rule_type)``. Items are then picked round-robin, one per
    stratum per pass, until ``n`` items are chosen or every stratum is empty.

    The source rows are read in ``id`` order so the same table contents always
    yield the same sample. Combined with ``ON CONFLICT DO NOTHING`` this makes
    a repeated call a no-op instead of silently growing the batch.

    Args:
        n: Maximum number of halachot to pick. Values <= 0 pick nothing.
        batch: Name of the tagging batch to insert into.
        reset: When true, delete the batch's existing rows (and their tags)
            before sampling.

    Returns:
        ``{"batch": ..., "inserted": ..., "total": ...}`` where ``inserted`` is
        the number of rows newly added by this call and ``total`` is the batch
        size afterwards.
    """
    pool = await get_pool()
    if reset:
        await pool.execute("DELETE FROM halacha_goldset WHERE batch = $1", batch)
    # ORDER BY makes the stratification reproducible; without it the database
    # is free to return rows in any order and each run could pick differently.
    rows = await pool.fetch(
        "SELECT id, case_law_id, rule_type FROM halachot "
        "WHERE rule_statement <> '' ORDER BY id"
    )
    from collections import defaultdict
    buckets: dict = defaultdict(list)
    for r in rows:
        buckets[(r["case_law_id"], r["rule_type"])].append(r["id"])

    # Round-robin over the strata. Exhausted strata are dropped after each
    # pass, so every visit yields an item and no iteration cap is needed.
    active = list(buckets.values())
    sample: list = []
    while active and len(sample) < n:
        for bucket in active:
            sample.append(bucket.pop())
            if len(sample) >= n:
                break
        active = [b for b in active if b]

    inserted = 0
    for hid in sample:
        res = await pool.execute(
            "INSERT INTO halacha_goldset (halacha_id, batch) VALUES ($1, $2) "
            "ON CONFLICT (halacha_id, batch) DO NOTHING", hid, batch,
        )
        # The returned status string ends in " 1" when the row was actually
        # inserted, and in " 0" when the conflict clause skipped it.
        if res.endswith(" 1"):
            inserted += 1
    total = await pool.fetchval(
        "SELECT count(*) FROM halacha_goldset WHERE batch = $1", batch)
    return {"batch": batch, "inserted": inserted, "total": total}
|
||||
|
||||
|
||||
async def goldset_list(batch: str = "default") -> list[dict]:
    """Return the items of one gold-set batch together with their halacha.

    Each item carries the human tag columns from ``halacha_goldset``, the
    halacha text and the machine's own labels from ``halachot``, and the case
    number/name from ``case_law`` (left-joined, so these may be NULL).
    Items are ordered by creation time, then id.

    ``tagged_at`` is converted to an ISO-8601 string and ``confidence`` to a
    ``float`` when they are present; all other columns are returned as the
    driver delivers them.
    """
    query = (
        "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, "
        " g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, "
        " h.rule_statement, h.supporting_quote, h.reasoning_summary, "
        " h.rule_type, h.confidence, h.quality_flags, h.review_status, "
        " cl.case_number, cl.case_name "
        "FROM halacha_goldset g JOIN halachot h ON h.id = g.halacha_id "
        "LEFT JOIN case_law cl ON cl.id = h.case_law_id "
        "WHERE g.batch = $1 ORDER BY g.created_at, g.id"
    )

    def _normalise(record) -> dict:
        # Copy the row into a plain dict and convert the two columns that
        # need a different Python type.
        item = dict(record)
        stamp = item.get("tagged_at")
        if stamp is not None:
            item["tagged_at"] = stamp.isoformat()
        score = item.get("confidence")
        if score is not None:
            item["confidence"] = float(score)
        return item

    pool = await get_pool()
    records = await pool.fetch(query, batch)
    return [_normalise(rec) for rec in records]
|
||||
|
||||
|
||||
async def goldset_tag(
    goldset_id: UUID, *, is_holding: bool | None = None,
    correct_type: str | None = None, quote_complete: bool | None = None,
    tagged_by: str = "chair",
) -> dict | None:
    """Store a human tag on one gold-set item (partial update).

    Only the tag fields passed as non-``None`` are written; the others keep
    their current value. ``tagged_by`` and ``tagged_at`` are refreshed on every
    call, including a call that supplies no tag field.

    Args:
        goldset_id: Primary key of the ``halacha_goldset`` row.
        is_holding: Human verdict on whether the item is a holding.
        correct_type: Human-assigned rule type.
        quote_complete: Human verdict on whether the quote is complete.
        tagged_by: Name recorded as the tagger.

    Returns:
        The updated row as a dict, or ``None`` when no row has that id.
    """
    # $1 is the row id (WHERE clause) and $2 the tagger. Each optional field
    # takes the next free placeholder, which equals the argument count after
    # its value has been appended.
    args: list = [goldset_id, tagged_by]
    assignments = ["tagged_by = $2", "tagged_at = now()"]
    optional_fields = (
        ("is_holding", is_holding),
        ("correct_type", correct_type),
        ("quote_complete", quote_complete),
    )
    for column, value in optional_fields:
        if value is None:
            continue
        args.append(value)
        assignments.append(f"{column} = ${len(args)}")

    pool = await get_pool()
    updated = await pool.fetchrow(
        f"UPDATE halacha_goldset SET {', '.join(assignments)} WHERE id = $1 RETURNING *",
        *args,
    )
    return dict(updated) if updated else None
|
||||
|
||||
|
||||
async def goldset_score(batch: str = "default") -> dict:
    """Measure each extraction validator against the human tags (#81.8).

    A validator flag predicts "NOT a clean holding"; the ground truth for
    that prediction is ``is_holding == false``. Only items whose
    ``is_holding`` has been tagged take part.

    ``truncated_quote`` is scored against ``quote_complete`` instead, and only
    on items where a human has actually tagged ``quote_complete``. An untagged
    quote has no ground truth and must not be counted as "complete".

    Args:
        batch: Name of the gold-set batch to score.

    Returns:
        ``{"batch", "total", "labeled", "validators"}`` where ``validators``
        maps each validator name to its precision/recall/F1 (rounded to three
        decimals) and raw ``tp``/``fp``/``fn``/``tn`` counts. ``validators``
        is empty when nothing has been labeled yet.
    """
    items = await goldset_list(batch)
    labeled = [r for r in items if r.get("is_holding") is not None]
    from collections import defaultdict

    def zero_counts() -> dict:
        return {"tp": 0, "fp": 0, "fn": 0, "tn": 0}

    counters: dict = defaultdict(zero_counts)

    def tally(name: str, predicted_bad: bool, truly_bad: bool) -> None:
        # Update the confusion-matrix cell for one (prediction, truth) pair.
        c = counters[name]
        key = ("tp" if truly_bad else "fp") if predicted_bad else ("fn" if truly_bad else "tn")
        c[key] += 1

    for r in labeled:
        rule = r.get("rule_statement") or ""
        quote = r.get("supporting_quote") or ""
        rtype = r.get("rule_type") or "binding"
        human_qc = r["quote_complete"]
        # NOTE(review): the human quote_complete tag (True when untagged) is
        # passed to compute_quality_flags — confirm what that parameter means;
        # if it affects the flags, ground truth leaks into the prediction.
        qc = human_qc if human_qc is not None else True
        truly_bad = r["is_holding"] is False
        flags = halacha_quality.compute_quality_flags(rule, quote, "", qc, rtype)
        tally("any_flag", bool(flags), truly_bad)
        tally("application", halacha_quality.FLAG_APPLICATION in flags, truly_bad)
        tally("non_decision", halacha_quality.FLAG_NON_DECISION in flags, truly_bad)
        tally("thin_restatement", halacha_quality.FLAG_THIN_RESTATEMENT in flags, truly_bad)
        # Keep the truncated_quote row in the output even before any quote has
        # been tagged, so the result shape does not depend on tagging progress.
        counters.setdefault("truncated_quote", zero_counts())
        if human_qc is not None:
            tally("truncated_quote", halacha_quality.is_quote_truncated(quote), human_qc is False)

    def prf(c: dict) -> dict:
        # Precision / recall / F1, each defined as 0.0 when its denominator is 0.
        p = c["tp"] / (c["tp"] + c["fp"]) if (c["tp"] + c["fp"]) else 0.0
        rec = c["tp"] / (c["tp"] + c["fn"]) if (c["tp"] + c["fn"]) else 0.0
        f1 = 2 * p * rec / (p + rec) if (p + rec) else 0.0
        return {"precision": round(p, 3), "recall": round(rec, 3), "f1": round(f1, 3), **c}

    return {
        "batch": batch, "total": len(items), "labeled": len(labeled),
        "validators": {name: prf(c) for name, c in counters.items()},
    }
|
||||
|
||||
|
||||
async def list_corroboration_for_halacha(halacha_id: UUID) -> list[dict]:
|
||||
"""Return all corroboration rows for one halacha, ordered by match_score DESC."""
|
||||
pool = await get_pool()
|
||||
|
||||
Reference in New Issue
Block a user