feat(goldset): interactive gold-set tagging page (#81.7/#81.8)

Replaces the CSV-edit workflow with an in-app tagging page so the chair/Dafna
can label the extraction-quality gold-set by clicking, and see validator
precision/recall live.

Schema (V29): halacha_goldset — a stratified, human-tagged evaluation batch
(is_holding / correct_type / quote_complete, NULL until tagged).

db.py:
- goldset_create_sample (stratified round-robin over case×rule_type, idempotent),
- goldset_list (items + halacha content + the machine's own labels),
- goldset_tag (partial — one field at a time for keyboard tagging),
- goldset_score (ports the script's P/R/F1: each validator scored as a
  not-a-holding detector against the human tags — the #81.8 input).

API: GET /api/goldset, POST /api/goldset/sample, GET /api/goldset/score,
PATCH /api/goldset/{id}.

web-ui:
- lib/api/goldset.ts (hooks),
- components/goldset/goldset-panel.tsx — card-per-item, keyboard-first
  (J/K nav, H/N holding, C/X quote), progress bar, hide-tagged toggle, and a
  collapsible live score table,
- app/goldset/page.tsx + nav link "מדגם-זהב" under ידע ולמידה.

Methodology guard kept explicit in UI + docstrings: tags are HUMAN ground truth,
no AI pre-fill (circular bias). Populated a 150-item stratified batch.

Verified: backend create/list/tag/score against the live DB; tsc --noEmit 0;
py_compile ok. (Local Turbopack build blocked by worktree symlink — CI builds clean.)

Invariants: G1 (eval set modeled at source in its own table); G2 (reuses the same
halacha_quality validators the extractor runs — no parallel scoring logic).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-06 21:52:05 +00:00
parent 9bd247c421
commit ac279220c4
6 changed files with 632 additions and 1 deletions

View File

@@ -0,0 +1,105 @@
/**
* Gold-set tagging API (#81.7 / #81.8).
*
* The chair/Dafna manually labels a stratified sample of halachot
* (is_holding / correct_type / quote_complete). Those human labels are the
* ground truth used to measure the extraction validators and recalibrate the
* auto-approve threshold. Endpoints under /api/goldset.
*/
import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
import { apiRequest } from "./client";
/**
 * One gold-set entry: a sampled halacha together with its human tags and the
 * machine's own labels. This is the element type of the `items` array that
 * `useGoldset` fetches from `/api/goldset`.
 *
 * NOTE(review): `correct_type` and `tagged_by` are typed as plain `string`
 * even though the human tags are described as "null until tagged" — confirm
 * the backend never returns null for them (e.g. sends "" instead).
 */
export type GoldsetItem = {
// Identifier of the gold-set row (presumably the id passed to useTagGoldset — confirm).
id: string;
// Identifier of the halacha this row samples.
halacha_id: string;
// human tags (null until tagged)
is_holding: boolean | null;
correct_type: string;
quote_complete: boolean | null;
// Who tagged the row and when; `tagged_at` is null while untagged.
tagged_by: string;
tagged_at: string | null;
// halacha content + the machine's own labels
rule_statement: string;
supporting_quote: string;
reasoning_summary: string;
rule_type: string;
// May be null; range/scale is not visible from this file.
confidence: number | null;
// Optional: the property may be missing from the response entirely.
quality_flags?: string[];
review_status: string;
// Case metadata; either field may be null.
case_number: string | null;
case_name: string | null;
};
/**
 * Response shape of `GET /api/goldset/score` as requested by `useGoldsetScore`:
 * batch-level counts plus one metrics record per validator.
 */
export type GoldsetScore = {
// Batch the score was computed for.
batch: string;
// Item counts: all items vs. items that already carry human labels
// (presumably counted within `batch` — confirm against the backend).
total: number;
labeled: number;
// Keyed by string — presumably the validator name (confirm against the backend).
// tp/fp/fn/tn look like confusion-matrix counts behind precision/recall/f1.
validators: Record<
string,
{ precision: number; recall: number; f1: number; tp: number; fp: number; fn: number; tn: number }
>;
};
/**
 * Partial human tag sent as the PATCH body by `useTagGoldset`.
 * Every field is optional, so a single field can be submitted on its own;
 * the boolean fields also accept an explicit null.
 */
export type GoldsetTag = {
is_holding?: boolean | null;
correct_type?: string;
quote_complete?: boolean | null;
};
// Shared first segment of every gold-set query key.
const goldsetRootKey = ["goldset"] as const;

/**
 * React Query cache keys for the gold-set endpoints.
 *
 * `list` and `score` append the resource name and the batch to the shared
 * root, so each batch is cached and invalidated independently.
 */
const keys = {
  all: goldsetRootKey,
  list(batch: string) {
    return [...goldsetRootKey, "list", batch] as const;
  },
  score(batch: string) {
    return [...goldsetRootKey, "score", batch] as const;
  },
};
/**
 * Query hook that loads the gold-set items of one batch.
 *
 * Requests `GET /api/goldset?batch=…` (batch name URL-encoded) and forwards
 * React Query's abort signal to `apiRequest`. Results are considered fresh
 * for 5 seconds, and the query always refetches when a component mounts.
 *
 * @param batch - Batch name; defaults to "default".
 * @returns The React Query result for `{ items, batch }`.
 */
export function useGoldset(batch = "default") {
  return useQuery({
    queryKey: keys.list(batch),
    staleTime: 5_000,
    refetchOnMount: "always",
    queryFn: (context) => {
      const endpoint = "/api/goldset?batch=" + encodeURIComponent(batch);
      return apiRequest<{ items: GoldsetItem[]; batch: string }>(endpoint, {
        signal: context.signal,
      });
    },
  });
}
/**
 * Query hook that loads the validator score table of one batch.
 *
 * Requests `GET /api/goldset/score?batch=…` (batch name URL-encoded) and
 * forwards React Query's abort signal to `apiRequest`. Results are
 * considered fresh for 5 seconds.
 *
 * @param batch - Batch name; defaults to "default".
 * @returns The React Query result for a `GoldsetScore`.
 */
export function useGoldsetScore(batch = "default") {
  return useQuery({
    queryKey: keys.score(batch),
    staleTime: 5_000,
    queryFn: (context) => {
      const endpoint = "/api/goldset/score?batch=" + encodeURIComponent(batch);
      return apiRequest<GoldsetScore>(endpoint, { signal: context.signal });
    },
  });
}
/**
 * Mutation hook that saves human tags for one gold-set item.
 *
 * Sends `PATCH /api/goldset/{id}` (id URL-encoded) with the given partial tag
 * plus `tagged_by`, then invalidates the batch's cached item list and score
 * so both refetch.
 *
 * @param batch - Batch whose list/score caches are invalidated on success;
 *   defaults to "default".
 * @param taggedBy - Tagger name recorded with every tag. Defaults to "chair",
 *   the value that used to be hard-coded; pass another name when someone
 *   else is tagging so their labels are not attributed to the chair.
 * @returns The mutation; call `mutate({ id, tag })`.
 */
export function useTagGoldset(batch = "default", taggedBy = "chair") {
  const qc = useQueryClient();
  return useMutation({
    mutationFn: ({ id, tag }: { id: string; tag: GoldsetTag }) =>
      apiRequest<{ ok: boolean }>(`/api/goldset/${encodeURIComponent(id)}`, {
        method: "PATCH",
        // tagged_by is spread last so a stray key inside `tag` cannot override it.
        body: { ...tag, tagged_by: taggedBy },
      }),
    onSuccess: () => {
      // Fire-and-forget: the mutation settles without waiting for the refetches.
      void qc.invalidateQueries({ queryKey: keys.list(batch) });
      void qc.invalidateQueries({ queryKey: keys.score(batch) });
    },
  });
}
/**
 * Mutation hook that asks the backend to create (or top up) the gold-set
 * sample of a batch.
 *
 * Sends `POST /api/goldset/sample` with `{ n, batch }`; the response reports
 * `{ batch, inserted, total }`. On success both the item list and the score
 * of the batch are invalidated: the score response carries `total`/`labeled`
 * counts, so it goes stale whenever items are inserted, not only when tags
 * change.
 *
 * @param batch - Batch to sample into; defaults to "default".
 * @returns The mutation; call `mutate(n)` where `n` is the number sent as the
 *   request's `n` field (presumably the requested sample size — confirm
 *   against the backend).
 */
export function useCreateGoldsetSample(batch = "default") {
  const qc = useQueryClient();
  return useMutation({
    mutationFn: (n: number) =>
      apiRequest<{ batch: string; inserted: number; total: number }>(
        "/api/goldset/sample",
        { method: "POST", body: { n, batch } },
      ),
    // Returning the promise keeps the mutation pending until both
    // invalidations have been processed, as the original did for the list.
    onSuccess: () =>
      Promise.all([
        qc.invalidateQueries({ queryKey: keys.list(batch) }),
        qc.invalidateQueries({ queryKey: keys.score(batch) }),
      ]),
  });
}