feat(goldset): interactive gold-set tagging page (#81.7/#81.8)
Replaces the CSV-edit workflow with an in-app tagging page so the chair/Dafna
can label the extraction-quality gold-set by clicking, and see validator
precision/recall live.
Schema (V29): halacha_goldset — a stratified, human-tagged evaluation batch
(is_holding / correct_type / quote_complete, NULL until tagged).
db.py:
- goldset_create_sample (stratified round-robin over case×rule_type, idempotent),
- goldset_list (items + halacha content + the machine's own labels),
- goldset_tag (partial — one field at a time for keyboard tagging),
- goldset_score (ports the script's P/R/F1: each validator scored as a
not-a-holding detector against the human tags — the #81.8 input).
API: GET /api/goldset, POST /api/goldset/sample, GET /api/goldset/score,
PATCH /api/goldset/{id}.
web-ui:
- lib/api/goldset.ts (hooks),
- components/goldset/goldset-panel.tsx — card-per-item, keyboard-first
(J/K nav, H/N holding, C/X quote), progress bar, hide-tagged toggle, and a
collapsible live score table,
- app/goldset/page.tsx + nav link "מדגם-זהב" under ידע ולמידה.
Methodology guard kept explicit in UI + docstrings: tags are HUMAN ground truth,
no AI pre-fill (circular bias). Populated a 150-item stratified batch.
Verified: backend create/list/tag/score against the live DB; `tsc --noEmit` exits 0;
`py_compile` passes. (Local Turbopack build is blocked by a worktree symlink; the CI build is clean.)
Invariants: G1 (eval set modeled at source in its own table); G2 (reuses the same
halacha_quality validators the extractor runs — no parallel scoring logic).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
105
web-ui/src/lib/api/goldset.ts
Normal file
105
web-ui/src/lib/api/goldset.ts
Normal file
@@ -0,0 +1,105 @@
|
||||
/**
|
||||
* Gold-set tagging API (#81.7 / #81.8).
|
||||
*
|
||||
* The chair/Dafna manually labels a stratified sample of halachot
|
||||
* (is_holding / correct_type / quote_complete). Those human labels are the
|
||||
* ground truth used to measure the extraction validators and recalibrate the
|
||||
* auto-approve threshold. Endpoints under /api/goldset.
|
||||
*/
|
||||
import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
|
||||
import { apiRequest } from "./client";
|
||||
|
||||
/**
 * One gold-set entry as returned in the `items` array of the list endpoint:
 * the human tags for a sampled halacha plus the halacha's content and the
 * machine's own labels.
 */
export type GoldsetItem = {
  id: string;
  halacha_id: string;

  // ---- Human tags (null until tagged) ----
  is_holding: boolean | null;
  // NOTE(review): declared as a non-null string although the tags are described
  // as null until tagged — presumably the API sends "" when untagged; confirm.
  correct_type: string;
  quote_complete: boolean | null;
  // NOTE(review): also non-null here; verify what the API sends before tagging.
  tagged_by: string;
  tagged_at: string | null;

  // ---- Halacha content + the machine's own labels ----
  rule_statement: string;
  supporting_quote: string;
  reasoning_summary: string;
  rule_type: string;
  confidence: number | null;
  /** May be absent from the payload entirely. */
  quality_flags?: string[];
  review_status: string;
  case_number: string | null;
  case_name: string | null;
};
|
||||
|
||||
/**
 * Response of the score endpoint for one batch: overall counts plus, per
 * validator name, its precision/recall/F1 and the underlying confusion-matrix
 * counts.
 */
export type GoldsetScore = {
  batch: string;
  /** Number of items in the batch. */
  total: number;
  /** Number of items that already carry human labels. */
  labeled: number;
  /** Metrics keyed by validator name. */
  validators: Record<
    string,
    {
      precision: number;
      recall: number;
      f1: number;
      tp: number;
      fp: number;
      fn: number;
      tn: number;
    }
  >;
};
|
||||
|
||||
/**
 * Partial tag payload for a single gold-set item. Every field is optional so
 * a caller can submit one field at a time.
 */
export type GoldsetTag = {
  is_holding?: boolean | null;
  correct_type?: string;
  quote_complete?: boolean | null;
};
|
||||
|
||||
/**
 * Query-key factory for the gold-set queries. Every key starts with
 * "goldset"; list and score keys are additionally scoped by batch name.
 */
const keys = {
  all: ["goldset"] as const,
  list(batch: string) {
    return ["goldset", "list", batch] as const;
  },
  score(batch: string) {
    return ["goldset", "score", batch] as const;
  },
};
|
||||
|
||||
/**
 * Query hook that loads the gold-set items of one batch from `/api/goldset`.
 *
 * @param batch Batch name; sent URL-encoded as the `batch` query parameter.
 * @returns The query result wrapping `{ items, batch }`.
 */
export function useGoldset(batch = "default") {
  const url = `/api/goldset?batch=${encodeURIComponent(batch)}`;

  return useQuery({
    queryKey: keys.list(batch),
    // Forward the query's abort signal to the request helper.
    queryFn: (ctx) =>
      apiRequest<{ items: GoldsetItem[]; batch: string }>(url, {
        signal: ctx.signal,
      }),
    staleTime: 5_000,
    // Refetch whenever a consumer mounts, regardless of staleTime.
    refetchOnMount: "always",
  });
}
|
||||
|
||||
/**
 * Query hook that loads the validator score table of one batch from
 * `/api/goldset/score`.
 *
 * @param batch Batch name; sent URL-encoded as the `batch` query parameter.
 * @returns The query result wrapping a {@link GoldsetScore}.
 */
export function useGoldsetScore(batch = "default") {
  const url = `/api/goldset/score?batch=${encodeURIComponent(batch)}`;

  return useQuery({
    queryKey: keys.score(batch),
    // Forward the query's abort signal to the request helper.
    queryFn: (ctx) => apiRequest<GoldsetScore>(url, { signal: ctx.signal }),
    staleTime: 5_000,
  });
}
|
||||
|
||||
/**
 * Mutation hook that PATCHes a (partial) human tag onto one gold-set item.
 *
 * On success, the list and score queries of `batch` are invalidated so both
 * are refetched with the new tag.
 *
 * @param batch Batch whose cached list/score queries are invalidated.
 * @param taggedBy Identity recorded as `tagged_by` on the item. Defaults to
 *   "chair", the value that was previously hard-coded, so existing callers
 *   are unaffected; pass another name when someone else is tagging.
 * @returns A mutation whose variables are `{ id, tag }`.
 */
export function useTagGoldset(batch = "default", taggedBy = "chair") {
  const qc = useQueryClient();

  return useMutation({
    mutationFn: ({ id, tag }: { id: string; tag: GoldsetTag }) =>
      apiRequest<{ ok: boolean }>(`/api/goldset/${encodeURIComponent(id)}`, {
        method: "PATCH",
        // tagged_by goes last so it cannot be overridden by the tag payload.
        body: { ...tag, tagged_by: taggedBy },
      }),
    onSuccess: () => {
      // Deliberately not awaited: the mutation settles without waiting for
      // the refetches to finish.
      void qc.invalidateQueries({ queryKey: keys.list(batch) });
      void qc.invalidateQueries({ queryKey: keys.score(batch) });
    },
  });
}
|
||||
|
||||
/**
 * Mutation hook that asks the backend to create a gold-set sample by POSTing
 * `{ n, batch }` to `/api/goldset/sample`.
 *
 * On success, both the list and the score queries of `batch` are invalidated:
 * inserting items changes the item list and also the `total`/`labeled`
 * counts carried by the score response, so refreshing the list alone would
 * leave the score table stale.
 *
 * @param batch Batch to sample into; also selects which cached queries are
 *   invalidated.
 * @returns A mutation whose variable is `n`, the sample size sent to the API.
 */
export function useCreateGoldsetSample(batch = "default") {
  const qc = useQueryClient();

  return useMutation({
    mutationFn: (n: number) =>
      apiRequest<{ batch: string; inserted: number; total: number }>(
        "/api/goldset/sample",
        { method: "POST", body: { n, batch } },
      ),
    // Returning the promise keeps the mutation pending until the
    // invalidations settle, as the original single invalidation did.
    onSuccess: () =>
      Promise.all([
        qc.invalidateQueries({ queryKey: keys.list(batch) }),
        qc.invalidateQueries({ queryKey: keys.score(batch) }),
      ]),
  });
}
|
||||
Reference in New Issue
Block a user