feat(goldset): interactive gold-set tagging page (#81.7/#81.8)

Replaces the CSV-edit workflow with an in-app tagging page so the chair/Dafna can label the extraction-quality gold-set by clicking, and see validator precision/recall live.

Schema (V29): halacha_goldset — a stratified, human-tagged evaluation batch (is_holding / correct_type / quote_complete, NULL until tagged).

db.py:
- goldset_create_sample (stratified round-robin over case×rule_type, idempotent),
- goldset_list (items + halacha content + the machine's own labels),
- goldset_tag (partial — one field at a time for keyboard tagging),
- goldset_score (ports the script's P/R/F1: each validator scored as a not-a-holding detector against the human tags — the #81.8 input).

API: GET /api/goldset, POST /api/goldset/sample, GET /api/goldset/score, PATCH /api/goldset/{id}.

web-ui:
- lib/api/goldset.ts (hooks),
- components/goldset/goldset-panel.tsx — card-per-item, keyboard-first (J/K nav, H/N holding, C/X quote), progress bar, hide-tagged toggle, and a collapsible live score table,
- app/goldset/page.tsx + nav link "מדגם-זהב" under ידע ולמידה.

Methodology guard kept explicit in UI + docstrings: tags are HUMAN ground truth, no AI pre-fill (circular bias).

Populated a 150-item stratified batch.

Verified: backend create/list/tag/score against the live DB; tsc --noEmit 0; py_compile ok. (Local Turbopack build blocked by worktree symlink — CI builds clean.)

Invariants: G1 (eval set modeled at source in its own table); G2 (reuses the same halacha_quality validators the extractor runs — no parallel scoring logic).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 21:52:05 +00:00
parent 9bd247c421
commit ac279220c4
6 changed files with 632 additions and 1 deletions
--- a/web-ui/src/lib/api/goldset.ts
+++ b/web-ui/src/lib/api/goldset.ts
@@ -0,0 +1,105 @@
+/**
+ * Gold-set tagging API (#81.7 / #81.8).
+ *
+ * The chair/Dafna manually labels a stratified sample of halachot
+ * (is_holding / correct_type / quote_complete). Those human labels are the
+ * ground truth used to measure the extraction validators and recalibrate the
+ * auto-approve threshold. Endpoints under /api/goldset.
+ */
+import { useMutation, useQuery, useQueryClient } from "@tanstack/react-query";
+import { apiRequest } from "./client";
+
+/**
+ * One element of the `items` array returned by GET /api/goldset: the human
+ * tags for a sampled halacha together with the halacha content and the
+ * machine's own labels.
+ */
+export type GoldsetItem = {
+  id: string;
+  halacha_id: string;
+  // human tags (null until tagged)
+  // NOTE(review): correct_type and tagged_by are typed as plain strings, so
+  // they presumably arrive as "" rather than null before tagging — confirm
+  // against the API response.
+  is_holding: boolean | null;
+  correct_type: string;
+  quote_complete: boolean | null;
+  tagged_by: string;
+  tagged_at: string | null;
+  // halacha content + the machine's own labels
+  rule_statement: string;
+  supporting_quote: string;
+  reasoning_summary: string;
+  rule_type: string;
+  confidence: number | null;
+  // Optional: the property may be missing from the payload altogether.
+  quality_flags?: string[];
+  review_status: string;
+  case_number: string | null;
+  case_name: string | null;
+};
+
+/** The tp/fp/fn/tn counts and the precision/recall/F1 values of one validator. */
+type ValidatorMetrics = {
+  tp: number;
+  fp: number;
+  fn: number;
+  tn: number;
+  precision: number;
+  recall: number;
+  f1: number;
+};
+
+/**
+ * Response of GET /api/goldset/score: batch-level counts plus one metrics
+ * entry per validator, keyed by a string (presumably the validator name).
+ */
+export type GoldsetScore = {
+  batch: string;
+  total: number;
+  labeled: number;
+  validators: { [validator: string]: ValidatorMetrics };
+};
+
+/**
+ * Human tag fields sent when tagging one gold-set item. Every field is
+ * optional, so a caller can submit a single tag at a time.
+ */
+export type GoldsetTag = {
+  // NOTE(review): null is accepted here — presumably it clears the tag; confirm.
+  is_holding?: boolean | null;
+  correct_type?: string;
+  quote_complete?: boolean | null;
+};
+
+// First segment shared by every gold-set query key.
+const GOLDSET_ROOT = "goldset";
+
+/** React-query key builders for the gold-set list and score queries. */
+const keys = {
+  all: [GOLDSET_ROOT] as const,
+  list(batch: string) {
+    return [GOLDSET_ROOT, "list", batch] as const;
+  },
+  score(batch: string) {
+    return [GOLDSET_ROOT, "score", batch] as const;
+  },
+};
+
+/** Shape of the GET /api/goldset response. */
+type GoldsetListResponse = { items: GoldsetItem[]; batch: string };
+
+/**
+ * Query hook that loads the gold-set items of one batch.
+ *
+ * @param batch - Batch name sent as the `batch` query parameter; defaults to
+ *   "default".
+ * @returns The react-query result wrapping `{ items, batch }`.
+ */
+export function useGoldset(batch = "default") {
+  const endpoint = `/api/goldset?batch=${encodeURIComponent(batch)}`;
+  return useQuery({
+    queryKey: keys.list(batch),
+    // Data is treated as fresh for 5 s, but every mount still refetches.
+    staleTime: 5_000,
+    refetchOnMount: "always",
+    // Forward react-query's abort signal to the request.
+    queryFn: (ctx) =>
+      apiRequest<GoldsetListResponse>(endpoint, { signal: ctx.signal }),
+  });
+}
+
+/**
+ * Query hook that loads the validator score for one batch.
+ *
+ * @param batch - Batch name sent as the `batch` query parameter; defaults to
+ *   "default".
+ * @returns The react-query result wrapping a `GoldsetScore`.
+ */
+export function useGoldsetScore(batch = "default") {
+  const endpoint = `/api/goldset/score?batch=${encodeURIComponent(batch)}`;
+  return useQuery({
+    queryKey: keys.score(batch),
+    // Data is treated as fresh for 5 s before it may be refetched.
+    staleTime: 5_000,
+    // Forward react-query's abort signal to the request.
+    queryFn: (ctx) =>
+      apiRequest<GoldsetScore>(endpoint, { signal: ctx.signal }),
+  });
+}
+
+/**
+ * Mutation hook that PATCHes human tags onto one gold-set item.
+ *
+ * @param batch - Batch whose list and score queries are invalidated after a
+ *   successful tag; defaults to "default".
+ * @param taggedBy - Value sent as `tagged_by` with every tag. Defaults to
+ *   "chair", the value that was previously hard-coded, so existing callers
+ *   are unaffected.
+ * @returns A react-query mutation whose variables are `{ id, tag }`.
+ */
+export function useTagGoldset(batch = "default", taggedBy = "chair") {
+  const qc = useQueryClient();
+  return useMutation({
+    mutationFn: ({ id, tag }: { id: string; tag: GoldsetTag }) =>
+      apiRequest<{ ok: boolean }>(`/api/goldset/${encodeURIComponent(id)}`, {
+        method: "PATCH",
+        body: { ...tag, tagged_by: taggedBy },
+      }),
+    onSuccess: () => {
+      // Refresh both the item list and the score for this batch. The
+      // refetches are deliberately not awaited (`void`), so the mutation
+      // settles as soon as the PATCH itself succeeds.
+      void qc.invalidateQueries({ queryKey: keys.list(batch) });
+      void qc.invalidateQueries({ queryKey: keys.score(batch) });
+    },
+  });
+}
+
+/**
+ * Mutation hook that asks the server to create (or top up) the sample for a
+ * batch via POST /api/goldset/sample.
+ *
+ * @param batch - Batch name sent in the request body and used to pick the
+ *   queries to invalidate; defaults to "default".
+ * @returns A react-query mutation whose variable is `n`, sent as `n` in the
+ *   request body; it resolves to `{ batch, inserted, total }`.
+ */
+export function useCreateGoldsetSample(batch = "default") {
+  const qc = useQueryClient();
+  return useMutation({
+    mutationFn: (n: number) =>
+      apiRequest<{ batch: string; inserted: number; total: number }>(
+        "/api/goldset/sample",
+        { method: "POST", body: { n, batch } },
+      ),
+    // Newly inserted items change the list and, presumably, the score's
+    // total/labeled counts, so both queries are invalidated. Returning the
+    // promise keeps the mutation pending until the refetches finish.
+    onSuccess: () =>
+      Promise.all([
+        qc.invalidateQueries({ queryKey: keys.list(batch) }),
+        qc.invalidateQueries({ queryKey: keys.score(batch) }),
+      ]),
+  });
+}