From 0e35060d3dbe0e0e17df2193aa6696c2fbda5421 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sun, 7 Jun 2026 14:24:35 +0000 Subject: [PATCH] =?UTF-8?q?feat(goldset):=20AI=20second-opinion=20per=20it?= =?UTF-8?q?em=20(QA=20aid)=20=E2=80=94=20compare=20vs=20human=20tag?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The chair wanted an independent recommendation beside each tag, to reconsider his own judgments. Adds a NON-ground-truth AI second-opinion: - schema: halacha_goldset.ai_is_holding / ai_correct_type / ai_rationale / ai_generated_at (additive). - db.goldset_set_ai_recommendation + goldset_list now returns the ai_* fields. - scripts/goldset_ai_recommend.py — local claude_session judges is_holding + type + a one-line rationale per item, INDEPENDENTLY (own legal rubric). Independent of the rule-based validators #81.8 measures → no circularity. Never auto-applied; QA aid only. - web-ui: each card shows "🤖 המלצת AI: הלכה/לא · type" + rationale and an agreement/disagreement chip vs the human tag (amber on disagree); a "⚠ אי-הסכמות AI (N)" filter to review only the conflicts. Methodology note kept explicit: the human stays the ground truth; the AI is a prompt to reconsider, not to copy. Verified: tsc --noEmit 0; generator stores recs and flags disagreements with existing human tags. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- mcp-server/src/legal_mcp/services/db.py | 25 +++++ scripts/SCRIPTS.md | 3 +- scripts/goldset_ai_recommend.py | 100 ++++++++++++++++++ .../src/components/goldset/goldset-panel.tsx | 54 +++++++++- web-ui/src/lib/api/goldset.ts | 5 + 5 files changed, 184 insertions(+), 3 deletions(-) create mode 100644 scripts/goldset_ai_recommend.py diff --git a/mcp-server/src/legal_mcp/services/db.py b/mcp-server/src/legal_mcp/services/db.py index 2f6369f..56c9309 100644 --- a/mcp-server/src/legal_mcp/services/db.py +++ b/mcp-server/src/legal_mcp/services/db.py @@ -1275,6 +1275,15 @@ CREATE TABLE IF NOT EXISTS halacha_goldset ( UNIQUE (halacha_id, batch) ); CREATE INDEX IF NOT EXISTS idx_goldset_batch ON halacha_goldset(batch); + +-- AI second-opinion (a QA aid, NOT ground truth): an INDEPENDENT local-LLM +-- judgment shown beside the human tag so the chair can spot disagreements and +-- reconsider. Independent of the rule-based validators that #81.8 measures, so +-- no circularity. Generated locally (claude_session); never auto-applied. 
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_is_holding BOOLEAN; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_correct_type TEXT DEFAULT ''; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_rationale TEXT DEFAULT ''; +ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ; """ @@ -4338,6 +4347,7 @@ async def goldset_list(batch: str = "default") -> list[dict]: rows = await pool.fetch( "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, " " g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, " + " g.ai_is_holding, g.ai_correct_type, g.ai_rationale, g.ai_generated_at, " " h.rule_statement, h.supporting_quote, h.reasoning_summary, " " h.rule_type, h.confidence, h.quality_flags, h.review_status, " " cl.case_number, cl.case_name, cl.source_type " @@ -4350,12 +4360,27 @@ async def goldset_list(batch: str = "default") -> list[dict]: d = dict(r) if d.get("tagged_at") is not None: d["tagged_at"] = d["tagged_at"].isoformat() + if d.get("ai_generated_at") is not None: + d["ai_generated_at"] = d["ai_generated_at"].isoformat() if d.get("confidence") is not None: d["confidence"] = float(d["confidence"]) out.append(d) return out +async def goldset_set_ai_recommendation( + goldset_id: UUID, *, ai_is_holding: bool | None, + ai_correct_type: str = "", ai_rationale: str = "", +) -> None: + """Store the independent AI second-opinion for a gold-set item (QA aid).""" + pool = await get_pool() + await pool.execute( + "UPDATE halacha_goldset SET ai_is_holding = $2, ai_correct_type = $3, " + "ai_rationale = $4, ai_generated_at = now() WHERE id = $1", + goldset_id, ai_is_holding, ai_correct_type, ai_rationale, + ) + + async def goldset_tag( goldset_id: UUID, *, is_holding: bool | None = None, correct_type: str | None = None, quote_complete: bool | None = None, diff --git a/scripts/SCRIPTS.md b/scripts/SCRIPTS.md index 1b4c16f..27a8784 100644 --- a/scripts/SCRIPTS.md +++ b/scripts/SCRIPTS.md @@ -38,7 +38,8 @@ | 
`rechunk_legacy_precedents.py` | python | **#57** โ€” re-chunk + re-embed ืคืกื™ืงื” ืฉื”ื•ื˜ืžืขื” ืœืคื ื™ ืชื™ืงื•ืŸ ื”-chunker (#55). ื‘ื•ื—ืจ ื›ืœ `case_law` ืขื chunk ื–ืขื™ืจ (`length(trim(content))<50` โ€” ื˜ื‘ื™ืขืช-ื”ืืฆื‘ืข ืฉืœ ื”-chunker ื”ื™ืฉืŸ) ื•ืžืจื™ืฅ `ingest.reindex_case_law` (re-chunk+re-embed ืž-`full_text` ืฉืžื•ืจ ื‘ืœื‘ื“ โ€” ืœืœื re-OCR/LLM, feedback_no_reocr_retrofit; idempotent DELETE-then-INSERT). idempotent ื‘ืจืžืช-ื”ื‘ืื˜ืฅ' (ืฉื•ืื‘ ืžื—ื“ืฉ ืืช ื”ืกื˜ ื”ืžื•ืฉืคืข ื‘ื›ืœ ืจื™ืฆื”). ื“ื’ืœ `--limit N`. ืจืฅ ืขื venv ืฉืœ mcp-server (`cd mcp-server && .venv/bin/python ../scripts/rechunk_legacy_precedents.py`) | ื—ื“-ืคืขืžื™ โ€” ืžื™ื’ืจืฆื™ื™ืช-ื ืชื•ื ื™ื ืฉืœ ืคืกื™ืงื” legacy (ืชื•ืงืŸ 2026-06-03) | | `backfill_nevo_preamble.py` | python | **#86.2** โ€” ืžื™ื’ืจืฆื™ื™ืช-ื ืชื•ื ื™ื: ื—ื™ืชื•ืš preamble/ืจืฆื™ื• ืฉืœ ื ื‘ื• ืฉื“ืœืฃ ืœืคืกื™ืงื” ืฉื”ื•ื˜ืžืขื” ืœืคื ื™ ืชื™ืงื•ืŸ #86.1. ืžืืชืจ ื›ืœ `case_law` ืฉ-`strip_nevo_preamble(full_text)` ืขื“ื™ื™ืŸ ืžืงืฆืจ (ื“ืœื™ืคื” ื”ื™ืกื˜ื•ืจื™ืช), ื•ืžื‘ืฆืข: (1) ืœื›ื™ื“ืช ื”-ืžื™ื ื™-ืจืฆื™ื• ืœ-`case_law.nevo_ratio` (gold-set ืœ-#86.3); (2) ืฉื›ืชื•ื‘ `full_text` ื”ื—ืชื•ืš + ื—ื™ืฉื•ื‘-ืžื—ื“ืฉ ืฉืœ `content_hash`; (3) `reindex_case_law` (re-chunk+embed, ืœืœื re-OCR/LLM); (4) **ืกื™ืžื•ืŸ (ืœื ืžื—ื™ืงื”)** ื”ืœื›ื•ืช ืฉ-`supporting_quote` ืฉืœื”ืŸ ื‘ืชื•ืš ื”-preamble ืฉื”ื•ืกืจ โ†’ `pending_review` + quality_flag `nevo_preamble_leak`. **ืฉื•ืžืจ-ื‘ื˜ื™ื—ื•ืช:** ืฉื•ืจื•ืช ืขื keep%<`--min-keep` (ื‘ืจื™ืจืช-ืžื—ื“ืœ 60) ืžื•ื—ืจื’ื•ืช ืž-`--apply` ื›ื—ืฉื“ over-strip (ืืœื ืื `--include-suspicious`). **dry-run ื›ื‘ืจื™ืจืช-ืžื—ื“ืœ**; `--apply` ื›ื•ืชื‘ backup JSON + manifest CSV ืœ-`data/audit/` ืชื—ื™ืœื”. idempotent. ืจืฅ ืขื venv ืฉืœ mcp-server. 
**chair-gated** (ืœืืžืช manifest ืœืคื ื™ apply) | ืžื™ื’ืจืฆื™ื™ืช-ื ืชื•ื ื™ื โ€” dry-run ื‘ื•ืฆืข (19 ืคืกืงื™ื, 27 ื”ืœื›ื•ืช ืžื–ื•ื”ืžื•ืช); apply ืžืžืชื™ืŸ ืœืื™ืฉื•ืจ | | `nevo_ratio_benchmark.py` | python | **#86.3** โ€” ืžื“ื™ื“ืช ืื™ื›ื•ืช ื—ื™ืœื•ืฅ-ื”ืœื›ื•ืช ืžื•ืœ ื”-ืžื™ื ื™-ืจืฆื™ื• ืฉืœ ื ื‘ื• (gold-set ืžืงืฆื•ืขื™ ื—ื™ื ืžื™). ืœื›ืœ ืคืกืง ืขื `nevo_ratio` (ืื• ื ื’ื–ืจ ืž-`full_text` ืื ื˜ืจื ื‘ื•ืฆืข backfill): LLM-judge ืžืงื•ืžื™ (`claude_session`, ืืคืก ืขืœื•ืช) ืžืžืคื” ืกืžื ื˜ื™ืช ืืช ื”ืœื›ื•ืช-ื”ืžืขืจื›ืช ืžื•ืœ ื”ืœื›ื•ืช-ื ื‘ื• ื•ืžืคื™ืง **recall** (ื›ื™ืกื•ื™ ื”ืœื›ื•ืช-ื ื‘ื•), **precision** (ืื—ื•ื– ื”ืœื›ื•ืชื™ื ื• ื”ืžืžื•ืคื•ืช), **granularity** (ื™ื—ืก ืคื™ืจื•ืง โ€” ืื™ืชื•ืช over-extraction ืœ-#81.5). `--case ` / `--all [--limit N]` / `--model` / `--out`. ื›ื•ืชื‘ CSV ืœ-`data/audit/`. ืจืฅ ืขื venv ืฉืœ mcp-server (ื“ื•ืจืฉ Claude CLI ืžืงื•ืžื™). ืื•ืžืช ืขืœ ื‘ื’"ืฅ 1764/05: recall 0.875, precision 1.0, granularity 1.75x | ื™ื“ื ื™ โ€” ืžื“ื™ื“ืช-ืื™ื›ื•ืช (CI/ad-hoc) | -| `halacha_goldset.py` | python | **#81.7** โ€” ื”ืืจื ืก gold-set ืœืื™ื›ื•ืช ื—ื™ืœื•ืฅ-ื”ืœื›ื•ืช. `export --n N` ืžื™ื™ืฆื ืžื“ื’ื ืžืจื•ื‘ื“ (ืœืคื™ precedentร—rule_type) ืœ-CSV ืขื ืขืžื•ื“ื•ืช-ืชื™ื•ื’ ืจื™ืงื•ืช (`is_holding`/`correct_type`/`quote_complete`) ืœืชื™ื•ื’ ื™ื“ื ื™ (ื—ื™ื™ื/ื“ืคื ื”). `score --in ` ืงื•ืจื ืืช ื”-CSV ื”ืžืชื•ื™ื’ ื•ืžื•ื“ื“ ื›ืœ ื•ืœื™ื“ื˜ื•ืจ (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) ืžื•ืœ ืืžืช-ื”ืžื™ื“ื” ื”ืื ื•ืฉื™ืช: P/R/F1 + confusion. ื‘ืกื™ืก ืœ-#81.8 (ื›ื™ื•ืœ ืกืฃ ื”ืื™ืฉื•ืจ). ืžื™ื™ื‘ื ืืช ืื•ืชื ื•ืœื™ื“ื˜ื•ืจื™ื ืฉื”-extractor ืžืจื™ืฅ. ืจืฅ ืขื venv ืฉืœ mcp-server | ื™ื“ื ื™ โ€” exportโ†’ืชื™ื•ื’โ†’score | +| `halacha_goldset.py` | python | **#81.7** โ€” ื”ืืจื ืก gold-set ืœืื™ื›ื•ืช ื—ื™ืœื•ืฅ-ื”ืœื›ื•ืช. 
`export --n N` ืžื™ื™ืฆื ืžื“ื’ื ืžืจื•ื‘ื“ (ืœืคื™ precedentร—rule_type) ืœ-CSV ืขื ืขืžื•ื“ื•ืช-ืชื™ื•ื’ ืจื™ืงื•ืช (`is_holding`/`correct_type`/`quote_complete`) ืœืชื™ื•ื’ ื™ื“ื ื™ (ื—ื™ื™ื/ื“ืคื ื”). `score --in ` ืงื•ืจื ืืช ื”-CSV ื”ืžืชื•ื™ื’ ื•ืžื•ื“ื“ ื›ืœ ื•ืœื™ื“ื˜ื•ืจ (`compute_quality_flags`/`is_fact_dependent`/`is_quote_truncated`/`is_thin_restatement`) ืžื•ืœ ืืžืช-ื”ืžื™ื“ื” ื”ืื ื•ืฉื™ืช: P/R/F1 + confusion. ื‘ืกื™ืก ืœ-#81.8 (ื›ื™ื•ืœ ืกืฃ ื”ืื™ืฉื•ืจ). ืžื™ื™ื‘ื ืืช ืื•ืชื ื•ืœื™ื“ื˜ื•ืจื™ื ืฉื”-extractor ืžืจื™ืฅ. ืจืฅ ืขื venv ืฉืœ mcp-server. **ื”ืขืจื”:** ืงื™ื™ื ื’ื ื“ืฃ-ืชื™ื•ื’ ืื™ื ื˜ืจืืงื˜ื™ื‘ื™ DB-backed (`/goldset`) โ€” ื–ื” ื”-CSV-fallback | ื™ื“ื ื™ โ€” exportโ†’ืชื™ื•ื’โ†’score | +| `goldset_ai_recommend.py` | python | **#81.7 QA** โ€” ืžื™ื™ืฆืจ **ื—ื•ื•ืช-ื“ืขืช-AI ืฉื ื™ื™ื”** (claude ืžืงื•ืžื™, ืืคืก ืขืœื•ืช) ืœื›ืœ ืคืจื™ื˜ ื‘-`halacha_goldset`: `is_holding`+`type`+ื ื™ืžื•ืง, ื ืฉืžืจ ื‘-`ai_*` ื•ืžื•ืฆื’ ื‘ื“ืฃ ืœืฆื“ ื”ืชื™ื•ื’ ื”ืื ื•ืฉื™ ืœื–ื™ื”ื•ื™ ืื™-ื”ืกื›ืžื•ืช. **ืขืฆืžืื™** ืžื”ื•ื•ืœื™ื“ื˜ื•ืจื™ื ืฉื ืžื“ื“ื™ื (ืื™ืŸ ืžืขื’ืœื™ื•ืช) ื•**ืœื** ืžื•ื—ืœ ืื•ื˜ื•ืžื˜ื™ืช. `--force` (ื—ื™ื“ื•ืฉ)/`--limit N`. **ื—ื•ื‘ื” ืžืงื•ืžื™** (claude_session). | ื™ื“ื ื™ โ€” ืœืื—ืจ ื™ืฆื™ืจืช/ื”ืจื—ื‘ืช batch | | `halacha_batch_reconcile.py` | python | **#82.7** โ€” dedup ื—ื•ืฆื”-ืคืกืงื™ื offline (ืฉืžืจื ื™, **dry-run ื‘ืœื‘ื“**). dedup-on-insert ืžืฉื•ื•ื” ืจืง ืชื•ืš-ืคืกืง; ื›ืืŸ ืกืฃ ืžื—ืžื™ืจ (cosine โ‰ฅ0.95, `--cosine`) ื•ืœื-ื”ืจืกื ื™: ืžืืชืจ ื–ื•ื’ื•ืช ื”ืœื›ื•ืช near-duplicate ื‘ื™ืŸ ืคืกืงื™ื ืฉื•ื ื™ื (pgvector `<=>` exact) ืขื ืื™ืชื•ืช ืœืงืกื™ืงืœื™ (Jaccard/Levenshtein) ื•ืžื“ื•ื•ื— ืœ-CSV ื‘-`data/audit/` ืœืกืงื™ืจืช ื”ื™ื•"ืจ. ืœื ืžื“ืœื’/ืžืžื–ื’/ืžื•ื—ืง. `--include-pending`. 
**`--link`** ืจื•ืฉื ืืช ื”ื–ื•ื’ื•ืช ืฉื ืžืฆืื• ื›-`equivalent_halachot` (parallel authority, #84.2 โ€” ืงื™ืฉื•ืจ-ืžืงื‘ื™ืœ ื‘ืจืžืช-ื”ืœื›ื”, **ืœื** ืฆื™ื˜ื•ื˜; idempotent, ืœื-ื”ืจืกื ื™). ืจืฅ ืขื venv ืฉืœ mcp-server. ืื•ืžืช: 800 ื”ืœื›ื•ืช โ†’ 5 ื–ื•ื’ื•ืช (ืงื•ืฉืจื•). | ื™ื“ื ื™ โ€” ื“ื•ื—-ืกืงื™ืจื” / `--link` ืœืงื™ืฉื•ืจ | | `calibrate_halacha_dedup.py` | python | **#82.1** โ€” ื›ื™ื•ืœ ืกืคื™ ื”-dedup ื”ืœืงืกื™ืงืœื™ (#82.3) ืžื•ืœ gold-set ื”ื ื™ืงื•ื™. ืงื•ืจื `halacha-cleanup-manifest-*.csv` (ื–ื•ื’ื•ืช duplicateโ†”survivor ืžืชื•ื™ื’ื™-ืื“ื), ื˜ื•ืขืŸ ื˜ืงืกื˜-survivor ืžื”-DB, ื•-sweep ืฉืœ (jaccard_min ร— levenshtein_min) ืขื P/R/F1, ืžืกืžืŸ ืืช ื ืงื•ื“ืช-ื”ืขื‘ื•ื“ื” ื”ืžื•ื’ื“ืจืช. ืื™ืžืช ืฉ-(0.55, 0.70) โ†’ **precision 1.0** (ืืคืก false-merge), recall 0.30 โ€” ืžืชืื™ื ืœืื™ืชื•ืช-ืžืฉื ื™ ืฉื—ื•ืกื auto-approve. `--manifest `. ืจืฅ ืขื venv ืฉืœ mcp-server | ื—ื“-ืคืขืžื™ โ€” ื›ื™ื•ืœ (ื‘ื•ืฆืข 2026-06-06) | | `audit_corpus_integrity.py` | python | ื‘ื“ื™ืงื” ืชืงื•ืคืชื™ืช ืฉืœ ืขืงื‘ื™ื•ืช ื”ืงื•ืจืคื•ืก โ€” 3 ื‘ื“ื™ืงื•ืช SQL read-only ืขืœ `case_law` ื•-`cases`: (A) `external_upload` ืขื prefix ืคื ื™ืžื™ `ืขืจืจ`/`ื‘ืœ"ืž`; (B) `internal_committee` ื—ืกืจ `chair_name`/`district`; (C) `cases.practice_area` ืžื—ื•ืฅ ืœ-{`rishuy_uvniya`, `betterment_levy`, `compensation_197`, `''`}. ื›ื•ืชื‘ log ืžืฆื˜ื‘ืจ ืœ-`data/logs/corpus_integrity_audit.log` ื•ื‘ืžืฆื‘ ื”ืคืจื•ืช ืฉื•ืœื— wakeup ืœ-CEO ื‘-Paperclip (best-effort, ืจืง ืื `PAPERCLIP_API_URL`+`PAPERCLIP_API_KEY` ืžื•ื’ื“ืจื™ื). ื“ื’ืœ: `--no-notify`. Idempotent, ื™ื•ืฆื 0. 
**Cron ื™ื•ืžื™ 07:00**: `0 7 * * * /home/chaim/legal-ai/mcp-server/.venv/bin/python /home/chaim/legal-ai/scripts/audit_corpus_integrity.py` | `0 7 * * *` (cron) | diff --git a/scripts/goldset_ai_recommend.py b/scripts/goldset_ai_recommend.py new file mode 100644 index 0000000..ab36fa8 --- /dev/null +++ b/scripts/goldset_ai_recommend.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""Generate the AI second-opinion for gold-set items (#81.7 QA aid). + +For each gold-set halacha, an INDEPENDENT local-LLM (claude_session, zero cost) +judges: is it a real generalizable holding, what is its correct rule_type, and a +one-line rationale. Stored in halacha_goldset.ai_* and shown beside the human +tag so the chair can spot disagreements and reconsider. + +This is a QA aid, NOT ground truth and NOT auto-applied. It is also independent +of the rule-based validators that #81.8 measures, so it doesn't bias that score. + +Must run locally (claude_session needs the local CLI โ€” not the container): + + cd ~/legal-ai/mcp-server + .venv/bin/python ../scripts/goldset_ai_recommend.py # missing only + .venv/bin/python ../scripts/goldset_ai_recommend.py --force # regenerate all + .venv/bin/python ../scripts/goldset_ai_recommend.py --limit 10 # smoke +""" +from __future__ import annotations + +import argparse +import asyncio +import sys +from uuid import UUID + +from legal_mcp.services import claude_session, db + +VALID_TYPES = {"binding", "interpretive", "obiter", "application", "procedural", "persuasive"} + +SYSTEM = ( + "ืืชื” ื‘ื•ื—ืŸ-ืื™ื›ื•ืช ืžืฉืคื˜ื™ ื”ืžืกื•ื•ื’ 'ื”ืœื›ื•ืช' ืฉื—ื•ืœืฆื• ืžื”ื—ืœื˜ื•ืช ื•ืขื“ืช-ืขืจืจ ื•ืžืคืกืงื™-ื“ื™ืŸ. 
" + "ืœื›ืœ ืคืจื™ื˜ ื”ื›ืจืข ืฉืชื™ ืฉืืœื•ืช, ื‘ืื•ืคืŸ ืขืฆืžืื™ ื•ืœืคื™ ื”ืžื”ื•ืช:\n" + "1) is_holding โ€” ื”ืื ื–ื• ื”ืœื›ื” ืืžื™ืชื™ืช ื‘ืช-ื”ื›ืœืœื” ื•ื‘ืช-ื”ืกืชืžื›ื•ืช (true), ืื• ืฉื–ื• ื™ื™ืฉื•ื " + "ืชืœื•ื™-ืขื•ื‘ื“ื•ืช / ืืžืจืช-ืื’ื‘ / ืฆื™ื˜ื•ื˜-ืขื•ื‘ื“ื” ื•ืœื ื›ืœืœ ื‘ืจ-ื”ื›ืœืœื” (false).\n" + "2) type โ€” ื”ืกื•ื’ ื”ื ื›ื•ืŸ: 'binding' (ืขื™ืงืจื•ืŸ ื”ื›ืจื—ื™ ืœื”ื›ืจืขื”), 'interpretive' (ืคืจืฉื ื•ืช " + "ื—ื•ืง/ืžื•ื ื—/ืชื›ื ื™ืช), 'procedural' (ืกื“ืจ-ื“ื™ืŸ: ืžื•ืขื“ื™ื/ืกืžื›ื•ืช/ืžื™ืฆื•ื™/ื ื˜ืœ), 'persuasive' " + "(ืืกืžื›ืชื” ืœื-ืžื—ื™ื™ื‘ืช), 'application' (ื”ื—ืœื” ืขืœ ืขื•ื‘ื“ื•ืช ื”ืชื™ืง โ€” ืœืจื•ื‘ ืœื-ื”ืœื›ื”), " + "'obiter' (ืืžืจืช-ืื’ื‘ ืฉืœื ื”ื•ื›ืจืขื” โ€” ืœื-ื”ืœื›ื”).\n" + "ืขืงื‘ื™ื•ืช: is_holding=true โ†’ binding/interpretive/procedural/persuasive; " + "is_holding=false โ†’ application/obiter.\n" + 'ื”ื—ื–ืจ JSON ื‘ืœื‘ื“: {"is_holding": true/false, "type": "<ืื—ื“ ืžื”ืฉื™ืฉื”>", ' + '"rationale": "<ืžืฉืคื˜ ืื—ื“ ืงืฆืจ ื‘ืขื‘ืจื™ืช>"}. ืœืœื markdown.' 
+) + + +def _prompt(item: dict) -> str: + src = "ืคืกืง-ื“ื™ืŸ" if item.get("source_type") == "court_ruling" else "ื”ื—ืœื˜ืช ื•ืขื“ืช-ืขืจืจ" + return ( + f"ืžืงื•ืจ: {src} ({item.get('case_number') or ''}).\n" + f"ืกื•ื’ ืฉื”ืžื›ื•ื ื” ื ืชื ื”: {item.get('rule_type')}.\n\n" + f"ื ื™ืกื•ื— ื”ื›ืœืœ:\n{item.get('rule_statement') or ''}\n\n" + f"ืฆื™ื˜ื•ื˜ ืชื•ืžืš:\n{item.get('supporting_quote') or ''}" + ) + + +async def main(args: argparse.Namespace) -> int: + items = await db.goldset_list(args.batch) + todo = [it for it in items if args.force or not it.get("ai_generated_at")] + if args.limit: + todo = todo[: args.limit] + print(f"gold-set {args.batch}: {len(items)} items, {len(todo)} to recommend", flush=True) + + ok, fail, disagree = 0, 0, 0 + for i, it in enumerate(todo, 1): + try: + v = await claude_session.query_json(_prompt(it), system=SYSTEM, effort="low") + except Exception as e: # noqa: BLE001 + fail += 1 + print(f"[{i}/{len(todo)}] {it['case_number']}: FAIL {e}", flush=True) + continue + if not isinstance(v, dict): + fail += 1 + continue + ai_hold = bool(v.get("is_holding")) + ai_type = str(v.get("type") or "").strip() + if ai_type not in VALID_TYPES: + ai_type = "" + await db.goldset_set_ai_recommendation( + UUID(str(it["id"])), ai_is_holding=ai_hold, ai_correct_type=ai_type, + ai_rationale=str(v.get("rationale") or "")[:300], + ) + ok += 1 + # note disagreements with the human tag (if tagged) + flag = "" + if it.get("is_holding") is not None and it["is_holding"] != ai_hold: + disagree += 1 + flag = " โš  DISAGREE is_holding" + print(f"[{i}/{len(todo)}] {it['case_number']}: ai={ai_hold}/{ai_type}{flag}", flush=True) + + print(f"\nDONE โ€” {ok} stored, {fail} failed, {disagree} disagree with existing human tag", + flush=True) + return 0 + + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument("--batch", default="default") + ap.add_argument("--force", action="store_true", help="regenerate even if present") + 
ap.add_argument("--limit", type=int, default=None) + sys.exit(asyncio.run(main(ap.parse_args()))) diff --git a/web-ui/src/components/goldset/goldset-panel.tsx b/web-ui/src/components/goldset/goldset-panel.tsx index d571e62..0bd2ebe 100644 --- a/web-ui/src/components/goldset/goldset-panel.tsx +++ b/web-ui/src/components/goldset/goldset-panel.tsx @@ -67,6 +67,16 @@ function isTagged(it: GoldsetItem): boolean { return it.is_holding !== null && it.quote_complete !== null && !!it.correct_type; } +// The AI second-opinion disagrees with the human tag (on is_holding or type). +function aiDisagrees(it: GoldsetItem): boolean { + if (!it.ai_generated_at) return false; + const holdDiff = it.is_holding !== null && it.ai_is_holding !== null + && it.is_holding !== it.ai_is_holding; + const typeDiff = !!it.correct_type && !!it.ai_correct_type + && it.correct_type !== it.ai_correct_type; + return holdDiff || typeDiff; +} + // โ”€โ”€โ”€ Score panel โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ function ScorePanel({ batch }: { batch: string }) { @@ -248,6 +258,36 @@ function TagCard({ “{it.supporting_quote}” + {it.ai_generated_at && (() => { + const aiType = TYPES.find((t) => t.value === it.ai_correct_type)?.label ?? it.ai_correct_type; + const holdDisagree = it.is_holding !== null && it.ai_is_holding !== null + && it.is_holding !== it.ai_is_holding; + const typeDisagree = !!it.correct_type && !!it.ai_correct_type + && it.correct_type !== it.ai_correct_type; + const anyTag = it.is_holding !== null || !!it.correct_type; + return ( +
+
+ ๐Ÿค– ื”ืžืœืฆืช AI: + {it.ai_is_holding ? "ื”ืœื›ื”" : "ืœื ื”ืœื›ื”"} + {aiType && ยท {aiType}} + {anyTag && ( + + {holdDisagree ? "โš  ื—ื•ืœืง ืขืœ 'ื”ืœื›ื”/ืœื'" + : typeDisagree ? "โš  ื—ื•ืœืง ืขืœ ื”ืกื•ื’" + : "โœ“ ืžืกื›ื™ื ืื™ืชืš"} + + )} +
+ {it.ai_rationale &&
{it.ai_rationale}
} +
+ ); + })()} +
{/* is_holding */}
@@ -308,11 +348,13 @@ export function GoldsetPanel() { const createSample = useCreateGoldsetSample(batch); const [focusedId, setFocusedId] = useState(null); const [hideTagged, setHideTagged] = useState(false); + const [disagreeOnly, setDisagreeOnly] = useState(false); const [sourceFilter, setSourceFilter] = useState<"all" | "court_ruling" | "appeals_committee">("all"); const items = useMemo(() => data?.items ?? [], [data]); const taggedCount = items.filter(isTagged).length; + const disagreeCount = items.filter(aiDisagrees).length; const sourceCounts = useMemo(() => ({ court_ruling: items.filter((i) => i.source_type === "court_ruling").length, appeals_committee: items.filter((i) => i.source_type === "appeals_committee").length, @@ -321,11 +363,12 @@ export function GoldsetPanel() { let v = items; if (sourceFilter !== "all") v = v.filter((i) => i.source_type === sourceFilter); if (hideTagged) v = v.filter((i) => !isTagged(i)); + if (disagreeOnly) v = v.filter(aiDisagrees); // group-sort: ื›ืœ ืคืกืงื™-ื”ื“ื™ืŸ ื™ื—ื“, ื•ืื– ื›ืœ ื”ื—ืœื˜ื•ืช ื•ืขื“ืช-ื”ืขืจืจ (ื”ืคืจื“ื” ื‘ืจื•ืจื”). const order = (s: string | null) => s === "court_ruling" ? 0 : s === "appeals_committee" ? 1 : 2; return [...v].sort((a, b) => order(a.source_type) - order(b.source_type)); - }, [items, hideTagged, sourceFilter]); + }, [items, hideTagged, sourceFilter, disagreeOnly]); const focused = focusedId ? visible.find((i) => i.id === focusedId) ?? null : null; @@ -424,7 +467,14 @@ export function GoldsetPanel() { {" "}ยท ื”ืœื›ื” H / ืœื N {" "}ยท ืฆื™ื˜ื•ื˜ ืฉืœื C / ืงื˜ื•ืข X - + )} +
diff --git a/web-ui/src/lib/api/goldset.ts b/web-ui/src/lib/api/goldset.ts index 060b03c..976254b 100644 --- a/web-ui/src/lib/api/goldset.ts +++ b/web-ui/src/lib/api/goldset.ts @@ -29,6 +29,11 @@ export type GoldsetItem = { case_number: string | null; case_name: string | null; source_type: string | null; // 'court_ruling' | 'appeals_committee' | '' + // AI second-opinion (QA aid โ€” independent, not ground truth, not auto-applied) + ai_is_holding: boolean | null; + ai_correct_type: string; + ai_rationale: string; + ai_generated_at: string | null; }; export type GoldsetScore = {