feat(goldset): AI second-opinion per item (QA aid) — compare vs human tag

The chair wanted an independent recommendation beside each tag, so that he can reconsider his own judgments. This commit adds a NON-ground-truth AI second opinion:

- schema: halacha_goldset.ai_is_holding / ai_correct_type / ai_rationale / ai_generated_at (additive).
- db: adds goldset_set_ai_recommendation; goldset_list now returns the ai_* fields.
- scripts/goldset_ai_recommend.py — the local claude_session judges is_holding + type and gives a one-line rationale per item, INDEPENDENTLY (using its own legal rubric). It is independent of the rule-based validators that #81.8 measures, so there is no circularity. Never auto-applied; QA aid only.
- web-ui: each card shows "🤖 המלצת AI: הלכה/לא · type" plus the rationale and an agreement/disagreement chip vs the human tag (amber on disagree); a "⚠ אי-הסכמות AI (N)" filter shows only the conflicts for review.

Methodology note kept explicit: the human stays the ground truth; the AI is a prompt to reconsider, not to copy.

Verified: tsc --noEmit 0; the generator stores recs and flags disagreements with existing human tags.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-07 14:24:35 +00:00
parent a0c1b74c55
commit 0e35060d3d
5 changed files with 184 additions and 3 deletions
--- a/mcp-server/src/legal_mcp/services/db.py
+++ b/mcp-server/src/legal_mcp/services/db.py
@@ -1275,6 +1275,15 @@ CREATE TABLE IF NOT EXISTS halacha_goldset (
    UNIQUE (halacha_id, batch)
 );
 CREATE INDEX IF NOT EXISTS idx_goldset_batch ON halacha_goldset(batch);
+
+-- AI second-opinion (a QA aid, NOT ground truth): an INDEPENDENT local-LLM
+-- judgment shown beside the human tag so the chair can spot disagreements and
+-- reconsider. Independent of the rule-based validators that #81.8 measures, so
+-- no circularity. Generated locally (claude_session); never auto-applied.
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_is_holding BOOLEAN;
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_correct_type TEXT DEFAULT '';
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_rationale TEXT DEFAULT '';
+ALTER TABLE halacha_goldset ADD COLUMN IF NOT EXISTS ai_generated_at TIMESTAMPTZ;
 """


@@ -4338,6 +4347,7 @@ async def goldset_list(batch: str = "default") -> list[dict]:
    rows = await pool.fetch(
        "SELECT g.id, g.halacha_id::text AS halacha_id, g.is_holding, "
        "       g.correct_type, g.quote_complete, g.tagged_by, g.tagged_at, "
+        "       g.ai_is_holding, g.ai_correct_type, g.ai_rationale, g.ai_generated_at, "
        "       h.rule_statement, h.supporting_quote, h.reasoning_summary, "
        "       h.rule_type, h.confidence, h.quality_flags, h.review_status, "
        "       cl.case_number, cl.case_name, cl.source_type "
@@ -4350,12 +4360,27 @@ async def goldset_list(batch: str = "default") -> list[dict]:
        d = dict(r)
        if d.get("tagged_at") is not None:
            d["tagged_at"] = d["tagged_at"].isoformat()
+        if d.get("ai_generated_at") is not None:
+            d["ai_generated_at"] = d["ai_generated_at"].isoformat()
        if d.get("confidence") is not None:
            d["confidence"] = float(d["confidence"])
        out.append(d)
    return out


+async def goldset_set_ai_recommendation(
+    goldset_id: UUID, *, ai_is_holding: bool | None,
+    ai_correct_type: str = "", ai_rationale: str = "",
+) -> None:
+    """Store the independent AI second-opinion for a gold-set item (QA aid)."""
+    pool = await get_pool()
+    await pool.execute(
+        "UPDATE halacha_goldset SET ai_is_holding = $2, ai_correct_type = $3, "
+        "ai_rationale = $4, ai_generated_at = now() WHERE id = $1",
+        goldset_id, ai_is_holding, ai_correct_type, ai_rationale,
+    )
+
+
 async def goldset_tag(
    goldset_id: UUID, *, is_holding: bool | None = None,
    correct_type: str | None = None, quote_complete: bool | None = None,