#!/usr/bin/env python3
"""Generate the AI second-opinion for gold-set items (#81.7 QA aid).

For each gold-set halacha, an INDEPENDENT local-LLM (claude_session, zero cost)
judges: is it a real generalizable holding, what is its correct rule_type, and a
one-line rationale. Stored in halacha_goldset.ai_* and shown beside the human tag
so the chair can spot disagreements and reconsider.

This is a QA aid, NOT ground truth and NOT auto-applied. It is also independent
of the rule-based validators that #81.8 measures, so it doesn't bias that score.

Must run locally (claude_session needs the local CLI — not the container):
  cd ~/legal-ai/mcp-server
  .venv/bin/python ../scripts/goldset_ai_recommend.py             # missing only
  .venv/bin/python ../scripts/goldset_ai_recommend.py --force     # regenerate all
  .venv/bin/python ../scripts/goldset_ai_recommend.py --limit 10  # smoke
"""

from __future__ import annotations

import argparse
import asyncio
import sys
from uuid import UUID

from legal_mcp.services import claude_session, db

# Rule types the model is allowed to answer with; anything else is stored as "".
VALID_TYPES = {"binding", "interpretive", "obiter", "application", "procedural", "persuasive"}

# Upper bound on the rationale text passed to the DB layer.
_MAX_RATIONALE_CHARS = 300

# System prompt (Hebrew). This is runtime text sent to the model, not a comment.
SYSTEM = (
    "אתה בוחן-איכות משפטי המסווג 'הלכות' שחולצו מהחלטות ועדת-ערר ומפסקי-דין. "
    "לכל פריט הכרע שתי שאלות, באופן עצמאי ולפי המהות:\n"
    "1) is_holding — האם זו הלכה אמיתית בת-הכללה ובת-הסתמכות (true), או שזו יישום "
    "תלוי-עובדות / אמרת-אגב / ציטוט-עובדה ולא כלל בר-הכללה (false).\n"
    "2) type — הסוג הנכון: 'binding' (עיקרון הכרחי להכרעה), 'interpretive' (פרשנות "
    "חוק/מונח/תכנית), 'procedural' (סדר-דין: מועדים/סמכות/מיצוי/נטל), 'persuasive' "
    "(אסמכתה לא-מחייבת), 'application' (החלה על עובדות התיק — לרוב לא-הלכה), "
    "'obiter' (אמרת-אגב שלא הוכרעה — לא-הלכה).\n"
    "עקביות: is_holding=true → binding/interpretive/procedural/persuasive; "
    "is_holding=false → application/obiter.\n"
    'החזר JSON בלבד: {"is_holding": true/false, "type": "<אחד מהשישה>", '
    '"rationale": "<משפט אחד קצר בעברית>"}. ללא markdown.'
)


def _prompt(item: dict) -> str:
    """Build the per-item user prompt from a gold-set row.

    Reads ``source_type``, ``case_number``, ``rule_type``, ``rule_statement`` and
    ``supporting_quote`` with ``.get()``, so missing keys do not raise. Missing
    or empty text fields are rendered as empty strings; a missing ``rule_type``
    is rendered as ``None``.
    """
    src = "פסק-דין" if item.get("source_type") == "court_ruling" else "החלטת ועדת-ערר"
    return (
        f"מקור: {src} ({item.get('case_number') or ''}).\n"
        f"סוג שהמכונה נתנה: {item.get('rule_type')}.\n\n"
        f"ניסוח הכלל:\n{item.get('rule_statement') or ''}\n\n"
        f"ציטוט תומך:\n{item.get('supporting_quote') or ''}"
    )


def _as_bool(value: object) -> bool | None:
    """Interpret a model-supplied ``is_holding`` value strictly.

    Returns ``True``/``False`` for a real boolean, the integers 0/1, or the
    strings "true"/"false" (case-insensitive, surrounding whitespace ignored).
    Returns ``None`` for anything else, including a missing value.

    Plain ``bool(value)`` is not used because it goes by truthiness: the
    non-empty string "false" would be read as ``True``.
    """
    if isinstance(value, bool):
        return value
    if isinstance(value, int) and value in (0, 1):
        return bool(value)
    if isinstance(value, str):
        return {"true": True, "false": False}.get(value.strip().lower())
    return None


async def main(args: argparse.Namespace) -> int:
    """Ask the model for a second opinion on each pending gold-set item.

    Args:
        args: parsed CLI namespace; reads ``batch``, ``force`` and ``limit``.

    Returns:
        Always 0. Per-item failures are counted and printed; they do not
        change the exit status.

    Items are skipped when they already carry a truthy ``ai_generated_at``
    unless ``--force`` is given. A falsy ``--limit`` (``None`` or 0) means no
    limit. An exception raised by the DB write is not caught here and aborts
    the run.
    """
    items = await db.goldset_list(args.batch)
    todo = [it for it in items if args.force or not it.get("ai_generated_at")]
    if args.limit:
        todo = todo[: args.limit]
    total = len(todo)
    print(f"gold-set {args.batch}: {len(items)} items, {total} to recommend", flush=True)

    ok, fail, disagree = 0, 0, 0
    for i, it in enumerate(todo, 1):
        # .get() keeps progress output from raising on a row without case_number.
        label = f"[{i}/{total}] {it.get('case_number')}"

        try:
            v = await claude_session.query_json(_prompt(it), system=SYSTEM, effort="low")
        except Exception as e:  # noqa: BLE001
            fail += 1
            print(f"{label}: FAIL {e}", flush=True)
            continue

        if not isinstance(v, dict):
            fail += 1
            print(f"{label}: FAIL non-object reply ({type(v).__name__})", flush=True)
            continue

        ai_hold = _as_bool(v.get("is_holding"))
        if ai_hold is None:
            # Do not store a made-up verdict. Nothing is written for this item;
            # presumably a later run retries it because ai_generated_at stays
            # unset (confirm against the DB layer).
            fail += 1
            print(f"{label}: FAIL unusable is_holding {v.get('is_holding')!r}", flush=True)
            continue

        # Lower-case so a label that differs only in case is not discarded.
        ai_type = str(v.get("type") or "").strip().lower()
        if ai_type not in VALID_TYPES:
            ai_type = ""

        await db.goldset_set_ai_recommendation(
            UUID(str(it["id"])),
            ai_is_holding=ai_hold,
            ai_correct_type=ai_type,
            ai_rationale=str(v.get("rationale") or "")[:_MAX_RATIONALE_CHARS],
        )
        ok += 1

        # Note disagreements with the human tag (only if the item is tagged).
        flag = ""
        human_hold = it.get("is_holding")
        if human_hold is not None and human_hold != ai_hold:
            disagree += 1
            flag = " ⚠ DISAGREE is_holding"
        print(f"{label}: ai={ai_hold}/{ai_type}{flag}", flush=True)

    print(
        f"\nDONE — {ok} stored, {fail} failed, {disagree} disagree with existing human tag",
        flush=True,
    )
    return 0


if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument("--batch", default="default")
    ap.add_argument("--force", action="store_true", help="regenerate even if present")
    ap.add_argument("--limit", type=int, default=None)
    sys.exit(asyncio.run(main(ap.parse_args())))