#!/usr/bin/env python3 """One-time backfill: recover the rule ROLE for pre-split halachot (INV-DM7). Before the authority/role split, the extractor stored ``rule_type='binding'`` for higher-court sources and ``'persuasive'`` for committee sources — i.e. it recorded the source's AUTHORITY in the role field. Those 276 rows therefore have NO genuine role. This script re-classifies each into one of the five real roles (holding/interpretive/procedural/application/obiter) using the same local claude_session judge the gold-set trusts (zero API cost), and writes it back to ``halachot.rule_type``. authority is NOT touched — it is derived from ``case_law.precedent_level`` at read time and was never stored. cd ~/legal-ai/mcp-server .venv/bin/python ../scripts/halacha_rule_role_backfill.py --limit 5 # smoke (dry) .venv/bin/python ../scripts/halacha_rule_role_backfill.py --apply # full backfill Local-only (claude_session needs the local CLI, not the container). """ from __future__ import annotations import argparse import asyncio import csv import sys from datetime import datetime, timezone from pathlib import Path from uuid import UUID from legal_mcp.services import claude_session, db REPO_ROOT = Path(__file__).resolve().parent.parent AUDIT_DIR = REPO_ROOT / "data" / "audit" VALID_ROLES = {"holding", "interpretive", "procedural", "application", "obiter"} SYSTEM = ( "אתה משפטן בכיר המסווג 'הלכות' שחולצו מפסיקה לפי **סוג הכלל** בלבד " "(אל תסווג מחייב/משכנע — דרגת-המחייבות נגזרת אוטומטית מזהות הערכאה). " "בחר ערך אחד מתוך:\n" "- holding — עיקרון מהותי שהיה הכרחי להכרעה (ratio; מבחן Wambaugh).\n" "- interpretive — פרשנות הוראת-חוק/מונח/תכנית.\n" "- procedural — סדר-דין: סמכות/מועדים/זכות-עמידה/מיצוי/נטל.\n" "- application — החלה תלוית-עובדות על נסיבות התיק (לרוב לא-הלכה בת-הכללה).\n" "- obiter — אמרת-אגב שלא הוכרעה.\n" 'החזר JSON בלבד: {"role": "<אחד מהחמישה>"}. ללא markdown, ללא הסבר.' ) def _prompt(row: dict) -> str: return ( f"מקור: {row.get('case_number') or ''} " f"(precedent_level={row.get('precedent_level') or ''}).\n" f"סיווג ישן (סמכות, להתעלם): {row.get('rule_type')}.\n\n" f"ניסוח הכלל:\n{row.get('rule_statement') or ''}\n\n" f"היגיון:\n{row.get('reasoning_summary') or ''}\n\n" f"ציטוט תומך:\n{row.get('supporting_quote') or ''}" ) async def _classify(row: dict) -> str | None: """Return the role for one row, or None on failure (caller keeps old value).""" try: raw = await claude_session.query_json(_prompt(row), system=SYSTEM) except Exception as e: # noqa: BLE001 — log and skip, never crash the batch print(f" ! {row['id']}: judge error ({e}) — skipped", flush=True) return None role = "" if isinstance(raw, dict): role = str(raw.get("role") or "").strip().lower() if role not in VALID_ROLES: print(f" ? {row['id']}: invalid role {role!r} — skipped", flush=True) return None return role async def _fetch_legacy_rows() -> list[dict]: pool = await db.get_pool() rows = await pool.fetch( "SELECT h.id, h.rule_type, h.rule_statement, h.reasoning_summary, " " h.supporting_quote, cl.case_number, cl.precedent_level " "FROM halachot h LEFT JOIN case_law cl ON cl.id = h.case_law_id " "WHERE h.rule_type IN ('binding','persuasive') " "ORDER BY h.case_law_id, h.halacha_index" ) return [dict(r) for r in rows] def _backup(rows: list[dict]) -> Path: ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") AUDIT_DIR.mkdir(parents=True, exist_ok=True) out = AUDIT_DIR / f"halacha-rule-role-backfill-backup-{ts}.csv" with out.open("w", encoding="utf-8", newline="") as f: w = csv.writer(f) w.writerow(["id", "old_rule_type", "case_number", "precedent_level"]) for r in rows: w.writerow([r["id"], r["rule_type"], r.get("case_number") or "", r.get("precedent_level") or ""]) return out async def main(args: argparse.Namespace) -> int: rows = await _fetch_legacy_rows() if args.limit: rows = rows[: args.limit] print(f"legacy binding/persuasive rows to reclassify: {len(rows)}", flush=True) if not rows: return 0 backup = _backup(rows) print(f"backup written → {backup}", flush=True) pool = await db.get_pool() changed = skipped = 0 sem = asyncio.Semaphore(args.concurrency) async def _one(row: dict): nonlocal changed, skipped async with sem: role = await _classify(row) if role is None: skipped += 1 return old = row["rule_type"] print(f" {row.get('case_number') or '':<14} {old:>10} → {role}", flush=True) if args.apply and role != old: await pool.execute( "UPDATE halachot SET rule_type = $2, updated_at = now() WHERE id = $1", row["id"], role, ) changed += 1 # process in chunks to bound concurrent CLI subprocesses for i in range(0, len(rows), args.concurrency): await asyncio.gather(*(_one(r) for r in rows[i : i + args.concurrency])) mode = "APPLIED" if args.apply else "DRY-RUN (no writes)" print(f"\n{mode}: {changed} reclassified, {skipped} skipped (kept old).", flush=True) if not args.apply: print("re-run with --apply to write changes.", flush=True) return 0 if __name__ == "__main__": ap = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) ap.add_argument("--apply", action="store_true", help="write changes (default: dry-run)") ap.add_argument("--limit", type=int, default=0, help="only first N rows (smoke test)") ap.add_argument("--concurrency", type=int, default=4, help="parallel judge calls") sys.exit(asyncio.run(main(ap.parse_args())))