#!/usr/bin/env python """A/B (NON-DESTRUCTIVE): re-extract halachot for ONE precedent via the Codex CLI (gpt-5.5) and compare against the existing stored halachot. Purpose: benchmark gpt-5.5 against the current Claude Opus production output — WITHOUT deleting or storing anything in the DB. Mirrors ab_halacha_opus48.py but replaces `claude -p` with `codex exec --model gpt-5.5 --dangerously-bypass-approvals-and-sandbox -o FILE -` The model's last message (written to FILE via `-o`) is parsed as JSON. Usage: DOTENV_PATH=/home/chaim/.env DATA_DIR=/home/chaim/legal-ai/data \ AB_MODEL=gpt-5.5 AB_REASONING=medium \ mcp-server/.venv/bin/python scripts/ab_halacha_codex.py Env knobs: AB_MODEL model slug (default gpt-5.5) AB_REASONING reasoning effort: low/medium/high/xhigh (default medium) AB_CONCURRENCY concurrent chunks (default 1 — codex sessions, be conservative) AB_CHUNK_TIMEOUT seconds per chunk (default 300) CODEX_BIN path to codex binary (default: VS Code extension arm64 build) """ from __future__ import annotations import asyncio import json import os import statistics import sys import tempfile from collections import Counter from pathlib import Path from uuid import UUID from legal_mcp.config import parse_llm_json from legal_mcp.services import db from legal_mcp.services import halacha_extractor as hx # ── configuration ───────────────────────────────────────────────────────────── MODEL = os.environ.get("AB_MODEL", "gpt-5.5") REASONING = os.environ.get("AB_REASONING", "medium") CONCURRENCY = int(os.environ.get("AB_CONCURRENCY", "1")) CHUNK_TIMEOUT = int(os.environ.get("AB_CHUNK_TIMEOUT", "300")) # ARM64 build bundled with the VS Code ChatGPT extension — authenticated via # ~/.codex/auth.json (ChatGPT subscription, no OPENAI_API_KEY needed). CODEX_BIN = os.environ.get( "CODEX_BIN", "/home/chaim/.vscode-server/extensions/" "openai.chatgpt-26.609.30741-linux-arm64/bin/linux-aarch64/codex", ) # ── codex invocation ────────────────────────────────────────────────────────── async def run_codex(system: str, prompt: str, timeout: int = CHUNK_TIMEOUT): """One `codex exec` call. Returns parsed JSON from the model's last message. Codex is an agentic runner; we steer it to output-only mode via an explicit instruction prepended to the system prompt. The `-o FILE` flag captures the final text message; `parse_llm_json` strips any markdown fences. """ preamble = ( "Your output MUST be a valid JSON array and nothing else.\n" "Do NOT use shell commands, do NOT write files.\n" "Respond ONLY with the JSON array — no explanation, no markdown fences.\n\n" ) full_input = preamble + system + "\n\n" + prompt out_fd, out_path = tempfile.mkstemp(suffix=".txt", prefix="codex_ab_") os.close(out_fd) cmd = [ CODEX_BIN, "exec", "--model", MODEL, "-c", f"model_reasoning_effort={json.dumps(REASONING)}", "--dangerously-bypass-approvals-and-sandbox", "--skip-git-repo-check", "--ephemeral", "-o", out_path, "-", ] try: proc = await asyncio.create_subprocess_exec( *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.DEVNULL, stderr=asyncio.subprocess.PIPE, env={**os.environ, "HOME": "/home/chaim"}, ) _, err_b = await asyncio.wait_for( proc.communicate(input=full_input.encode("utf-8")), timeout=timeout, ) if proc.returncode != 0: raise RuntimeError( f"codex exit {proc.returncode}: " f"{err_b.decode('utf-8', 'replace').strip()[:400]}" ) raw = Path(out_path).read_text(encoding="utf-8").strip() finally: Path(out_path).unlink(missing_ok=True) if not raw: raise RuntimeError("codex returned empty last-message") return parse_llm_json(raw) # ── extraction (mirrors ab_halacha_opus48) ──────────────────────────────────── async def extract_chunk(chunk_text, section_type, idx, total, context, is_binding): base_prompt = ( hx.HALACHA_EXTRACTION_PROMPT_BINDING if is_binding else hx.HALACHA_EXTRACTION_PROMPT_PERSUASIVE ) chunk_label = f" (חלק {idx + 1}/{total})" if total > 1 else "" user_msg = ( f"## הקלט\n" f"סוג קטע: {section_type}\n" f"{context}{chunk_label}\n\n" f"--- תחילת הטקסט ---\n{chunk_text}\n--- סוף הטקסט ---" ) try: result = await run_codex(base_prompt, user_msg) except Exception as e: print(f" ! chunk {idx + 1}/{total} failed: {e}", file=sys.stderr) return [], False if isinstance(result, list): return result, True print(f" ! chunk {idx + 1}/{total} non-list: {type(result).__name__}", file=sys.stderr) return [], False # ── statistics ──────────────────────────────────────────────────────────────── def stats(halachot: list[dict], label: str) -> dict: n = len(halachot) def fconf(x): try: return float(x.get("confidence")) except (TypeError, ValueError): return None confs = [c for c in (fconf(h) for h in halachot) if c is not None] qv = Counter(bool(h.get("quote_verified")) for h in halachot) rt = Counter(h.get("rule_type") for h in halachot) return { "label": label, "n": n, "quote_verified_true": qv.get(True, 0), "quote_verified_false": qv.get(False, 0), "conf_min": min(confs) if confs else None, "conf_median": statistics.median(confs) if confs else None, "conf_max": max(confs) if confs else None, "conf_below_0_7": sum(1 for c in confs if c < 0.7), "rule_types": dict(rt), } def print_stats(s: dict): print(f"\n=== {s['label']} ===") print(f" count : {s['n']}") print(f" quote_verified : {s['quote_verified_true']} ✓ / {s['quote_verified_false']} ✗") if s["conf_median"] is not None: print(f" confidence min/med/max: {s['conf_min']:.2f} / {s['conf_median']:.2f} / {s['conf_max']:.2f}") print(f" confidence < 0.7 : {s['conf_below_0_7']} / {s['n']}") print(f" rule_type dist : {s['rule_types']}") # ── main ────────────────────────────────────────────────────────────────────── async def main(): if len(sys.argv) < 2: print( "usage: ab_halacha_codex.py \n" " case_law_id: UUID of the case_law row (e.g. 246a22a0-46b5-...)\n" "env: AB_MODEL (default gpt-5.5), AB_REASONING (default medium),\n" " AB_CONCURRENCY (default 1), AB_CHUNK_TIMEOUT (default 300)", file=sys.stderr, ) sys.exit(2) if not Path(CODEX_BIN).exists(): print(f"codex binary not found: {CODEX_BIN}", file=sys.stderr) print("set CODEX_BIN= to the correct path", file=sys.stderr) sys.exit(1) case_law_id = UUID(sys.argv[1]) record = await db.get_case_law(case_law_id) if not record: print("case_law not found", file=sys.stderr) sys.exit(1) is_binding = bool(record.get("is_binding")) citation = record.get("case_number", "") court = record.get("court", "") date_str = str(record.get("date") or "") full_text = record.get("full_text") or "" print(f"Precedent: {citation} — {record.get('case_name')}") print(f" court={court} is_binding={is_binding} prompt={'BINDING' if is_binding else 'PERSUASIVE'}") print(f" model={MODEL} reasoning={REASONING} concurrency={CONCURRENCY}") print(f" codex_bin={CODEX_BIN}") # ---- Side A: existing stored halachot (current production / Opus output) ---- existing = await db.list_halachot(case_law_id=case_law_id, limit=500) by_status = Counter(h.get("review_status") for h in existing) print(f"\n[A] existing halachot in DB: {len(existing)} status breakdown: {dict(by_status)}") approved = by_status.get("approved", 0) + by_status.get("published", 0) if approved: print(f" (extracted by Claude Opus — a REAL re-run would DELETE the {approved} approved)") # ---- Side B: fresh extraction via codex / gpt-5.5 (no DB writes) ---- chunks = await db.list_precedent_chunks(case_law_id, section_types=hx.EXTRACTABLE_SECTIONS) if not chunks: chunks = await db.list_precedent_chunks(case_law_id) print(f"\n[B] extracting from {len(chunks)} chunks via codex/{MODEL} @ {REASONING} ...") context = f"מקור: {citation} — {court}, {date_str}" sem = asyncio.Semaphore(CONCURRENCY) async def bounded(i, c): async with sem: return await extract_chunk( c["content"], c["section_type"], i, len(chunks), context, is_binding ) results = await asyncio.gather(*[bounded(i, c) for i, c in enumerate(chunks)]) raw_b, failed = [], 0 for items, ok in results: raw_b.extend(items) if not ok: failed += 1 cleaned_b = [] for raw in raw_b: coerced = hx._coerce_halacha(raw) if coerced is None: continue coerced["quote_verified"] = hx._verify_quote(coerced["supporting_quote"], full_text) cleaned_b.append(coerced) print(f" raw={len(raw_b)} valid={len(cleaned_b)} failed_chunks={failed}/{len(chunks)}") # ---- Comparison ---- a_stats = stats(existing, f"A · Opus (production, n={len(existing)})") b_stats = stats(cleaned_b, f"B · codex/{MODEL} @ {REASONING}") print_stats(a_stats) print_stats(b_stats) # Dump B halachot for human quality review safe_citation = citation.replace("/", "_").replace('"', "").strip() out_path = ( f"/home/chaim/legal-ai/data/" f"ab_halacha_codex_{safe_citation}_{MODEL}_{REASONING}.json" ) with open(out_path, "w", encoding="utf-8") as f: json.dump( { "precedent": citation, "model": MODEL, "reasoning": REASONING, "engine": "codex", "A_stats": a_stats, "B_stats": b_stats, "B_halachot": cleaned_b, }, f, ensure_ascii=False, indent=2, ) print(f"\nB halachot written to: {out_path}") if __name__ == "__main__": asyncio.run(main())