legal-ai/scripts/ab_halacha_codex.py

#!/usr/bin/env python
"""A/B (NON-DESTRUCTIVE): re-extract halachot for ONE precedent via the
Codex CLI (gpt-5.5) and compare against the existing stored halachot.

Purpose: benchmark gpt-5.5 against the current Claude Opus production output —
WITHOUT deleting or storing anything in the DB.

Mirrors ab_halacha_opus48.py but replaces `claude -p` with
  `codex exec --model gpt-5.5 --dangerously-bypass-approvals-and-sandbox -o FILE -`
The model's last message (written to FILE via `-o`) is parsed as JSON.

Usage:
    DOTENV_PATH=/home/chaim/.env DATA_DIR=/home/chaim/legal-ai/data \
      AB_MODEL=gpt-5.5 AB_REASONING=medium \
      mcp-server/.venv/bin/python scripts/ab_halacha_codex.py <case_law_id>

Env knobs:
    AB_MODEL       model slug (default gpt-5.5)
    AB_REASONING   reasoning effort: low/medium/high/xhigh (default medium)
    AB_CONCURRENCY max chunks extracted concurrently (default 1 — each chunk runs its own codex process, so be conservative)
    AB_CHUNK_TIMEOUT seconds per chunk (default 300)
    CODEX_BIN      path to codex binary (default: VS Code extension arm64 build)
"""
from __future__ import annotations

import asyncio
import json
import os
import statistics
import sys
import tempfile
from collections import Counter
from pathlib import Path
from uuid import UUID

from legal_mcp.config import parse_llm_json
from legal_mcp.services import db
from legal_mcp.services import halacha_extractor as hx

# ── configuration ─────────────────────────────────────────────────────────────

# Model slug and reasoning effort are forwarded to the `codex exec` command line.
MODEL = os.environ.get("AB_MODEL", "gpt-5.5")
REASONING = os.environ.get("AB_REASONING", "medium")
# Clamp to >= 1: this value sizes an asyncio.Semaphore, and a semaphore of 0
# would block every chunk forever (a negative value raises ValueError).
CONCURRENCY = max(1, int(os.environ.get("AB_CONCURRENCY", "1")))
# Seconds allowed for a single `codex exec` call (one chunk).
CHUNK_TIMEOUT = int(os.environ.get("AB_CHUNK_TIMEOUT", "300"))

# ARM64 build bundled with the VS Code ChatGPT extension — authenticated via
# ~/.codex/auth.json (ChatGPT subscription, no OPENAI_API_KEY needed).
CODEX_BIN = os.environ.get(
    "CODEX_BIN",
    "/home/chaim/.vscode-server/extensions/"
    "openai.chatgpt-26.609.30741-linux-arm64/bin/linux-aarch64/codex",
)


# ── codex invocation ──────────────────────────────────────────────────────────

async def run_codex(system: str, prompt: str, timeout: int = CHUNK_TIMEOUT):
    """Run one `codex exec` call and return the parsed JSON of its last message.

    Codex is an agentic runner; we steer it to output-only mode via an explicit
    instruction prepended to the system prompt.  The `-o FILE` flag captures the
    final text message; `parse_llm_json` strips any markdown fences.

    Args:
        system: extraction instructions, placed right after the output-only
            preamble.
        prompt: per-chunk user message, appended after ``system``.
        timeout: seconds to wait for the codex process before killing it.

    Returns:
        Whatever ``parse_llm_json`` produces from the last message.

    Raises:
        asyncio.TimeoutError: codex did not finish within ``timeout`` seconds.
        RuntimeError: codex exited non-zero or left an empty last message.
    """
    preamble = (
        "Your output MUST be a valid JSON array and nothing else.\n"
        "Do NOT use shell commands, do NOT write files.\n"
        "Respond ONLY with the JSON array — no explanation, no markdown fences.\n\n"
    )
    full_input = preamble + system + "\n\n" + prompt

    # mkstemp only reserves a unique path for `-o`; codex writes to it by name.
    out_fd, out_path = tempfile.mkstemp(suffix=".txt", prefix="codex_ab_")
    os.close(out_fd)

    cmd = [
        CODEX_BIN, "exec",
        "--model", MODEL,
        "-c", f"model_reasoning_effort={json.dumps(REASONING)}",
        "--dangerously-bypass-approvals-and-sandbox",
        "--skip-git-repo-check",
        "--ephemeral",
        "-o", out_path,
        "-",
    ]
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
            env={**os.environ, "HOME": "/home/chaim"},
        )
        try:
            _, err_b = await asyncio.wait_for(
                proc.communicate(input=full_input.encode("utf-8")),
                timeout=timeout,
            )
        except (asyncio.TimeoutError, asyncio.CancelledError) as exc:
            # wait_for / cancellation only abandons communicate(); the child
            # process itself keeps running unless we terminate it explicitly.
            if proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited between the returncode check and the kill
                try:
                    # Reap the child, bounded so cleanup itself cannot stall.
                    await asyncio.wait_for(proc.wait(), timeout=5)
                except asyncio.TimeoutError:
                    pass
            if isinstance(exc, asyncio.TimeoutError):
                # A bare TimeoutError stringifies to "", which makes the
                # caller's failure log useless; attach the reason.
                raise asyncio.TimeoutError(
                    f"codex timed out after {timeout}s"
                ) from None
            raise
        if proc.returncode != 0:
            raise RuntimeError(
                f"codex exit {proc.returncode}: "
                f"{err_b.decode('utf-8', 'replace').strip()[:400]}"
            )
        raw = Path(out_path).read_text(encoding="utf-8").strip()
    finally:
        # Always remove the temp file, whether the call succeeded or not.
        Path(out_path).unlink(missing_ok=True)

    if not raw:
        raise RuntimeError("codex returned empty last-message")
    return parse_llm_json(raw)


# ── extraction (mirrors ab_halacha_opus48) ────────────────────────────────────

async def extract_chunk(chunk_text, section_type, idx, total, context, is_binding):
    """Extract halachot from one chunk via codex.

    Picks the binding or persuasive extraction prompt from ``hx``, wraps the
    chunk text in the input envelope, and sends both to ``run_codex``.

    Returns:
        ``(items, ok)``: the parsed list and ``True`` on success; ``([], False)``
        when the call raised or the parsed result was not a list. Failures are
        reported on stderr rather than raised, so one bad chunk does not abort
        the run.
    """
    position = f"{idx + 1}/{total}"

    if is_binding:
        system_prompt = hx.HALACHA_EXTRACTION_PROMPT_BINDING
    else:
        system_prompt = hx.HALACHA_EXTRACTION_PROMPT_PERSUASIVE

    # The "part i/n" suffix is only added when the precedent has several chunks.
    part_suffix = f" (חלק {position})" if total > 1 else ""
    header = f"## הקלט\nסוג קטע: {section_type}\n{context}{part_suffix}\n\n"
    body = f"--- תחילת הטקסט ---\n{chunk_text}\n--- סוף הטקסט ---"

    try:
        parsed = await run_codex(system_prompt, header + body)
    except Exception as e:
        print(f"  ! chunk {position} failed: {e}", file=sys.stderr)
        return [], False

    if not isinstance(parsed, list):
        print(f"  ! chunk {position} non-list: {type(parsed).__name__}", file=sys.stderr)
        return [], False
    return parsed, True


# ── statistics ────────────────────────────────────────────────────────────────

def stats(halachot: list[dict], label: str) -> dict:
    """Summarise a list of halacha dicts for side-by-side comparison.

    Args:
        halachot: dicts read via ``.get`` for ``confidence``,
            ``quote_verified`` and ``rule_type``.
        label: heading stored under ``"label"`` in the result.

    Returns:
        A dict with the item count, verified / unverified quote counts,
        min / median / max confidence (``None`` when no item has a usable
        confidence), the number of confidences below 0.7, and the
        ``rule_type`` distribution.
    """
    confidences = []
    for item in halachot:
        try:
            confidences.append(float(item.get("confidence")))
        except (TypeError, ValueError):
            # Missing or non-numeric confidence: leave it out of the numbers.
            continue

    verified = Counter(bool(item.get("quote_verified")) for item in halachot)
    rule_types = Counter(item.get("rule_type") for item in halachot)

    if confidences:
        lowest = min(confidences)
        middle = statistics.median(confidences)
        highest = max(confidences)
    else:
        lowest = middle = highest = None

    summary = {"label": label, "n": len(halachot)}
    summary["quote_verified_true"] = verified[True]
    summary["quote_verified_false"] = verified[False]
    summary["conf_min"] = lowest
    summary["conf_median"] = middle
    summary["conf_max"] = highest
    summary["conf_below_0_7"] = len([c for c in confidences if c < 0.7])
    summary["rule_types"] = dict(rule_types)
    return summary


def print_stats(s: dict):
    """Print one statistics dict (as built by ``stats``) to stdout.

    The two confidence lines are skipped when ``conf_median`` is ``None``,
    i.e. when there were no usable confidence values.
    """
    def report_lines():
        # Lazily yield each line so output is emitted in order, one print each.
        yield f"\n=== {s['label']} ==="
        yield f"  count               : {s['n']}"
        yield f"  quote_verified      : {s['quote_verified_true']} ✓ / {s['quote_verified_false']} ✗"
        if s["conf_median"] is not None:
            yield f"  confidence min/med/max: {s['conf_min']:.2f} / {s['conf_median']:.2f} / {s['conf_max']:.2f}"
            yield f"  confidence < 0.7    : {s['conf_below_0_7']} / {s['n']}"
        yield f"  rule_type dist      : {s['rule_types']}"

    for line in report_lines():
        print(line)


# ── main ──────────────────────────────────────────────────────────────────────

async def main():
    """Run the A/B comparison for the precedent given on the command line.

    Steps:
      1. Validate the CLI argument (a case_law UUID) and the codex binary path.
      2. Side A: load the halachot already stored for the precedent.
      3. Side B: re-extract halachot from the precedent's chunks via codex,
         with at most ``CONCURRENCY`` chunks in flight, then coerce each item
         and verify its supporting quote against the full text.
      4. Print statistics for both sides and dump side B to a JSON file for
         human review.

    Only ``get_*`` / ``list_*`` db calls are made here; results go to stdout
    and the JSON file. Exits with status 2 on bad usage and 1 when the codex
    binary or the case_law row is missing.
    """
    if len(sys.argv) < 2:
        print(
            "usage: ab_halacha_codex.py <case_law_id>\n"
            "  case_law_id: UUID of the case_law row (e.g. 246a22a0-46b5-...)\n"
            "env: AB_MODEL (default gpt-5.5), AB_REASONING (default medium),\n"
            "     AB_CONCURRENCY (default 1), AB_CHUNK_TIMEOUT (default 300)",
            file=sys.stderr,
        )
        sys.exit(2)

    if not Path(CODEX_BIN).exists():
        print(f"codex binary not found: {CODEX_BIN}", file=sys.stderr)
        print("set CODEX_BIN= to the correct path", file=sys.stderr)
        sys.exit(1)

    try:
        case_law_id = UUID(sys.argv[1])
    except ValueError:
        # A malformed id is a usage error, not a crash worth a traceback.
        print(f"invalid case_law_id (not a UUID): {sys.argv[1]!r}", file=sys.stderr)
        sys.exit(2)

    record = await db.get_case_law(case_law_id)
    if not record:
        print("case_law not found", file=sys.stderr)
        sys.exit(1)

    is_binding = bool(record.get("is_binding"))
    # `.get(key, "")` still returns None when the key exists with a NULL value;
    # `or ""` keeps these usable as strings (prompt context, output filename).
    citation = record.get("case_number") or ""
    court = record.get("court") or ""
    date_str = str(record.get("date") or "")
    full_text = record.get("full_text") or ""

    print(f"Precedent: {citation} — {record.get('case_name')}")
    print(f"  court={court}  is_binding={is_binding}  prompt={'BINDING' if is_binding else 'PERSUASIVE'}")
    print(f"  model={MODEL}  reasoning={REASONING}  concurrency={CONCURRENCY}")
    print(f"  codex_bin={CODEX_BIN}")

    # ---- Side A: existing stored halachot (current production / Opus output) ----
    existing = await db.list_halachot(case_law_id=case_law_id, limit=500)
    by_status = Counter(h.get("review_status") for h in existing)
    print(f"\n[A] existing halachot in DB: {len(existing)}  status breakdown: {dict(by_status)}")
    approved = by_status.get("approved", 0) + by_status.get("published", 0)
    if approved:
        print(f"    (extracted by Claude Opus — a REAL re-run would DELETE the {approved} approved)")

    # ---- Side B: fresh extraction via codex / gpt-5.5 (no DB writes) ----
    chunks = await db.list_precedent_chunks(case_law_id, section_types=hx.EXTRACTABLE_SECTIONS)
    if not chunks:
        # No chunk of an extractable section type: fall back to every chunk.
        chunks = await db.list_precedent_chunks(case_law_id)
    print(f"\n[B] extracting from {len(chunks)} chunks via codex/{MODEL} @ {REASONING} ...")
    context = f"מקור: {citation} — {court}, {date_str}"
    sem = asyncio.Semaphore(CONCURRENCY)

    async def bounded(i, c):
        # The semaphore caps how many codex processes run at once.
        async with sem:
            return await extract_chunk(
                c["content"], c["section_type"], i, len(chunks), context, is_binding
            )

    results = await asyncio.gather(*[bounded(i, c) for i, c in enumerate(chunks)])
    raw_b, failed = [], 0
    for items, ok in results:
        raw_b.extend(items)
        if not ok:
            failed += 1

    cleaned_b = []
    for raw in raw_b:
        coerced = hx._coerce_halacha(raw)
        if coerced is None:
            # Item rejected by the coercion helper; it is not counted as valid.
            continue
        coerced["quote_verified"] = hx._verify_quote(coerced["supporting_quote"], full_text)
        cleaned_b.append(coerced)

    print(f"    raw={len(raw_b)}  valid={len(cleaned_b)}  failed_chunks={failed}/{len(chunks)}")

    # ---- Comparison ----
    a_stats = stats(existing, f"A · Opus (production, n={len(existing)})")
    b_stats = stats(cleaned_b, f"B · codex/{MODEL} @ {REASONING}")
    print_stats(a_stats)
    print_stats(b_stats)

    # Dump B halachot for human quality review
    def fs_safe(part: str) -> str:
        # Keep a filename component from adding path segments or quotes.
        return part.replace("/", "_").replace('"', "").strip()

    safe_citation = fs_safe(citation)
    # MODEL / REASONING come from the environment and may contain "/" (e.g. a
    # vendor-prefixed slug), so they get the same treatment as the citation.
    out_path = (
        f"/home/chaim/legal-ai/data/"
        f"ab_halacha_codex_{safe_citation}_{fs_safe(MODEL)}_{fs_safe(REASONING)}.json"
    )
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "precedent": citation,
                "model": MODEL,
                "reasoning": REASONING,
                "engine": "codex",
                "A_stats": a_stats,
                "B_stats": b_stats,
                "B_halachot": cleaned_b,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"\nB halachot written to: {out_path}")


# Script entry point: run the async A/B comparison only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    asyncio.run(main())