הפרמטר `is_binding` הוסר מ-`_coerce_halacha` כשבוצע הפיצול rule_type→role ב-INV-DM7.
הבאג הופיע בשני סקריפטי ה-A/B:
- ab_halacha_codex.py — גרם לקריסה בריצה הראשונה
- ab_halacha_opus48.py — אותה שגיאה, לטנטית

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
279 lines
11 KiB
Python
279 lines
11 KiB
Python
#!/usr/bin/env python
|
|
"""A/B (NON-DESTRUCTIVE): re-extract halachot for ONE precedent via the
|
|
Codex CLI (gpt-5.5) and compare against the existing stored halachot.
|
|
|
|
Purpose: benchmark gpt-5.5 against the current Claude Opus production output —
|
|
WITHOUT deleting or storing anything in the DB.
|
|
|
|
Mirrors ab_halacha_opus48.py but replaces `claude -p` with
|
|
`codex exec --model gpt-5.5 --dangerously-bypass-approvals-and-sandbox -o FILE -`
|
|
The model's last message (written to FILE via `-o`) is parsed as JSON.
|
|
|
|
Usage:
|
|
DOTENV_PATH=/home/chaim/.env DATA_DIR=/home/chaim/legal-ai/data \
|
|
AB_MODEL=gpt-5.5 AB_REASONING=medium \
|
|
mcp-server/.venv/bin/python scripts/ab_halacha_codex.py <case_law_id>
|
|
|
|
Env knobs:
|
|
AB_MODEL model slug (default gpt-5.5)
|
|
AB_REASONING reasoning effort: low/medium/high/xhigh (default medium)
|
|
AB_CONCURRENCY concurrent chunks (default 1 — codex sessions, be conservative)
|
|
AB_CHUNK_TIMEOUT seconds per chunk (default 300)
|
|
CODEX_BIN path to codex binary (default: VS Code extension arm64 build)
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import json
|
|
import os
|
|
import statistics
|
|
import sys
|
|
import tempfile
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
from uuid import UUID
|
|
|
|
from legal_mcp.config import parse_llm_json
|
|
from legal_mcp.services import db
|
|
from legal_mcp.services import halacha_extractor as hx
|
|
|
|
# ── configuration ─────────────────────────────────────────────────────────────

# Every knob is read once from the environment at import time. The two numeric
# knobs go through int(), so a non-integer value raises ValueError on import.
MODEL = os.getenv("AB_MODEL", "gpt-5.5")
REASONING = os.getenv("AB_REASONING", "medium")
CONCURRENCY = int(os.getenv("AB_CONCURRENCY", "1"))
CHUNK_TIMEOUT = int(os.getenv("AB_CHUNK_TIMEOUT", "300"))

# ARM64 build bundled with the VS Code ChatGPT extension — authenticated via
# ~/.codex/auth.json (ChatGPT subscription, no OPENAI_API_KEY needed).
CODEX_BIN = os.getenv(
    "CODEX_BIN",
    "/home/chaim/.vscode-server/extensions/"
    "openai.chatgpt-26.609.30741-linux-arm64/bin/linux-aarch64/codex",
)
|
|
|
|
|
|
# ── codex invocation ──────────────────────────────────────────────────────────
|
|
|
|
async def run_codex(system: str, prompt: str, timeout: int = CHUNK_TIMEOUT):
    """Run one `codex exec` call and return the parsed JSON of its last message.

    Codex is an agentic runner; it is steered to output-only mode via an
    explicit instruction prepended to the system prompt. The combined text is
    fed on stdin (the trailing ``-`` argument), and the `-o FILE` flag makes
    codex write its final text message to a temporary file, which is read
    back, removed, and handed to `parse_llm_json`.

    Args:
        system: System/extraction prompt; placed right after the preamble.
        prompt: User message; appended after a blank line.
        timeout: Seconds to wait for the subprocess to finish.

    Returns:
        Whatever `parse_llm_json` produces from the model's last message.

    Raises:
        asyncio.TimeoutError: codex did not finish within `timeout` seconds.
        RuntimeError: codex exited non-zero, or wrote an empty last message.
    """
    preamble = (
        "Your output MUST be a valid JSON array and nothing else.\n"
        "Do NOT use shell commands, do NOT write files.\n"
        "Respond ONLY with the JSON array — no explanation, no markdown fences.\n\n"
    )
    full_input = preamble + system + "\n\n" + prompt

    # mkstemp only reserves a unique path; codex itself writes the file, so the
    # descriptor is closed immediately.
    out_fd, out_path = tempfile.mkstemp(suffix=".txt", prefix="codex_ab_")
    os.close(out_fd)

    cmd = [
        CODEX_BIN, "exec",
        "--model", MODEL,
        "-c", f"model_reasoning_effort={json.dumps(REASONING)}",
        "--dangerously-bypass-approvals-and-sandbox",
        "--skip-git-repo-check",
        "--ephemeral",
        "-o", out_path,
        "-",
    ]
    proc = None
    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
            # HOME is pinned for the child process (codex reads its auth from
            # the home directory, per the configuration note in this file).
            env={**os.environ, "HOME": "/home/chaim"},
        )
        try:
            _, err_b = await asyncio.wait_for(
                proc.communicate(input=full_input.encode("utf-8")),
                timeout=timeout,
            )
        except asyncio.TimeoutError:
            # Same exception type as before, but with a message so the
            # caller's log line is not blank.
            raise asyncio.TimeoutError(
                f"codex timed out after {timeout}s"
            ) from None
        if proc.returncode != 0:
            raise RuntimeError(
                f"codex exit {proc.returncode}: "
                f"{err_b.decode('utf-8', 'replace').strip()[:400]}"
            )
        raw = Path(out_path).read_text(encoding="utf-8").strip()
    finally:
        try:
            # wait_for() only cancels communicate(); the child keeps running
            # after a timeout or cancellation unless it is killed and reaped.
            if proc is not None and proc.returncode is None:
                try:
                    proc.kill()
                except ProcessLookupError:
                    pass  # exited between the check and the kill
                await proc.wait()
        finally:
            Path(out_path).unlink(missing_ok=True)

    if not raw:
        raise RuntimeError("codex returned empty last-message")
    return parse_llm_json(raw)
|
|
|
|
|
|
# ── extraction (mirrors ab_halacha_opus48) ────────────────────────────────────
|
|
|
|
async def extract_chunk(chunk_text, section_type, idx, total, context, is_binding):
    """Extract halachot from one chunk via `run_codex`, never raising.

    Args:
        chunk_text: Text of the chunk, embedded between the begin/end markers.
        section_type: Section label written into the user message.
        idx: Zero-based chunk index (shown one-based in labels and logs).
        total: Total number of chunks; a part label is added only when > 1.
        context: Source line placed before the optional part label.
        is_binding: Truthy selects the BINDING extraction prompt from `hx`,
            otherwise the PERSUASIVE one.

    Returns:
        ``(items, ok)``: the model's list and True on success; ``([], False)``
        when the call raised or the parsed result was not a list. Failures are
        reported on stderr.
    """
    if is_binding:
        system_prompt = hx.HALACHA_EXTRACTION_PROMPT_BINDING
    else:
        system_prompt = hx.HALACHA_EXTRACTION_PROMPT_PERSUASIVE

    position = f"{idx + 1}/{total}"
    part_suffix = f" (חלק {position})" if total > 1 else ""
    user_msg = "\n".join(
        [
            "## הקלט",
            f"סוג קטע: {section_type}",
            f"{context}{part_suffix}",
            "",
            "--- תחילת הטקסט ---",
            f"{chunk_text}",
            "--- סוף הטקסט ---",
        ]
    )

    try:
        parsed = await run_codex(system_prompt, user_msg)
    except Exception as exc:
        # Best-effort: one failing chunk must not abort the whole comparison.
        print(f" ! chunk {position} failed: {exc}", file=sys.stderr)
        return [], False

    if not isinstance(parsed, list):
        print(f" ! chunk {position} non-list: {type(parsed).__name__}", file=sys.stderr)
        return [], False
    return parsed, True
|
|
|
|
|
|
# ── statistics ────────────────────────────────────────────────────────────────
|
|
|
|
def stats(halachot: list[dict], label: str) -> dict:
    """Summarise a list of halacha dicts for side-by-side comparison.

    Args:
        halachot: Dicts read with ``.get`` for the keys ``confidence``,
            ``quote_verified`` and ``rule_type``.
        label: Display label copied into the result.

    Returns:
        A dict with the label, the item count, true/false counts of
        ``quote_verified`` (by truthiness), min/median/max of the confidences
        that convert to float (None when there are none), the number of
        confidences below 0.7, and a ``rule_type`` frequency dict.
    """
    confidences = []
    verified = Counter()
    rule_types = Counter()
    for item in halachot:
        try:
            confidences.append(float(item.get("confidence")))
        except (TypeError, ValueError):
            # Missing or non-numeric confidence is left out of the numbers.
            pass
        verified[bool(item.get("quote_verified"))] += 1
        rule_types[item.get("rule_type")] += 1

    if confidences:
        lowest = min(confidences)
        middle = statistics.median(confidences)
        highest = max(confidences)
    else:
        lowest = middle = highest = None

    return {
        "label": label,
        "n": len(halachot),
        "quote_verified_true": verified[True],
        "quote_verified_false": verified[False],
        "conf_min": lowest,
        "conf_median": middle,
        "conf_max": highest,
        "conf_below_0_7": len([c for c in confidences if c < 0.7]),
        "rule_types": dict(rule_types),
    }
|
|
|
|
|
|
def print_stats(s: dict):
    """Print one summary dict (as produced by `stats`) to stdout.

    The two confidence lines are printed only when ``s["conf_median"]`` is not
    None, i.e. when at least one confidence value was available.
    """
    print("\n=== {} ===".format(s["label"]))
    print(" count : {}".format(s["n"]))
    print(
        " quote_verified : {} ✓ / {} ✗".format(
            s["quote_verified_true"], s["quote_verified_false"]
        )
    )
    if s["conf_median"] is not None:
        print(
            " confidence min/med/max: {:.2f} / {:.2f} / {:.2f}".format(
                s["conf_min"], s["conf_median"], s["conf_max"]
            )
        )
        print(" confidence < 0.7 : {} / {}".format(s["conf_below_0_7"], s["n"]))
    print(" rule_type dist : {}".format(s["rule_types"]))
|
|
|
|
|
|
# ── main ──────────────────────────────────────────────────────────────────────
|
|
|
|
async def main():
    """Run the non-destructive A/B comparison for one precedent.

    Reads the case_law UUID from ``sys.argv[1]``, loads the stored halachot
    (side A), re-extracts halachot from the precedent's chunks through
    `extract_chunk` (side B), prints summary statistics for both sides, and
    dumps the B-side halachot to a JSON file for manual review. The only `db`
    calls made here are `get_case_law`, `list_halachot` and
    `list_precedent_chunks`.

    The JSON file is written under ``$DATA_DIR`` when that variable is set,
    otherwise under ``/home/chaim/legal-ai/data``.

    Exits with status 2 on a missing or malformed argument, and with status 1
    when the codex binary or the case_law row cannot be found.
    """
    if len(sys.argv) < 2:
        print(
            "usage: ab_halacha_codex.py <case_law_id>\n"
            " case_law_id: UUID of the case_law row (e.g. 246a22a0-46b5-...)\n"
            "env: AB_MODEL (default gpt-5.5), AB_REASONING (default medium),\n"
            " AB_CONCURRENCY (default 1), AB_CHUNK_TIMEOUT (default 300)",
            file=sys.stderr,
        )
        sys.exit(2)

    if not Path(CODEX_BIN).exists():
        print(f"codex binary not found: {CODEX_BIN}", file=sys.stderr)
        print("set CODEX_BIN= to the correct path", file=sys.stderr)
        sys.exit(1)

    # A malformed id is a usage error; report it instead of a raw traceback.
    try:
        case_law_id = UUID(sys.argv[1])
    except ValueError:
        print(f"invalid case_law_id (expected a UUID): {sys.argv[1]!r}", file=sys.stderr)
        sys.exit(2)

    record = await db.get_case_law(case_law_id)
    if not record:
        print("case_law not found", file=sys.stderr)
        sys.exit(1)

    is_binding = bool(record.get("is_binding"))
    # `or ""` also covers a key that is present but None, which would otherwise
    # crash at the .replace() below, after every model call has been paid for.
    citation = record.get("case_number") or ""
    court = record.get("court", "")
    date_str = str(record.get("date") or "")
    full_text = record.get("full_text") or ""

    print(f"Precedent: {citation} — {record.get('case_name')}")
    print(f" court={court} is_binding={is_binding} prompt={'BINDING' if is_binding else 'PERSUASIVE'}")
    print(f" model={MODEL} reasoning={REASONING} concurrency={CONCURRENCY}")
    print(f" codex_bin={CODEX_BIN}")

    # ---- Side A: existing stored halachot (current production / Opus output) ----
    existing = await db.list_halachot(case_law_id=case_law_id, limit=500)
    by_status = Counter(h.get("review_status") for h in existing)
    print(f"\n[A] existing halachot in DB: {len(existing)} status breakdown: {dict(by_status)}")
    approved = by_status.get("approved", 0) + by_status.get("published", 0)
    if approved:
        print(f" (extracted by Claude Opus — a REAL re-run would DELETE the {approved} approved)")

    # ---- Side B: fresh extraction via codex / gpt-5.5 (no DB writes) ----
    chunks = await db.list_precedent_chunks(case_law_id, section_types=hx.EXTRACTABLE_SECTIONS)
    if not chunks:
        # No chunk of an extractable section type: fall back to all chunks.
        chunks = await db.list_precedent_chunks(case_law_id)
    print(f"\n[B] extracting from {len(chunks)} chunks via codex/{MODEL} @ {REASONING} ...")
    context = f"מקור: {citation} — {court}, {date_str}"
    # Semaphore(0) would block every chunk forever and a negative value raises,
    # so AB_CONCURRENCY is clamped to at least one slot.
    sem = asyncio.Semaphore(max(1, CONCURRENCY))

    async def bounded(i, c):
        async with sem:
            return await extract_chunk(
                c["content"], c["section_type"], i, len(chunks), context, is_binding
            )

    results = await asyncio.gather(*[bounded(i, c) for i, c in enumerate(chunks)])
    raw_b, failed = [], 0
    for items, ok in results:
        raw_b.extend(items)
        if not ok:
            failed += 1

    cleaned_b = []
    for raw in raw_b:
        coerced = hx._coerce_halacha(raw)
        if coerced is None:
            continue
        coerced["quote_verified"] = hx._verify_quote(coerced["supporting_quote"], full_text)
        cleaned_b.append(coerced)

    print(f" raw={len(raw_b)} valid={len(cleaned_b)} failed_chunks={failed}/{len(chunks)}")

    # ---- Comparison ----
    a_stats = stats(existing, f"A · Opus (production, n={len(existing)})")
    b_stats = stats(cleaned_b, f"B · codex/{MODEL} @ {REASONING}")
    print_stats(a_stats)
    print_stats(b_stats)

    # Dump B halachot for human quality review
    safe_citation = citation.replace("/", "_").replace('"', "").strip()
    # Honour the DATA_DIR the documented usage passes; same fallback as before.
    data_dir = Path(os.environ.get("DATA_DIR") or "/home/chaim/legal-ai/data")
    out_path = data_dir / f"ab_halacha_codex_{safe_citation}_{MODEL}_{REASONING}.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(
            {
                "precedent": citation,
                "model": MODEL,
                "reasoning": REASONING,
                "engine": "codex",
                "A_stats": a_stats,
                "B_stats": b_stats,
                "B_halachot": cleaned_b,
            },
            f,
            ensure_ascii=False,
            indent=2,
        )
    print(f"\nB halachot written to: {out_path}")
|
|
|
|
|
|
# Script entry point: run the async comparison only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())
|