"""Claude Code session bridge — runs prompts via the local `claude` CLI. All LLM calls in legal-ai go through this module. We shell out to the local Claude Code CLI which uses the developer's claude.ai session — zero direct API cost. **Architectural rule (do not violate):** this module only works when invoked from the local MCP server (the Python process at `/home/chaim/legal-ai/mcp-server/`, launched per `~/.claude.json`). It will **not** work when called from the legal-ai Docker container — that container has no `claude` CLI and no claude.ai session. Any code path under `web/` (FastAPI) that calls this module — directly or via an extractor like `halacha_extractor`, `claims_extractor`, `precedent_metadata_extractor`, `block_writer`, `qa_validator`, `learning_loop`, `local_classifier`, `appraiser_facts_extractor`, `brainstorm`, `style_analyzer` — is wrong. LLM-dependent operations must be exposed as MCP tools and triggered from agents (or the chair via Claude Code), where this module runs locally with CLI access. Async history: originally synchronous (``subprocess.run``) with a 120 s timeout. That broke for large legal documents — sync subprocess stalled the asyncio loop, and 120 s was far too short for cold-cache Hebrew prompts (case 8174-24 hit three timeouts in a row). Fixed by going async with a 30-minute ceiling. """ from __future__ import annotations import asyncio import json import logging from legal_mcp.config import parse_llm_json logger = logging.getLogger(__name__) # Default ceiling for any single ``claude -p`` invocation, in seconds. # 30 min covers any single-document call we make in practice (chunking # handles the rest); the bound exists only to prevent runaway zombies. DEFAULT_TIMEOUT = 1800 LONG_TIMEOUT = 3600 # opus block writing on full case context async def query( prompt: str, timeout: int = DEFAULT_TIMEOUT, max_turns: int = 1, *, system: str | None = None, ) -> str: """Send a prompt to Claude Code headless and return the text response. Passes the prompt via stdin (not argv) to avoid the OS ARG_MAX limit — prompts can be 500K+ chars when analyzing a full style corpus. Args: prompt: The prompt to send. timeout: Max seconds before the subprocess is killed. max_turns: Max conversation turns (1 = single response). system: Optional repeated-instruction text. Prepended to ``prompt`` for the CLI; we don't pass it as a separate arg because the CLI doesn't expose API-level caching. The parameter exists so extractors can structure their calls cleanly today, and to make a future SDK-backed path drop-in. Returns: The text response from Claude. Raises: RuntimeError: if the CLI is unavailable (e.g., called from the container — see module docstring), or fails, or times out. """ full_prompt = f"{system}\n\n{prompt}" if system else prompt if len(full_prompt) > 150_000: logger.warning("Large prompt: %d chars — may hit context limits", len(full_prompt)) cmd = [ "claude", "-p", "--output-format", "json", "--max-turns", str(max_turns), ] try: proc = await asyncio.create_subprocess_exec( *cmd, stdin=asyncio.subprocess.PIPE, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) except FileNotFoundError: raise RuntimeError( "Claude CLI not found. This module only works when invoked " "from the local MCP server — see the architectural rule in " "the module docstring. If this error came from a FastAPI " "endpoint in the container, refactor the call into an MCP " "tool that the chair triggers from Claude Code." ) try: stdout_b, stderr_b = await asyncio.wait_for( proc.communicate(input=full_prompt.encode("utf-8")), timeout=timeout, ) except asyncio.TimeoutError: # wait_for cancellation alone leaves the child running. try: proc.kill() await proc.wait() except ProcessLookupError: pass raise RuntimeError(f"Claude CLI timed out after {timeout}s") if proc.returncode != 0: stderr = stderr_b.decode("utf-8", errors="replace").strip()[:500] or "unknown error" size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else "" raise RuntimeError(f"Claude CLI failed (exit {proc.returncode}): {stderr}{size_info}") stdout = stdout_b.decode("utf-8", errors="replace").strip() if not stdout: raise RuntimeError("Claude CLI returned empty response") # claude -p --output-format json returns {"type":"result","result":"..."} try: data = json.loads(stdout) if isinstance(data, dict) and "result" in data: return data["result"] return stdout except json.JSONDecodeError: return stdout async def query_json( prompt: str, timeout: int = DEFAULT_TIMEOUT, *, system: str | None = None, ) -> dict | list | None: """Send a prompt and parse the response as JSON. Uses parse_llm_json for robust parsing (handles markdown wrapping, truncation). """ raw = await query(prompt, timeout=timeout, system=system) return parse_llm_json(raw)