legal-ai/mcp-server/src/legal_mcp/services/claude_session.py

"""Claude Code session bridge — runs prompts via the local `claude` CLI.

All LLM calls in legal-ai go through this module. We shell out to the local
Claude Code CLI which uses the developer's claude.ai session — zero direct
API cost.

**Architectural rule (do not violate):** this module only works when invoked
from the local MCP server (the Python process at
`/home/chaim/legal-ai/mcp-server/`, launched per `~/.claude.json`). It will
**not** work when called from the legal-ai Docker container — that container
has no `claude` CLI and no claude.ai session. Any code path under `web/`
(FastAPI) that calls this module — directly or via an extractor like
`halacha_extractor`, `claims_extractor`, `precedent_metadata_extractor`,
`block_writer`, `qa_validator`, `learning_loop`, `local_classifier`,
`appraiser_facts_extractor`, `brainstorm`, `style_analyzer` — is wrong.
LLM-dependent operations must be exposed as MCP tools and triggered from
agents (or the chair via Claude Code), where this module runs locally with
CLI access.

Async history: originally synchronous (``subprocess.run``) with a 120 s
timeout. That broke for large legal documents — sync subprocess stalled the
asyncio loop, and 120 s was far too short for cold-cache Hebrew prompts
(case 8174-24 hit three timeouts in a row). Fixed by going async with a
30-minute ceiling.
"""

from __future__ import annotations

import asyncio
import json
import logging

from legal_mcp.config import parse_llm_json

logger = logging.getLogger(__name__)

# Upper bounds (in seconds) for a single ``claude -p`` invocation.
# The default of 30 minutes covers any single-document call we make in
# practice (chunking handles the rest); it exists only so a runaway child
# process cannot live forever as a zombie.
DEFAULT_TIMEOUT = 30 * 60
LONG_TIMEOUT = 60 * 60  # opus block writing on full case context


async def _kill_and_reap(proc: asyncio.subprocess.Process) -> None:
    """Force-kill the ``claude`` child process and wait for it to exit."""
    try:
        proc.kill()
    except ProcessLookupError:
        # The child already exited on its own; nothing left to clean up.
        return
    await proc.wait()


async def query(
    prompt: str,
    timeout: int = DEFAULT_TIMEOUT,
    max_turns: int = 1,
    *,
    system: str | None = None,
) -> str:
    """Send a prompt to Claude Code headless and return the text response.

    The prompt is written to the child's stdin (not argv) to avoid the OS
    ARG_MAX limit — prompts can be 500K+ chars when analyzing a full style
    corpus.

    Args:
        prompt: The prompt to send.
        timeout: Max seconds before the subprocess is killed.
        max_turns: Max conversation turns (1 = single response).
        system: Optional repeated-instruction text. Prepended to ``prompt``
            for the CLI; it is not passed as a separate arg because the
            CLI doesn't expose API-level caching. The parameter exists so
            extractors can structure their calls cleanly today, and to make
            a future SDK-backed path drop-in.

    Returns:
        The ``result`` field of the CLI's JSON envelope, or the raw stdout
        when the output is not a JSON object carrying a ``result`` field.

    Raises:
        RuntimeError: if the CLI is unavailable (e.g., called from the
            container — see module docstring), exits non-zero, times out,
            returns nothing, or returns a JSON envelope that reports an
            error (``is_error`` true or an ``error*`` subtype).
        asyncio.CancelledError: re-raised if the calling task is cancelled;
            the child process is killed first so it cannot outlive the
            caller.
    """
    full_prompt = f"{system}\n\n{prompt}" if system else prompt

    if len(full_prompt) > 150_000:
        logger.warning("Large prompt: %d chars — may hit context limits", len(full_prompt))

    cmd = [
        "claude", "-p",
        "--output-format", "json",
        "--max-turns", str(max_turns),
    ]

    try:
        proc = await asyncio.create_subprocess_exec(
            *cmd,
            stdin=asyncio.subprocess.PIPE,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        raise RuntimeError(
            "Claude CLI not found. This module only works when invoked "
            "from the local MCP server — see the architectural rule in "
            "the module docstring. If this error came from a FastAPI "
            "endpoint in the container, refactor the call into an MCP "
            "tool that the chair triggers from Claude Code."
        ) from exc

    try:
        stdout_b, stderr_b = await asyncio.wait_for(
            proc.communicate(input=full_prompt.encode("utf-8")),
            timeout=timeout,
        )
    except asyncio.TimeoutError as exc:
        # wait_for cancellation alone leaves the child running.
        await _kill_and_reap(proc)
        raise RuntimeError(f"Claude CLI timed out after {timeout}s") from exc
    except asyncio.CancelledError:
        # The caller was cancelled mid-request. Without this the child
        # would keep running for up to `timeout` seconds with nobody
        # reading its output.
        await _kill_and_reap(proc)
        raise

    if proc.returncode != 0:
        stderr = stderr_b.decode("utf-8", errors="replace").strip()[:500] or "unknown error"
        size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
        raise RuntimeError(f"Claude CLI failed (exit {proc.returncode}): {stderr}{size_info}")

    stdout = stdout_b.decode("utf-8", errors="replace").strip()
    if not stdout:
        raise RuntimeError("Claude CLI returned empty response")

    # claude -p --output-format json returns {"type":"result","result":"..."}
    try:
        data = json.loads(stdout)
    except json.JSONDecodeError:
        return stdout

    if not isinstance(data, dict):
        return stdout

    # The CLI can exit 0 while the envelope itself reports a failure
    # (e.g. subtype "error_max_turns"). Returning that as the model's
    # answer would let callers parse an error envelope as LLM output.
    subtype = data.get("subtype")
    envelope_failed = data.get("is_error") is True or (
        isinstance(subtype, str) and subtype.startswith("error")
    )
    if envelope_failed:
        detail = str(data.get("result") or subtype or "unknown error")[:500]
        raise RuntimeError(f"Claude CLI reported an error: {detail}")

    if "result" in data:
        return data["result"]
    return stdout


async def query_json(
    prompt: str,
    timeout: int = DEFAULT_TIMEOUT,
    *,
    system: str | None = None,
) -> dict | list | None:
    """Run ``query`` and decode its text response as JSON.

    Decoding is delegated to ``parse_llm_json``, which is relied on here
    for robust parsing (markdown-wrapped or truncated output).

    Args:
        prompt: The prompt to send.
        timeout: Max seconds to allow the underlying ``query`` call.
        system: Optional instruction text forwarded to ``query``.

    Returns:
        Whatever ``parse_llm_json`` produces for the response text.
    """
    response_text = await query(prompt, timeout=timeout, system=system)
    parsed = parse_llm_json(response_text)
    return parsed