diff --git a/mcp-server/src/legal_mcp/services/claude_session.py b/mcp-server/src/legal_mcp/services/claude_session.py index d49435a..f6b077b 100644 --- a/mcp-server/src/legal_mcp/services/claude_session.py +++ b/mcp-server/src/legal_mcp/services/claude_session.py @@ -41,25 +41,31 @@ logger = logging.getLogger(__name__) DEFAULT_TIMEOUT = 1800 LONG_TIMEOUT = 3600 # opus block writing on full case context -# Environment markers a running Claude Code session exports into its child -# processes. When an agent itself runs as ``claude -p`` (e.g. the CEO MCP -# instance launched by Paperclip's claude_local adapter), its MCP server -# inherits these, and a *nested* ``claude -p`` we spawn here inherits them -# too. Stripping them makes every nested invocation launch as a clean -# top-level session, decoupled from the parent's session/project state — -# defensive hardening against the #85 nested-``exit 1`` failure (which could -# not be reproduced in a plain interactive session, so the markers are a -# suspect, not a proven cause). Auth/config vars (CLAUDE_CONFIG_DIR, -# ANTHROPIC_*, PATH, HOME) are kept. See TaskMaster legal-ai #85. +# #85 — two complementary hardenings for the same symptom (`claude -p` failing +# with a fast non-zero exit + empty stderr on large/slow cold prompts: CEO +# write_interim_draft, learning_loop distillation): +# +# 1. CLEAN ENV (defensive): a running Claude Code session exports markers into +# child processes; a *nested* ``claude -p`` inherits them. Stripping them lets +# every nested invocation launch as a clean top-level session. Could not be +# reproduced deterministically, so it's a suspect, not a proven cause. Auth/ +# config (CLAUDE_CONFIG_DIR, ANTHROPIC_*, PATH, HOME) are kept. +# 2. RETRY (the real fix): the SAME large prompt that exits 1 once succeeds on a +# plain retry — the bail is transient. Retry with linear backoff. Timeouts and +# "CLI not found" stay deterministic and are NOT retried. +# See TaskMaster legal-ai #85. 
_SESSION_MARKER_PREFIXES = ("CLAUDECODE", "CLAUDE_CODE_", "CLAUDE_AGENT_") _SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"}) +MAX_RETRIES = 3 +RETRY_BACKOFF_BASE = 5 # seconds; sleep after failed attempt N = base * N (MAX_RETRIES counts total attempts, not extra retries) + def _clean_subprocess_env() -> dict[str, str]: """Copy the current env minus Claude Code session markers. Lets a nested ``claude -p`` start fresh instead of detecting it is - already inside a Claude Code session (the #85 nested-exit-1 bug). + already inside a Claude Code session (#85). """ env = dict(os.environ) for key in list(env): @@ -121,66 +127,74 @@ async def query( if effort: cmd += ["--effort", effort] - try: - proc = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - env=_clean_subprocess_env(), - cwd=os.path.expanduser("~"), - ) - except FileNotFoundError: - raise RuntimeError( - "Claude CLI not found. This module only works when invoked " - "from the local MCP server — see the architectural rule in " - "the module docstring. If this error came from a FastAPI " - "endpoint in the container, refactor the call into an MCP " - "tool that the chair triggers from Claude Code." - ) + size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else "" + last_err = "unknown error" - try: - stdout_b, stderr_b = await asyncio.wait_for( - proc.communicate(input=full_prompt.encode("utf-8")), - timeout=timeout, - ) - except asyncio.TimeoutError: - # wait_for cancellation alone leaves the child running. 
+ for attempt in range(1, MAX_RETRIES + 1): try: - proc.kill() - await proc.wait() - except ProcessLookupError: - pass - raise RuntimeError(f"Claude CLI timed out after {timeout}s") + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=_clean_subprocess_env(), + cwd=os.path.expanduser("~"), + ) + except FileNotFoundError: + # Deterministic — never retry. + raise RuntimeError( + "Claude CLI not found. This module only works when invoked " + "from the local MCP server — see the architectural rule in " + "the module docstring. If this error came from a FastAPI " + "endpoint in the container, refactor the call into an MCP " + "tool that the chair triggers from Claude Code." + ) - if proc.returncode != 0: - # Surface the real cause: the CLI sometimes writes its diagnostic to - # stdout (or nowhere) rather than stderr, so a stderr-only message - # collapsed to "unknown error" and hid the actual failure (#85). - stderr = stderr_b.decode("utf-8", errors="replace").strip() - stdout = stdout_b.decode("utf-8", errors="replace").strip() - diagnostic = stderr or stdout or "no output on stderr/stdout" - size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else "" - logger.error( - "claude -p failed (exit %s): stderr=%r stdout=%r", - proc.returncode, stderr[:500], stdout[:500], - ) - raise RuntimeError( - f"Claude CLI failed (exit {proc.returncode}): {diagnostic[:500]}{size_info}" - ) + try: + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(input=full_prompt.encode("utf-8")), + timeout=timeout, + ) + except asyncio.TimeoutError: + # wait_for cancellation alone leaves the child running. A timeout is + # a real ceiling, not a transient blip — don't retry. 
+ try: + proc.kill() + await proc.wait() + except ProcessLookupError: + pass + raise RuntimeError(f"Claude CLI timed out after {timeout}s") - stdout = stdout_b.decode("utf-8", errors="replace").strip() - if not stdout: - raise RuntimeError("Claude CLI returned empty response") + if proc.returncode != 0: + # The CLI sometimes writes its diagnostic to stdout (or nowhere) + # rather than stderr (#85) — surface whichever is present. + stderr = stderr_b.decode("utf-8", errors="replace").strip() + stdout = stdout_b.decode("utf-8", errors="replace").strip() + last_err = f"exit {proc.returncode}: {(stderr or stdout or 'no output')[:500]}" + else: + stdout = stdout_b.decode("utf-8", errors="replace").strip() + if stdout: + # claude -p --output-format json returns {"type":"result","result":"..."} + try: + data = json.loads(stdout) + if isinstance(data, dict) and "result" in data: + return data["result"] + return stdout + except json.JSONDecodeError: + return stdout + last_err = "empty response" - # claude -p --output-format json returns {"type":"result","result":"..."} - try: - data = json.loads(stdout) - if isinstance(data, dict) and "result" in data: - return data["result"] - return stdout - except json.JSONDecodeError: - return stdout + # Transient failure — retry with linear backoff unless this was the last try. + if attempt < MAX_RETRIES: + logger.warning( + "claude -p attempt %d/%d failed (%s%s) — retrying in %ds", + attempt, MAX_RETRIES, last_err, size_info, RETRY_BACKOFF_BASE * attempt, + ) + await asyncio.sleep(RETRY_BACKOFF_BASE * attempt) + + raise RuntimeError( + f"Claude CLI failed after {MAX_RETRIES} attempts ({last_err}){size_info}" + ) async def query_json(