From 9d2536a66707c47994aebdceb334b76cb4c63291 Mon Sep 17 00:00:00 2001 From: Chaim Date: Sat, 6 Jun 2026 20:05:57 +0000 Subject: [PATCH] =?UTF-8?q?fix(#85):=20claude=5Fsession=20=E2=80=94=20retr?= =?UTF-8?q?y=20=D7=A2=D7=9C=20=D7=9B=D7=A9=D7=9C=D7=99=D7=9D=20=D7=97?= =?UTF-8?q?=D7=95=D7=9C=D7=A4=D7=99=D7=9D=20=D7=A9=D7=9C=20claude=20-p?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit שורש #85 התברר: `claude -p` נכשל מדי פעם ב-exit מהיר + stderr ריק על פרומפטים גדולים/איטיים (CEO write_interim_draft, learning_loop distillation), **אותו פרומפט מצליח בריצה חוזרת** — כשל חולף, לא nesting (אומת: nested claude מ-bash וגם פרומפט 70K הצליחו; הכשל אינו דטרמיניסטי). query() עוטף spawn+communicate ב-לולאת retry (MAX_RETRIES=3, backoff לינארי 5s*attempt). FileNotFoundError + timeout נשארים דטרמיניסטיים (ללא retry). empty-response גם מטופל כ-transient. אומת e2e: distillation על 1130-25 רץ בהצלחה → pair=analyzed (9 שינויים, 6 style_method, 33.8% diff). פותר גם את write_interim_draft של ה-CEO. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/legal_mcp/services/claude_session.py | 108 +++++++++++------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/mcp-server/src/legal_mcp/services/claude_session.py b/mcp-server/src/legal_mcp/services/claude_session.py index c44916d..1d6ed15 100644 --- a/mcp-server/src/legal_mcp/services/claude_session.py +++ b/mcp-server/src/legal_mcp/services/claude_session.py @@ -40,6 +40,14 @@ logger = logging.getLogger(__name__) DEFAULT_TIMEOUT = 1800 LONG_TIMEOUT = 3600 # opus block writing on full case context +# #85 — `claude -p` fails intermittently with a fast non-zero exit and empty +# stderr (observed on large/slow cold prompts: CEO write_interim_draft, +# learning_loop distillation). The SAME prompt succeeds on retry, so the bail is +# transient — retry with linear backoff. Timeouts and "CLI not found" are +# deterministic and are NOT retried. +MAX_RETRIES = 3 +RETRY_BACKOFF_BASE = 5 # seconds; sleep = base * attempt_number + async def query( prompt: str, @@ -94,53 +102,69 @@ async def query( if effort: cmd += ["--effort", effort] - try: - proc = await asyncio.create_subprocess_exec( - *cmd, - stdin=asyncio.subprocess.PIPE, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - ) - except FileNotFoundError: - raise RuntimeError( - "Claude CLI not found. This module only works when invoked " - "from the local MCP server — see the architectural rule in " - "the module docstring. If this error came from a FastAPI " - "endpoint in the container, refactor the call into an MCP " - "tool that the chair triggers from Claude Code." - ) + size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else "" + last_err = "unknown error" - try: - stdout_b, stderr_b = await asyncio.wait_for( - proc.communicate(input=full_prompt.encode("utf-8")), - timeout=timeout, - ) - except asyncio.TimeoutError: - # wait_for cancellation alone leaves the child running. + for attempt in range(1, MAX_RETRIES + 1): try: - proc.kill() - await proc.wait() - except ProcessLookupError: - pass - raise RuntimeError(f"Claude CLI timed out after {timeout}s") + proc = await asyncio.create_subprocess_exec( + *cmd, + stdin=asyncio.subprocess.PIPE, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + except FileNotFoundError: + # Deterministic — never retry. + raise RuntimeError( + "Claude CLI not found. This module only works when invoked " + "from the local MCP server — see the architectural rule in " + "the module docstring. If this error came from a FastAPI " + "endpoint in the container, refactor the call into an MCP " + "tool that the chair triggers from Claude Code." + ) - if proc.returncode != 0: - stderr = stderr_b.decode("utf-8", errors="replace").strip()[:500] or "unknown error" - size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else "" - raise RuntimeError(f"Claude CLI failed (exit {proc.returncode}): {stderr}{size_info}") + try: + stdout_b, stderr_b = await asyncio.wait_for( + proc.communicate(input=full_prompt.encode("utf-8")), + timeout=timeout, + ) + except asyncio.TimeoutError: + # wait_for cancellation alone leaves the child running. A timeout is + # a real ceiling, not a transient blip — don't retry. + try: + proc.kill() + await proc.wait() + except ProcessLookupError: + pass + raise RuntimeError(f"Claude CLI timed out after {timeout}s") - stdout = stdout_b.decode("utf-8", errors="replace").strip() - if not stdout: - raise RuntimeError("Claude CLI returned empty response") + if proc.returncode != 0: + stderr = stderr_b.decode("utf-8", errors="replace").strip()[:500] or "unknown error" + last_err = f"exit {proc.returncode}: {stderr}" + else: + stdout = stdout_b.decode("utf-8", errors="replace").strip() + if stdout: + # claude -p --output-format json returns {"type":"result","result":"..."} + try: + data = json.loads(stdout) + if isinstance(data, dict) and "result" in data: + return data["result"] + return stdout + except json.JSONDecodeError: + return stdout + last_err = "empty response" - # claude -p --output-format json returns {"type":"result","result":"..."} - try: - data = json.loads(stdout) - if isinstance(data, dict) and "result" in data: - return data["result"] - return stdout - except json.JSONDecodeError: - return stdout + # Transient failure — retry with linear backoff unless this was the last try. + if attempt < MAX_RETRIES: + logger.warning( + "claude -p attempt %d/%d failed (%s%s) — retrying in %ds", + attempt, MAX_RETRIES, last_err, size_info, RETRY_BACKOFF_BASE * attempt, + ) + await asyncio.sleep(RETRY_BACKOFF_BASE * attempt) + + raise RuntimeError( + f"Claude CLI failed after {MAX_RETRIES} attempts ({last_err}){size_info}" + ) async def query_json(