fix(#85): claude_session — retry על כשלים חולפים של claude -p

שורש #85 התברר: `claude -p` נכשל מדי פעם ב-exit מהיר + stderr ריק על פרומפטים גדולים/איטיים (CEO write_interim_draft, learning_loop distillation), אך **אותו פרומפט מצליח בריצה חוזרת** — כשל חולף, לא nesting (אומת: nested claude מ-bash וגם פרומפט 70K הצליחו; הכשל אינו דטרמיניסטי). query() עוטף spawn+communicate בלולאת retry (MAX_RETRIES=3, backoff לינארי 5s*attempt). FileNotFoundError + timeout נשארים דטרמיניסטיים (ללא retry). empty-response גם מטופל כ-transient. אומת e2e: distillation על 1130-25 רץ בהצלחה → pair=analyzed (9 שינויים, 6 style_method, 33.8% diff). פותר גם את write_interim_draft של ה-CEO. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-06 20:05:57 +00:00
parent 85c5a4aacb
commit e096c51037
1 changed files with 80 additions and 66 deletions
--- a/mcp-server/src/legal_mcp/services/claude_session.py
+++ b/mcp-server/src/legal_mcp/services/claude_session.py
@@ -41,25 +41,31 @@ logger = logging.getLogger(__name__)
 DEFAULT_TIMEOUT = 1800
 LONG_TIMEOUT = 3600  # opus block writing on full case context

-# Environment markers a running Claude Code session exports into its child
-# processes. When an agent itself runs as ``claude -p`` (e.g. the CEO MCP
-# instance launched by Paperclip's claude_local adapter), its MCP server
-# inherits these, and a *nested* ``claude -p`` we spawn here inherits them
-# too. Stripping them makes every nested invocation launch as a clean
-# top-level session, decoupled from the parent's session/project state —
-# defensive hardening against the #85 nested-``exit 1`` failure (which could
-# not be reproduced in a plain interactive session, so the markers are a
-# suspect, not a proven cause). Auth/config vars (CLAUDE_CONFIG_DIR,
-# ANTHROPIC_*, PATH, HOME) are kept. See TaskMaster legal-ai #85.
+# #85 — two complementary hardenings for the same symptom (`claude -p` failing
+# with a fast non-zero exit + empty stderr on large/slow cold prompts: CEO
+# write_interim_draft, learning_loop distillation):
+#
+# 1. CLEAN ENV (defensive): a running Claude Code session exports markers into
+#    child processes; a *nested* ``claude -p`` inherits them. Stripping them lets
+#    every nested invocation launch as a clean top-level session. The failure
+#    could not be reproduced deterministically, so the markers are a suspect, not
+#    a proven cause. Auth/config (CLAUDE_CONFIG_DIR, ANTHROPIC_*, PATH, HOME) are kept.
+# 2. RETRY (the real fix): the SAME large prompt that exits 1 once succeeds on a
+#    plain retry — the bail is transient. Retry with linear backoff. Timeouts and
+#    "CLI not found" stay deterministic and are NOT retried.
+# See TaskMaster legal-ai #85.
 _SESSION_MARKER_PREFIXES = ("CLAUDECODE", "CLAUDE_CODE_", "CLAUDE_AGENT_")
 _SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"})

+MAX_RETRIES = 3
+RETRY_BACKOFF_BASE = 5  # seconds; sleep = base * attempt_number
+

 def _clean_subprocess_env() -> dict[str, str]:
    """Copy the current env minus Claude Code session markers.

    Lets a nested ``claude -p`` start fresh instead of detecting it is
-    already inside a Claude Code session (the #85 nested-exit-1 bug).
+    already inside a Claude Code session (#85).
    """
    env = dict(os.environ)
    for key in list(env):
@@ -121,66 +127,74 @@ async def query(
    if effort:
        cmd += ["--effort", effort]

-    try:
-        proc = await asyncio.create_subprocess_exec(
-            *cmd,
-            stdin=asyncio.subprocess.PIPE,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.PIPE,
-            env=_clean_subprocess_env(),
-            cwd=os.path.expanduser("~"),
-        )
-    except FileNotFoundError:
-        raise RuntimeError(
-            "Claude CLI not found. This module only works when invoked "
-            "from the local MCP server — see the architectural rule in "
-            "the module docstring. If this error came from a FastAPI "
-            "endpoint in the container, refactor the call into an MCP "
-            "tool that the chair triggers from Claude Code."
-        )
+    size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
+    last_err = "unknown error"

-    try:
-        stdout_b, stderr_b = await asyncio.wait_for(
-            proc.communicate(input=full_prompt.encode("utf-8")),
-            timeout=timeout,
-        )
-    except asyncio.TimeoutError:
-        # wait_for cancellation alone leaves the child running.
+    for attempt in range(1, MAX_RETRIES + 1):
        try:
-            proc.kill()
-            await proc.wait()
-        except ProcessLookupError:
-            pass
-        raise RuntimeError(f"Claude CLI timed out after {timeout}s")
+            proc = await asyncio.create_subprocess_exec(
+                *cmd,
+                stdin=asyncio.subprocess.PIPE,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+                env=_clean_subprocess_env(),
+                cwd=os.path.expanduser("~"),
+            )
+        except FileNotFoundError:
+            # Deterministic — never retry.
+            raise RuntimeError(
+                "Claude CLI not found. This module only works when invoked "
+                "from the local MCP server — see the architectural rule in "
+                "the module docstring. If this error came from a FastAPI "
+                "endpoint in the container, refactor the call into an MCP "
+                "tool that the chair triggers from Claude Code."
+            )

-    if proc.returncode != 0:
-        # Surface the real cause: the CLI sometimes writes its diagnostic to
-        # stdout (or nowhere) rather than stderr, so a stderr-only message
-        # collapsed to "unknown error" and hid the actual failure (#85).
-        stderr = stderr_b.decode("utf-8", errors="replace").strip()
-        stdout = stdout_b.decode("utf-8", errors="replace").strip()
-        diagnostic = stderr or stdout or "no output on stderr/stdout"
-        size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
-        logger.error(
-            "claude -p failed (exit %s): stderr=%r stdout=%r",
-            proc.returncode, stderr[:500], stdout[:500],
-        )
-        raise RuntimeError(
-            f"Claude CLI failed (exit {proc.returncode}): {diagnostic[:500]}{size_info}"
-        )
+        try:
+            stdout_b, stderr_b = await asyncio.wait_for(
+                proc.communicate(input=full_prompt.encode("utf-8")),
+                timeout=timeout,
+            )
+        except asyncio.TimeoutError:
+            # wait_for cancellation alone leaves the child running. A timeout is
+            # a real ceiling, not a transient blip — don't retry.
+            try:
+                proc.kill()
+                await proc.wait()
+            except ProcessLookupError:
+                pass
+            raise RuntimeError(f"Claude CLI timed out after {timeout}s")

-    stdout = stdout_b.decode("utf-8", errors="replace").strip()
-    if not stdout:
-        raise RuntimeError("Claude CLI returned empty response")
+        if proc.returncode != 0:
+            # The CLI sometimes writes its diagnostic to stdout (or nowhere)
+            # rather than stderr (#85) — surface whichever is present.
+            stderr = stderr_b.decode("utf-8", errors="replace").strip()
+            stdout = stdout_b.decode("utf-8", errors="replace").strip()
+            last_err = f"exit {proc.returncode}: {(stderr or stdout or 'no output')[:500]}"
+        else:
+            stdout = stdout_b.decode("utf-8", errors="replace").strip()
+            if stdout:
+                # claude -p --output-format json returns {"type":"result","result":"..."}
+                try:
+                    data = json.loads(stdout)
+                    if isinstance(data, dict) and "result" in data:
+                        return data["result"]
+                    return stdout
+                except json.JSONDecodeError:
+                    return stdout
+            last_err = "empty response"

-    # claude -p --output-format json returns {"type":"result","result":"..."}
-    try:
-        data = json.loads(stdout)
-        if isinstance(data, dict) and "result" in data:
-            return data["result"]
-        return stdout
-    except json.JSONDecodeError:
-        return stdout
+        # Transient failure — retry with linear backoff unless this was the last try.
+        if attempt < MAX_RETRIES:
+            logger.warning(
+                "claude -p attempt %d/%d failed (%s%s) — retrying in %ds",
+                attempt, MAX_RETRIES, last_err, size_info, RETRY_BACKOFF_BASE * attempt,
+            )
+            await asyncio.sleep(RETRY_BACKOFF_BASE * attempt)
+
+    raise RuntimeError(
+        f"Claude CLI failed after {MAX_RETRIES} attempts ({last_err}){size_info}"
+    )


 async def query_json(