fix(#85): claude_session — retry על כשלים חולפים של claude -p

שורש #85 התברר: `claude -p` נכשל מדי פעם ב-exit מהיר (קוד יציאה שאינו אפס) +
stderr ריק על פרומפטים גדולים/איטיים (CEO write_interim_draft, learning_loop
distillation), אך **אותו פרומפט מצליח בריצה חוזרת** — כשל חולף, לא nesting
(אומת: nested claude מ-bash וגם פרומפט 70K הצליחו; הכשל אינו דטרמיניסטי).

query() עוטף spawn+communicate בלולאת retry (MAX_RETRIES=3 — שלושה ניסיונות בסך
הכול, כולל הראשון; backoff לינארי 5s*attempt). FileNotFoundError ו-timeout
נשארים דטרמיניסטיים (ללא retry). גם empty-response מטופל כ-transient.

אומת e2e: distillation על 1130-25 רץ בהצלחה → pair=analyzed (9 שינויים,
6 style_method, 33.8% diff). פותר גם את write_interim_draft של ה-CEO.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-06 20:05:57 +00:00
parent 85c5a4aacb
commit e096c51037

View File

@@ -41,25 +41,31 @@ logger = logging.getLogger(__name__)
DEFAULT_TIMEOUT = 1800 DEFAULT_TIMEOUT = 1800
LONG_TIMEOUT = 3600 # opus block writing on full case context LONG_TIMEOUT = 3600 # opus block writing on full case context
# Environment markers a running Claude Code session exports into its child # #85 — two complementary hardenings for the same symptom (`claude -p` failing
# processes. When an agent itself runs as ``claude -p`` (e.g. the CEO MCP # with a fast non-zero exit + empty stderr on large/slow cold prompts: CEO
# instance launched by Paperclip's claude_local adapter), its MCP server # write_interim_draft, learning_loop distillation):
# inherits these, and a *nested* ``claude -p`` we spawn here inherits them #
# too. Stripping them makes every nested invocation launch as a clean # 1. CLEAN ENV (defensive): a running Claude Code session exports markers into
# top-level session, decoupled from the parent's session/project state — # child processes; a *nested* ``claude -p`` inherits them. Stripping them lets
# defensive hardening against the #85 nested-``exit 1`` failure (which could # every nested invocation launch as a clean top-level session. Could not be
# not be reproduced in a plain interactive session, so the markers are a # reproduced deterministically, so it's a suspect, not a proven cause. Auth/
# suspect, not a proven cause). Auth/config vars (CLAUDE_CONFIG_DIR, # config (CLAUDE_CONFIG_DIR, ANTHROPIC_*, PATH, HOME) are kept.
# ANTHROPIC_*, PATH, HOME) are kept. See TaskMaster legal-ai #85. # 2. RETRY (the real fix): the SAME large prompt that exits 1 once succeeds on a
# plain retry — the bail is transient. Retry with linear backoff. Timeouts and
# "CLI not found" stay deterministic and are NOT retried.
# See TaskMaster legal-ai #85.
_SESSION_MARKER_PREFIXES = ("CLAUDECODE", "CLAUDE_CODE_", "CLAUDE_AGENT_") _SESSION_MARKER_PREFIXES = ("CLAUDECODE", "CLAUDE_CODE_", "CLAUDE_AGENT_")
_SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"}) _SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"})
MAX_RETRIES = 3
RETRY_BACKOFF_BASE = 5 # seconds; sleep = base * attempt_number
def _clean_subprocess_env() -> dict[str, str]: def _clean_subprocess_env() -> dict[str, str]:
"""Copy the current env minus Claude Code session markers. """Copy the current env minus Claude Code session markers.
Lets a nested ``claude -p`` start fresh instead of detecting it is Lets a nested ``claude -p`` start fresh instead of detecting it is
already inside a Claude Code session (the #85 nested-exit-1 bug). already inside a Claude Code session (#85).
""" """
env = dict(os.environ) env = dict(os.environ)
for key in list(env): for key in list(env):
@@ -121,66 +127,74 @@ async def query(
if effort: if effort:
cmd += ["--effort", effort] cmd += ["--effort", effort]
try: size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
proc = await asyncio.create_subprocess_exec( last_err = "unknown error"
*cmd,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
env=_clean_subprocess_env(),
cwd=os.path.expanduser("~"),
)
except FileNotFoundError:
raise RuntimeError(
"Claude CLI not found. This module only works when invoked "
"from the local MCP server — see the architectural rule in "
"the module docstring. If this error came from a FastAPI "
"endpoint in the container, refactor the call into an MCP "
"tool that the chair triggers from Claude Code."
)
try: for attempt in range(1, MAX_RETRIES + 1):
stdout_b, stderr_b = await asyncio.wait_for(
proc.communicate(input=full_prompt.encode("utf-8")),
timeout=timeout,
)
except asyncio.TimeoutError:
# wait_for cancellation alone leaves the child running.
try: try:
proc.kill() proc = await asyncio.create_subprocess_exec(
await proc.wait() *cmd,
except ProcessLookupError: stdin=asyncio.subprocess.PIPE,
pass stdout=asyncio.subprocess.PIPE,
raise RuntimeError(f"Claude CLI timed out after {timeout}s") stderr=asyncio.subprocess.PIPE,
env=_clean_subprocess_env(),
cwd=os.path.expanduser("~"),
)
except FileNotFoundError:
# Deterministic — never retry.
raise RuntimeError(
"Claude CLI not found. This module only works when invoked "
"from the local MCP server — see the architectural rule in "
"the module docstring. If this error came from a FastAPI "
"endpoint in the container, refactor the call into an MCP "
"tool that the chair triggers from Claude Code."
)
if proc.returncode != 0: try:
# Surface the real cause: the CLI sometimes writes its diagnostic to stdout_b, stderr_b = await asyncio.wait_for(
# stdout (or nowhere) rather than stderr, so a stderr-only message proc.communicate(input=full_prompt.encode("utf-8")),
# collapsed to "unknown error" and hid the actual failure (#85). timeout=timeout,
stderr = stderr_b.decode("utf-8", errors="replace").strip() )
stdout = stdout_b.decode("utf-8", errors="replace").strip() except asyncio.TimeoutError:
diagnostic = stderr or stdout or "no output on stderr/stdout" # wait_for cancellation alone leaves the child running. A timeout is
size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else "" # a real ceiling, not a transient blip — don't retry.
logger.error( try:
"claude -p failed (exit %s): stderr=%r stdout=%r", proc.kill()
proc.returncode, stderr[:500], stdout[:500], await proc.wait()
) except ProcessLookupError:
raise RuntimeError( pass
f"Claude CLI failed (exit {proc.returncode}): {diagnostic[:500]}{size_info}" raise RuntimeError(f"Claude CLI timed out after {timeout}s")
)
stdout = stdout_b.decode("utf-8", errors="replace").strip() if proc.returncode != 0:
if not stdout: # The CLI sometimes writes its diagnostic to stdout (or nowhere)
raise RuntimeError("Claude CLI returned empty response") # rather than stderr (#85) — surface whichever is present.
stderr = stderr_b.decode("utf-8", errors="replace").strip()
stdout = stdout_b.decode("utf-8", errors="replace").strip()
last_err = f"exit {proc.returncode}: {(stderr or stdout or 'no output')[:500]}"
else:
stdout = stdout_b.decode("utf-8", errors="replace").strip()
if stdout:
# claude -p --output-format json returns {"type":"result","result":"..."}
try:
data = json.loads(stdout)
if isinstance(data, dict) and "result" in data:
return data["result"]
return stdout
except json.JSONDecodeError:
return stdout
last_err = "empty response"
# claude -p --output-format json returns {"type":"result","result":"..."} # Transient failure — retry with linear backoff unless this was the last try.
try: if attempt < MAX_RETRIES:
data = json.loads(stdout) logger.warning(
if isinstance(data, dict) and "result" in data: "claude -p attempt %d/%d failed (%s%s) — retrying in %ds",
return data["result"] attempt, MAX_RETRIES, last_err, size_info, RETRY_BACKOFF_BASE * attempt,
return stdout )
except json.JSONDecodeError: await asyncio.sleep(RETRY_BACKOFF_BASE * attempt)
return stdout
raise RuntimeError(
f"Claude CLI failed after {MAX_RETRIES} attempts ({last_err}){size_info}"
)
async def query_json( async def query_json(