fix(#85): claude_session — retry על כשלים חולפים של claude -p
שורש #85 התברר: `claude -p` נכשל מדי פעם ב-exit מהיר + stderr ריק על פרומפטים גדולים/איטיים (CEO write_interim_draft, learning_loop distillation), אך **אותו פרומפט מצליח בריצה חוזרת** — כשל חולף, לא nesting (אומת: nested claude מ-bash וגם פרומפט 70K הצליחו; הכשל אינו דטרמיניסטי). query() עוטף spawn+communicate בלולאת retry (MAX_RETRIES=3, backoff לינארי 5s*attempt). FileNotFoundError + timeout נשארים דטרמיניסטיים (ללא retry). empty-response גם מטופל כ-transient. אומת e2e: distillation על 1130-25 רץ בהצלחה → pair=analyzed (9 שינויים, 6 style_method, 33.8% diff). פותר גם את write_interim_draft של ה-CEO. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -41,25 +41,31 @@ logger = logging.getLogger(__name__)
|
|||||||
DEFAULT_TIMEOUT = 1800
|
DEFAULT_TIMEOUT = 1800
|
||||||
LONG_TIMEOUT = 3600 # opus block writing on full case context
|
LONG_TIMEOUT = 3600 # opus block writing on full case context
|
||||||
|
|
||||||
# Environment markers a running Claude Code session exports into its child
|
# #85 — two complementary hardenings for the same symptom (`claude -p` failing
|
||||||
# processes. When an agent itself runs as ``claude -p`` (e.g. the CEO MCP
|
# with a fast non-zero exit + empty stderr on large/slow cold prompts: CEO
|
||||||
# instance launched by Paperclip's claude_local adapter), its MCP server
|
# write_interim_draft, learning_loop distillation):
|
||||||
# inherits these, and a *nested* ``claude -p`` we spawn here inherits them
|
#
|
||||||
# too. Stripping them makes every nested invocation launch as a clean
|
# 1. CLEAN ENV (defensive): a running Claude Code session exports markers into
|
||||||
# top-level session, decoupled from the parent's session/project state —
|
# child processes; a *nested* ``claude -p`` inherits them. Stripping them lets
|
||||||
# defensive hardening against the #85 nested-``exit 1`` failure (which could
|
# every nested invocation launch as a clean top-level session. Could not be
|
||||||
# not be reproduced in a plain interactive session, so the markers are a
|
# reproduced deterministically, so it's a suspect, not a proven cause. Auth/
|
||||||
# suspect, not a proven cause). Auth/config vars (CLAUDE_CONFIG_DIR,
|
# config (CLAUDE_CONFIG_DIR, ANTHROPIC_*, PATH, HOME) are kept.
|
||||||
# ANTHROPIC_*, PATH, HOME) are kept. See TaskMaster legal-ai #85.
|
# 2. RETRY (the real fix): the SAME large prompt that exits 1 once succeeds on a
|
||||||
|
# plain retry — the bail is transient. Retry with linear backoff. Timeouts and
|
||||||
|
# "CLI not found" stay deterministic and are NOT retried.
|
||||||
|
# See TaskMaster legal-ai #85.
|
||||||
_SESSION_MARKER_PREFIXES = ("CLAUDECODE", "CLAUDE_CODE_", "CLAUDE_AGENT_")
|
_SESSION_MARKER_PREFIXES = ("CLAUDECODE", "CLAUDE_CODE_", "CLAUDE_AGENT_")
|
||||||
_SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"})
|
_SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"})
|
||||||
|
|
||||||
|
MAX_RETRIES = 3
|
||||||
|
RETRY_BACKOFF_BASE = 5 # seconds; sleep = base * attempt_number
|
||||||
|
|
||||||
|
|
||||||
def _clean_subprocess_env() -> dict[str, str]:
|
def _clean_subprocess_env() -> dict[str, str]:
|
||||||
"""Copy the current env minus Claude Code session markers.
|
"""Copy the current env minus Claude Code session markers.
|
||||||
|
|
||||||
Lets a nested ``claude -p`` start fresh instead of detecting it is
|
Lets a nested ``claude -p`` start fresh instead of detecting it is
|
||||||
already inside a Claude Code session (the #85 nested-exit-1 bug).
|
already inside a Claude Code session (#85).
|
||||||
"""
|
"""
|
||||||
env = dict(os.environ)
|
env = dict(os.environ)
|
||||||
for key in list(env):
|
for key in list(env):
|
||||||
@@ -121,66 +127,74 @@ async def query(
|
|||||||
if effort:
|
if effort:
|
||||||
cmd += ["--effort", effort]
|
cmd += ["--effort", effort]
|
||||||
|
|
||||||
try:
|
size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
|
||||||
proc = await asyncio.create_subprocess_exec(
|
last_err = "unknown error"
|
||||||
*cmd,
|
|
||||||
stdin=asyncio.subprocess.PIPE,
|
|
||||||
stdout=asyncio.subprocess.PIPE,
|
|
||||||
stderr=asyncio.subprocess.PIPE,
|
|
||||||
env=_clean_subprocess_env(),
|
|
||||||
cwd=os.path.expanduser("~"),
|
|
||||||
)
|
|
||||||
except FileNotFoundError:
|
|
||||||
raise RuntimeError(
|
|
||||||
"Claude CLI not found. This module only works when invoked "
|
|
||||||
"from the local MCP server — see the architectural rule in "
|
|
||||||
"the module docstring. If this error came from a FastAPI "
|
|
||||||
"endpoint in the container, refactor the call into an MCP "
|
|
||||||
"tool that the chair triggers from Claude Code."
|
|
||||||
)
|
|
||||||
|
|
||||||
try:
|
for attempt in range(1, MAX_RETRIES + 1):
|
||||||
stdout_b, stderr_b = await asyncio.wait_for(
|
|
||||||
proc.communicate(input=full_prompt.encode("utf-8")),
|
|
||||||
timeout=timeout,
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError:
|
|
||||||
# wait_for cancellation alone leaves the child running.
|
|
||||||
try:
|
try:
|
||||||
proc.kill()
|
proc = await asyncio.create_subprocess_exec(
|
||||||
await proc.wait()
|
*cmd,
|
||||||
except ProcessLookupError:
|
stdin=asyncio.subprocess.PIPE,
|
||||||
pass
|
stdout=asyncio.subprocess.PIPE,
|
||||||
raise RuntimeError(f"Claude CLI timed out after {timeout}s")
|
stderr=asyncio.subprocess.PIPE,
|
||||||
|
env=_clean_subprocess_env(),
|
||||||
|
cwd=os.path.expanduser("~"),
|
||||||
|
)
|
||||||
|
except FileNotFoundError:
|
||||||
|
# Deterministic — never retry.
|
||||||
|
raise RuntimeError(
|
||||||
|
"Claude CLI not found. This module only works when invoked "
|
||||||
|
"from the local MCP server — see the architectural rule in "
|
||||||
|
"the module docstring. If this error came from a FastAPI "
|
||||||
|
"endpoint in the container, refactor the call into an MCP "
|
||||||
|
"tool that the chair triggers from Claude Code."
|
||||||
|
)
|
||||||
|
|
||||||
if proc.returncode != 0:
|
try:
|
||||||
# Surface the real cause: the CLI sometimes writes its diagnostic to
|
stdout_b, stderr_b = await asyncio.wait_for(
|
||||||
# stdout (or nowhere) rather than stderr, so a stderr-only message
|
proc.communicate(input=full_prompt.encode("utf-8")),
|
||||||
# collapsed to "unknown error" and hid the actual failure (#85).
|
timeout=timeout,
|
||||||
stderr = stderr_b.decode("utf-8", errors="replace").strip()
|
)
|
||||||
stdout = stdout_b.decode("utf-8", errors="replace").strip()
|
except asyncio.TimeoutError:
|
||||||
diagnostic = stderr or stdout or "no output on stderr/stdout"
|
# wait_for cancellation alone leaves the child running. A timeout is
|
||||||
size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
|
# a real ceiling, not a transient blip — don't retry.
|
||||||
logger.error(
|
try:
|
||||||
"claude -p failed (exit %s): stderr=%r stdout=%r",
|
proc.kill()
|
||||||
proc.returncode, stderr[:500], stdout[:500],
|
await proc.wait()
|
||||||
)
|
except ProcessLookupError:
|
||||||
raise RuntimeError(
|
pass
|
||||||
f"Claude CLI failed (exit {proc.returncode}): {diagnostic[:500]}{size_info}"
|
raise RuntimeError(f"Claude CLI timed out after {timeout}s")
|
||||||
)
|
|
||||||
|
|
||||||
stdout = stdout_b.decode("utf-8", errors="replace").strip()
|
if proc.returncode != 0:
|
||||||
if not stdout:
|
# The CLI sometimes writes its diagnostic to stdout (or nowhere)
|
||||||
raise RuntimeError("Claude CLI returned empty response")
|
# rather than stderr (#85) — surface whichever is present.
|
||||||
|
stderr = stderr_b.decode("utf-8", errors="replace").strip()
|
||||||
|
stdout = stdout_b.decode("utf-8", errors="replace").strip()
|
||||||
|
last_err = f"exit {proc.returncode}: {(stderr or stdout or 'no output')[:500]}"
|
||||||
|
else:
|
||||||
|
stdout = stdout_b.decode("utf-8", errors="replace").strip()
|
||||||
|
if stdout:
|
||||||
|
# claude -p --output-format json returns {"type":"result","result":"..."}
|
||||||
|
try:
|
||||||
|
data = json.loads(stdout)
|
||||||
|
if isinstance(data, dict) and "result" in data:
|
||||||
|
return data["result"]
|
||||||
|
return stdout
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return stdout
|
||||||
|
last_err = "empty response"
|
||||||
|
|
||||||
# claude -p --output-format json returns {"type":"result","result":"..."}
|
# Transient failure — retry with linear backoff unless this was the last try.
|
||||||
try:
|
if attempt < MAX_RETRIES:
|
||||||
data = json.loads(stdout)
|
logger.warning(
|
||||||
if isinstance(data, dict) and "result" in data:
|
"claude -p attempt %d/%d failed (%s%s) — retrying in %ds",
|
||||||
return data["result"]
|
attempt, MAX_RETRIES, last_err, size_info, RETRY_BACKOFF_BASE * attempt,
|
||||||
return stdout
|
)
|
||||||
except json.JSONDecodeError:
|
await asyncio.sleep(RETRY_BACKOFF_BASE * attempt)
|
||||||
return stdout
|
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Claude CLI failed after {MAX_RETRIES} attempts ({last_err}){size_info}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def query_json(
|
async def query_json(
|
||||||
|
|||||||
Reference in New Issue
Block a user