fix(#85): claude_session — retry על כשלים חולפים של claude -p

שורש #85 התברר: `claude -p` נכשל מדי פעם ב-exit מהיר + stderr ריק על
פרומפטים גדולים/איטיים (CEO write_interim_draft, learning_loop distillation),
**אותו פרומפט מצליח בריצה חוזרת** — כשל חולף, לא nesting (אומת: nested claude
מ-bash וגם פרומפט 70K הצליחו; הכשל אינו דטרמיניסטי).

query() עוטף spawn+communicate ב-לולאת retry (MAX_RETRIES=3, backoff לינארי
5s*attempt). FileNotFoundError + timeout נשארים דטרמיניסטיים (ללא retry).
empty-response גם מטופל כ-transient.

אומת e2e: distillation על 1130-25 רץ בהצלחה → pair=analyzed (9 שינויים,
6 style_method, 33.8% diff). פותר גם את write_interim_draft של ה-CEO.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-06 20:05:57 +00:00
parent 5fa76a09b4
commit 9d2536a667

View File

@@ -40,6 +40,14 @@ logger = logging.getLogger(__name__)
DEFAULT_TIMEOUT = 1800
LONG_TIMEOUT = 3600 # opus block writing on full case context
# #85 — `claude -p` fails intermittently with a fast non-zero exit and empty
# stderr (observed on large/slow cold prompts: CEO write_interim_draft,
# learning_loop distillation). The SAME prompt succeeds on retry, so the bail is
# transient — retry with linear backoff. Timeouts and "CLI not found" are
# deterministic and are NOT retried.
MAX_RETRIES = 3
RETRY_BACKOFF_BASE = 5 # seconds; sleep = base * attempt_number
async def query(
prompt: str,
@@ -94,53 +102,69 @@ async def query(
if effort:
cmd += ["--effort", effort]
try:
proc = await asyncio.create_subprocess_exec(
*cmd,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
except FileNotFoundError:
raise RuntimeError(
"Claude CLI not found. This module only works when invoked "
"from the local MCP server — see the architectural rule in "
"the module docstring. If this error came from a FastAPI "
"endpoint in the container, refactor the call into an MCP "
"tool that the chair triggers from Claude Code."
)
size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
last_err = "unknown error"
try:
stdout_b, stderr_b = await asyncio.wait_for(
proc.communicate(input=full_prompt.encode("utf-8")),
timeout=timeout,
)
except asyncio.TimeoutError:
# wait_for cancellation alone leaves the child running.
for attempt in range(1, MAX_RETRIES + 1):
try:
proc.kill()
await proc.wait()
except ProcessLookupError:
pass
raise RuntimeError(f"Claude CLI timed out after {timeout}s")
proc = await asyncio.create_subprocess_exec(
*cmd,
stdin=asyncio.subprocess.PIPE,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
except FileNotFoundError:
# Deterministic — never retry.
raise RuntimeError(
"Claude CLI not found. This module only works when invoked "
"from the local MCP server — see the architectural rule in "
"the module docstring. If this error came from a FastAPI "
"endpoint in the container, refactor the call into an MCP "
"tool that the chair triggers from Claude Code."
)
if proc.returncode != 0:
stderr = stderr_b.decode("utf-8", errors="replace").strip()[:500] or "unknown error"
size_info = f"; prompt_len={len(full_prompt):,} chars" if len(full_prompt) > 100_000 else ""
raise RuntimeError(f"Claude CLI failed (exit {proc.returncode}): {stderr}{size_info}")
try:
stdout_b, stderr_b = await asyncio.wait_for(
proc.communicate(input=full_prompt.encode("utf-8")),
timeout=timeout,
)
except asyncio.TimeoutError:
# wait_for cancellation alone leaves the child running. A timeout is
# a real ceiling, not a transient blip — don't retry.
try:
proc.kill()
await proc.wait()
except ProcessLookupError:
pass
raise RuntimeError(f"Claude CLI timed out after {timeout}s")
stdout = stdout_b.decode("utf-8", errors="replace").strip()
if not stdout:
raise RuntimeError("Claude CLI returned empty response")
if proc.returncode != 0:
stderr = stderr_b.decode("utf-8", errors="replace").strip()[:500] or "unknown error"
last_err = f"exit {proc.returncode}: {stderr}"
else:
stdout = stdout_b.decode("utf-8", errors="replace").strip()
if stdout:
# claude -p --output-format json returns {"type":"result","result":"..."}
try:
data = json.loads(stdout)
if isinstance(data, dict) and "result" in data:
return data["result"]
return stdout
except json.JSONDecodeError:
return stdout
last_err = "empty response"
# claude -p --output-format json returns {"type":"result","result":"..."}
try:
data = json.loads(stdout)
if isinstance(data, dict) and "result" in data:
return data["result"]
return stdout
except json.JSONDecodeError:
return stdout
# Transient failure — retry with linear backoff unless this was the last try.
if attempt < MAX_RETRIES:
logger.warning(
"claude -p attempt %d/%d failed (%s%s) — retrying in %ds",
attempt, MAX_RETRIES, last_err, size_info, RETRY_BACKOFF_BASE * attempt,
)
await asyncio.sleep(RETRY_BACKOFF_BASE * attempt)
raise RuntimeError(
f"Claude CLI failed after {MAX_RETRIES} attempts ({last_err}){size_info}"
)
async def query_json(