fix(halacha): rate-limit refusal ≠ empty answer — לא checkpoint chunk בכשל (#144)
תיקון-ליבה (b): כש-claude CLI מחזיר exit=0 עם הודעת-מגבלה/שגיאה כ-result, query זיהה אותה כהצלחה → _extract_chunk קיבל []/non-list וסימן chunk כ-done-ריק; resume דילג עליו לתמיד → תת-חילוץ קבוע (3→1→0). עכשיו is_error/_looks_like_limit_notice הופכים אותה לכשל-חולף → retry → raise → chunk נשאר un-checkpointed → resume משחזר (כך force-delete כבר לא הרסני-לצמיתות). + churn-detect במתזמר (Δdone<0 / Δhal<-2 → אזהרה+churn_ok ב-JSON). + scripts/reconcile_under_extracted_halacha.py — שחזור completed-עם-0-הלכות-ו≥3 מקטעי-נימוק (dry-run הראה 15 מועמדים); נתיב-הזמנה קנוני (G2), שמרני (לא remand). הערה: אטומיות-מלאה (staging_run_id) נדחתה — PR #257 מיתן את ה-trigger, ו-(b)+resume מונעים אובדן-קבוע (force-delete מתאושש דרך resume). בדיקות: test_claude_session_limit_notice. כל 354 עוברות. guards נקיים. Invariants: G1, INV-G3/X16 (checkpoint=הושלם-באמת), INV-G4 (churn לא-שקט), G12. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -60,6 +60,31 @@ _SESSION_MARKER_EXACT = frozenset({"AI_AGENT", "CLAUDE_EFFORT"})
|
||||
MAX_RETRIES = 3
|
||||
RETRY_BACKOFF_BASE = 5 # seconds; sleep = base * attempt_number
|
||||
|
||||
# Phrases the CLI emits as the "result" of an exit-0 run that actually hit a
|
||||
# usage/rate limit (a refusal NOTICE, not a real answer). Matched only against a
|
||||
# result that is NOT structured output (doesn't start with [ or {), so a genuine
|
||||
# JSON extraction containing these words as content is never mis-flagged.
|
||||
_LIMIT_NOTICE_MARKERS = (
|
||||
"usage limit",
|
||||
"rate limit",
|
||||
"limit reached",
|
||||
"limit will reset",
|
||||
"try again later",
|
||||
)
|
||||
|
||||
|
||||
def _looks_like_limit_notice(data: dict) -> bool:
|
||||
"""True if an exit-0 ``result`` is really a usage/rate-limit refusal."""
|
||||
result = data.get("result")
|
||||
if not isinstance(result, str):
|
||||
return False
|
||||
stripped = result.lstrip()
|
||||
# Structured output (JSON array/object) is a real answer, never a notice.
|
||||
if stripped.startswith(("[", "{")):
|
||||
return False
|
||||
low = result.lower()
|
||||
return any(m in low for m in _LIMIT_NOTICE_MARKERS)
|
||||
|
||||
|
||||
def _clean_subprocess_env() -> dict[str, str]:
|
||||
"""Copy the current env minus Claude Code session markers.
|
||||
@@ -187,11 +212,27 @@ async def query(
|
||||
try:
|
||||
data = json.loads(stdout)
|
||||
if isinstance(data, dict) and "result" in data:
|
||||
return data["result"]
|
||||
return stdout
|
||||
# A usage/rate-limit hit can exit 0 with a refusal NOTICE
|
||||
# as the "result" (is_error / error subtype). Returning it
|
||||
# as success makes callers treat a throttled run as a real
|
||||
# empty answer — e.g. the halacha extractor then checkpoints
|
||||
# the chunk as done-with-0-halachot and a resume skips it
|
||||
# forever (#138/#144 silent under-extraction). Treat it as a
|
||||
# transient failure → retry, and raise if it persists so the
|
||||
# chunk stays un-checkpointed for a real resume.
|
||||
if data.get("is_error") or _looks_like_limit_notice(data):
|
||||
last_err = (
|
||||
f"error result (subtype={data.get('subtype')}): "
|
||||
f"{str(data.get('result',''))[:200]}"
|
||||
)
|
||||
else:
|
||||
return data["result"]
|
||||
else:
|
||||
return stdout
|
||||
except json.JSONDecodeError:
|
||||
return stdout
|
||||
last_err = "empty response"
|
||||
else:
|
||||
last_err = "empty response"
|
||||
|
||||
# Transient failure — retry with linear backoff unless this was the last try.
|
||||
if attempt < MAX_RETRIES:
|
||||
|
||||
Reference in New Issue
Block a user