fix(halacha): fresh CLI 429 is ground truth over the usage endpoint (rate-limit churn) #257

Merged
chaim merged 1 commit from worktree-halacha-ratelimit-groundtruth into main 2026-06-15 02:28:51 +00:00
Showing only changes of commit 1340bff6f1 - Show all commits

View File

@@ -22,10 +22,13 @@ log tails →
• re-triggers the one-shot drain when idle and the queue is non-empty • re-triggers the one-shot drain when idle and the queue is non-empty
• restarts a HUNG run (online but no new chunk-checkpoint for > 25 min — the • restarts a HUNG run (online but no new chunk-checkpoint for > 25 min — the
REAL liveness signal; the out-log only updates when a whole CASE finishes) REAL liveness signal; the out-log only updates when a whole CASE finishes)
• backs off on rate-limit until quota resets — PRIMARY signal is the authoritative • backs off on rate-limit until quota resets. TWO signals, EITHER triggers a
OAuth usage endpoint (durable; the same util the Claude Code UI shows), with the cooldown: the authoritative OAuth usage endpoint (durable; the util the Claude
log 429 only as a fallback when that endpoint is unreachable. While limited it Code UI shows) AND a fresh CLI 429 in the logs. A fresh 429 is GROUND TRUTH —
STOPS the drain (no 429-hammering) and re-ignites once quota is back. the call is literally failing — so it overrides an endpoint that wrongly reports
the window available, and it VETOES the early resume. While limited it STOPS the
drain (no 429-hammering, no re-extracting/degrading completed cases) and
re-ignites once quota is back AND no fresh 429 remains.
• verifies crash-safe per-chunk staging is committing (nothing lost) • verifies crash-safe per-chunk staging is committing (nothing lost)
The night window is 23:00–05:00 IDT, with a bounded early-morning CATCH-UP band The night window is 23:00–05:00 IDT, with a bounded early-morning CATCH-UP band
@@ -68,6 +71,7 @@ VENV_PY = os.path.join(REPO, "mcp-server/.venv/bin/python")
STUCK_SILENCE_SEC = 1500 # 25 min with no new chunk-checkpoint while online → hung STUCK_SILENCE_SEC = 1500 # 25 min with no new chunk-checkpoint while online → hung
WEEKLY_GAP_HOURS = 6 # reset further than this → treat as weekly, not 5h WEEKLY_GAP_HOURS = 6 # reset further than this → treat as weekly, not 5h
DEFAULT_COOLDOWN_MIN = 30 # cooldown when a fresh 429 has no parseable reset time
IDT = timezone(timedelta(hours=3)) # Israel summer time (IDT, UTC+3) — display only IDT = timezone(timedelta(hours=3)) # Israel summer time (IDT, UTC+3) — display only
NIGHT_START, NIGHT_END = 23, 5 # the drain's normal window (IDT hours) NIGHT_START, NIGHT_END = 23, 5 # the drain's normal window (IDT hours)
CATCHUP_END = 7 # soft window end (IDT) for early-morning catch-up — see fix B CATCHUP_END = 7 # soft window end (IDT) for early-morning catch-up — see fix B
@@ -441,26 +445,38 @@ def tick():
# under the supervisor's own restart-churn, so an exhausted 5-hour window read # under the supervisor's own restart-churn, so an exhausted 5-hour window read
# as 'hung' and got hammered with restarts. The endpoint can't scroll away. # as 'hung' and got hammered with restarts. The endpoint can't scroll away.
fresh = (age is not None and age < 1800) fresh = (age is not None and age < 1800)
log_rl = bool(rl_recent and fresh) log_rl = bool(rl_recent and fresh) # a FRESH CLI 429 in the logs
auth = quota_exhausted() # (exhausted, reset_utc) | None if endpoint down auth = quota_exhausted() # (exhausted, reset_utc) | None if endpoint down
auth_says_ok = (auth is not None and not auth[0]) auth_exhausted = bool(auth is not None and auth[0])
auth_says_ok = bool(auth is not None and not auth[0])
# A fresh CLI 429 is GROUND TRUTH: the call is literally failing, so cooldown
# even when the usage endpoint reports the window available. Observed
# 2026-06-15: endpoint <100% while the CLI kept 429-ing ("session limit") —
# the old "endpoint is primary" logic then read it as 'hung' and restart-churned,
# re-extracting already-completed cases and DEGRADING them under the rate limit.
cd_dt = None cd_dt = None
if auth is not None and auth[0]: # authoritative: a window is exhausted if auth_exhausted:
cd_dt = auth[1] or reset_dt # prefer endpoint reset; fall back to parsed cd_dt = auth[1] or reset_dt # prefer endpoint reset; fall back to parsed
elif log_rl and not auth_says_ok: # endpoint down/silent → trust a fresh 429 if cd_dt is None and log_rl: # fresh 429 overrides an "available" endpoint
cd_dt = reset_dt cd_dt = reset_dt or (now + timedelta(minutes=DEFAULT_COOLDOWN_MIN))
if cd_dt is None and prev.get("cooldown_until"): # persist a stored future reset if cd_dt is None and prev.get("cooldown_until"): # persist a stored future reset
try: try:
cd_dt = datetime.fromisoformat(prev["cooldown_until"]) cd_dt = datetime.fromisoformat(prev["cooldown_until"])
except Exception: except Exception:
cd_dt = None cd_dt = None
in_cooldown = bool(cd_dt and now < cd_dt) in_cooldown = bool(cd_dt and now < cd_dt)
# Exit cooldown the instant quota is actually back — claude.ai usually frees up # Exit cooldown early ONLY when quota is actually back AND no fresh 429
# EARLIER than the reported reset. Authoritative all-clear is decisive; when the # contradicts it. The endpoint can lie "available" while the CLI still 429s, so
# endpoint is down, fall back to the tiny live CLI probe (old behavior). # a fresh 429 VETOES the early resume (otherwise we'd bounce straight back into
if in_cooldown and (auth_says_ok or (auth is None and quota_available())): # the churn). When there's no fresh 429: authoritative all-clear is decisive;
# if the endpoint is down, fall back to the tiny live CLI probe.
if (
in_cooldown
and not log_rl
and (auth_says_ok or (auth is None and quota_available()))
):
notes.append( notes.append(
f"בדיקת-מכסה: המכסה זמינה — מתחדש מיד " f"בדיקת-מכסה: המכסה זמינה (ואין 429 טרי) — מתחדש מיד "
f"(לפני האיפוס המדווח {cd_dt.astimezone(IDT):%H:%M IDT}).") f"(לפני האיפוס המדווח {cd_dt.astimezone(IDT):%H:%M IDT}).")
cd_dt, in_cooldown = None, False cd_dt, in_cooldown = None, False
cooldown_until = cd_dt.isoformat() if cd_dt else None cooldown_until = cd_dt.isoformat() if cd_dt else None