From c504a61d49602925c1f9c650b32b0249ba2ac988 Mon Sep 17 00:00:00 2001 From: Chaim Date: Thu, 11 Jun 2026 06:57:05 +0000 Subject: [PATCH] =?UTF-8?q?fix(deploy):=20respawn=20supervisor=20=D7=91-st?= =?UTF-8?q?art.sh=20=D7=9B=D7=93=D7=99=20=D7=A9=D7=94=D7=A7=D7=95=D7=A0?= =?UTF-8?q?=D7=98=D7=99=D7=99=D7=A0=D7=A8=20=D7=9C=D7=90=20=D7=99=D7=99?= =?UTF-8?q?=D7=95=D7=95=D7=AA=D7=A8=20=D7=97=D7=A6=D7=99-=D7=97=D7=99?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit הבאג: start.sh הריץ את uvicorn ברקע, בדק פעם אחת אחרי 2 שניות, ואם הוא מת רק הדפיס שגיאה והמשיך. Next.js רץ ב-foreground אז הקונטיינר נשאר "חי" עם backend מת — Docker/Coolify לא מפעילים restart, ו-/api/health מחזיר 503. זה בדיוק מה שקרה אחרי עדכון הקרנל + reboot ב-2026-06-10: uvicorn לא הצליח להגיע ל-Postgres בשניות הראשונות וה-backend נשאר מת עד restart ידני. התיקון: start.sh הוא כעת סופרוייזר אמיתי — כל תהליך (uvicorn ו-node) רץ בלולאת respawn עם capped backoff (1→2→…→30s, מתאפס אחרי 30s up). race מול Postgres ב-boot נפתר מעצמו. trap על TERM/INT מבצע shutdown נקי (PID 1 sh מתעלם מ-SIGTERM בלי trap → redeploy מהיר יותר). אם סופרוייזר מת באופן בלתי-צפוי — exit 1 כדי ש-Docker יפעיל restart מלא. Invariants: X3 (integration/deploy), X16 (pipeline-durability — עמידות הרצה). לא נוגע ב-G1/G2 (אין מסלול מקביל), לא בולע שגיאות (כל restart מתועד ל-stdout). Co-Authored-By: Claude Opus 4.8 (1M context) --- start.sh | 61 +++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/start.sh b/start.sh index 3bec63f..3993e2f 100755 --- a/start.sh +++ b/start.sh @@ -1,20 +1,53 @@ #!/bin/sh -# Start FastAPI backend + Next.js frontend in the same container. -# Both processes log to stdout/stderr so Docker captures everything. +# Start FastAPI backend (:8000) + Next.js frontend (:3000) in one container, +# each under a respawn supervisor. +# +# Why a supervisor: a transient failure of either process — e.g. 
Postgres +# (:5433) not yet reachable in the seconds after a host reboot — must self-heal +# via capped-backoff restart, instead of leaving the container half-alive +# (Next.js up, FastAPI dead → /api/health returns 503, which is exactly the +# outage that followed the kernel update + reboot on 2026-06-10). Both processes +# log to stdout/stderr so Docker captures everything. -set -e +set -u -echo "[start.sh] Starting FastAPI backend on :8000 ..." -uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 2>&1 & -UVICORN_PID=$! +# Run "$@" forever, restarting it when it exits. Backoff doubles on a fast crash +# (cap 30s) and resets once the process has stayed up for >=30s, so a flapping +# dependency cannot spin the CPU while a genuinely-recovered process restarts +# promptly. +supervise() { + name="$1" + shift + backoff=1 + while true; do + echo "[start.sh] starting ${name} ..." + start=$(date +%s) + "$@" + code=$? + if [ $(( $(date +%s) - start )) -ge 30 ]; then + backoff=1 + fi + echo "[start.sh] ${name} exited (code=${code}); restarting in ${backoff}s ..." + sleep "$backoff" + backoff=$(( backoff * 2 )) + [ "$backoff" -gt 30 ] && backoff=30 + done +} -# Give uvicorn a moment to start (or crash) -sleep 2 +supervise "FastAPI (uvicorn :8000)" \ + uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 & +API_PID=$! -if ! kill -0 $UVICORN_PID 2>/dev/null; then - echo "[start.sh] ERROR: uvicorn failed to start!" - # Don't exit — let Node.js run so the UI is accessible for debugging -fi +supervise "Next.js (node :3000)" \ + node server.js & +WEB_PID=$! -echo "[start.sh] Starting Next.js frontend on :3000 ..." -node server.js +# Clean shutdown on redeploy/stop: forward the signal to both supervisors and +# let Docker tear the container down (children are reaped with the pid namespace). 
+trap 'echo "[start.sh] terminating ..."; kill "$API_PID" "$WEB_PID" 2>/dev/null; exit 0' TERM INT
+
+# Either supervisor vanishing is unexpected. `wait A B` would block until BOTH
+# exit, so poll with kill -0; `sleep & wait` lets the TERM/INT trap fire at once.
+while kill -0 "$API_PID" 2>/dev/null && kill -0 "$WEB_PID" 2>/dev/null; do sleep 1 & wait "$!"; done
+echo "[start.sh] a supervisor exited unexpectedly; stopping container"
+exit 1