fix(deploy): respawn supervisor ב-start.sh כדי שהקונטיינר לא ייוותר חצי-חי

הבאג: start.sh הריץ את uvicorn ברקע, בדק פעם אחת אחרי 2 שניות, ואם הוא מת רק הדפיס שגיאה והמשיך. Next.js רץ ב-foreground אז הקונטיינר נשאר "חי" עם backend מת — Docker/Coolify לא מפעילים restart, ו-/api/health מחזיר 503. זה בדיוק מה שקרה אחרי עדכון הקרנל + reboot ב-2026-06-10: uvicorn לא הצליח להגיע ל-Postgres בשניות הראשונות וה-backend נשאר מת עד restart ידני. התיקון: start.sh הוא כעת סופרוייזר אמיתי — כל תהליך (uvicorn ו-node) רץ בלולאת respawn עם capped backoff (1→2→…→30s, מתאפס אחרי 30s up). race מול Postgres ב-boot נפתר מעצמו. trap על TERM/INT מבצע shutdown נקי (PID 1 sh מתעלם מ-SIGTERM בלי trap → redeploy מהיר יותר). אם סופרוייזר מת באופן בלתי-צפוי — exit 1 כדי ש-Docker יפעיל restart מלא. Invariants: X3 (integration/deploy), X16 (pipeline-durability — עמידות הרצה). לא נוגע ב-G1/G2 (אין מסלול מקביל), לא בולע שגיאות (כל restart מתועד ל-stdout). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-11 06:57:05 +00:00
parent 1eece500d3
commit c504a61d49
1 changed files with 47 additions and 14 deletions
--- a/start.sh
+++ b/start.sh
@@ -1,20 +1,53 @@
 #!/bin/sh
-# Start FastAPI backend + Next.js frontend in the same container.
+# Start FastAPI backend (:8000) + Next.js frontend (:3000) in one container,
-# Both processes log to stdout/stderr so Docker captures everything.
+# each under a respawn supervisor.
 #
 # Why a supervisor: a transient failure of either process — e.g. Postgres
 # (:5433) not yet reachable in the seconds after a host reboot — must self-heal
 # via capped-backoff restart, instead of leaving the container half-alive
 # (Next.js up, FastAPI dead → /api/health returns 503, which is exactly the
 # outage that followed the kernel update + reboot on 2026-06-10). Both processes
 # log to stdout/stderr so Docker captures everything.
-set -e
+set -u
-echo "[start.sh] Starting FastAPI backend on :8000 ..."
+# Run "$@" forever, restarting it when it exits. Backoff doubles on a fast crash
-uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 2>&1 &
+# (cap 30s) and resets once the process has stayed up for >=30s, so a flapping
-UVICORN_PID=$!
+# dependency cannot spin the CPU while a genuinely-recovered process restarts
-
+# promptly.
-# Give uvicorn a moment to start (or crash)
+supervise() {
-sleep 2
+  name="$1"
-
+  shift
-if ! kill -0 $UVICORN_PID 2>/dev/null; then
+  backoff=1
-  echo "[start.sh] ERROR: uvicorn failed to start!"
+  while true; do
-  # Don't exit — let Node.js run so the UI is accessible for debugging
+    echo "[start.sh] starting ${name} ..."
    start=$(date +%s)
    "$@"
    code=$?
    if [ $(( $(date +%s) - start )) -ge 30 ]; then
      backoff=1
    fi
    echo "[start.sh] ${name} exited (code=${code}); restarting in ${backoff}s ..."
    sleep "$backoff"
    backoff=$(( backoff * 2 ))
    [ "$backoff" -gt 30 ] && backoff=30
  done
 }
-echo "[start.sh] Starting Next.js frontend on :3000 ..."
+supervise "FastAPI (uvicorn :8000)" \
-node server.js
+  uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 &
 API_PID=$!
 supervise "Next.js (node :3000)" \
   node server.js &
 WEB_PID=$!
 # Clean shutdown on redeploy/stop (TERM) or Ctrl-C (INT): signal both
 # supervisor subshells, then exit 0 so the stop is not reported as a failure.
 # The supervised processes themselves are not signalled here; they go away
 # when the container's pid namespace is torn down after this script exits.
 trap 'echo "[start.sh] terminating ..."; kill "$API_PID" "$WEB_PID" 2>/dev/null; exit 0' TERM INT
 # Watchdog: stay here while BOTH supervisors are alive. A plain
 # `wait "$API_PID" "$WEB_PID"` would only return after *all* listed pids have
 # terminated, so a single dead supervisor would leave the container half-alive.
 # Polling with `kill -0` leaves the loop as soon as either one is gone.
 # The sleep runs in the background and is waited on so that a trapped TERM/INT
 # interrupts the wait immediately instead of being deferred until the sleep
 # ends; waiting also lets the shell reap an exited supervisor, after which
 # `kill -0` reports it as gone.
 while kill -0 "$API_PID" 2>/dev/null && kill -0 "$WEB_PID" 2>/dev/null; do
   sleep 1 &
   wait "$!"
 done
 # Supervisors loop forever, so reaching this point means one of them died
 # unexpectedly: exit non-zero and let Docker restart the whole container.
 echo "[start.sh] a supervisor exited unexpectedly; stopping container"
 exit 1