diff --git a/start.sh b/start.sh
index 3bec63f..3993e2f 100755
--- a/start.sh
+++ b/start.sh
@@ -1,20 +1,90 @@
 #!/bin/sh
-# Start FastAPI backend + Next.js frontend in the same container.
-# Both processes log to stdout/stderr so Docker captures everything.
+# Start FastAPI backend (:8000) + Next.js frontend (:3000) in one container,
+# each under a respawn supervisor.
+#
+# Why a supervisor: a transient failure of either process — e.g. Postgres
+# (:5433) not yet reachable in the seconds after a host reboot — must self-heal
+# via capped-backoff restart, instead of leaving the container half-alive
+# (Next.js up, FastAPI dead → /api/health returns 503, which is exactly the
+# outage that followed the kernel update + reboot on 2026-06-10). Both processes
+# log to stdout/stderr so Docker captures everything.
 
-set -e
+set -u
 
-echo "[start.sh] Starting FastAPI backend on :8000 ..."
-uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 2>&1 &
-UVICORN_PID=$!
+# Run "$@" forever, restarting it when it exits. Backoff doubles on a fast crash
+# (cap 30s) and resets once the process has stayed up for >=30s, so a flapping
+# dependency cannot spin the CPU while a genuinely-recovered process restarts
+# promptly.
+#
+# Must be launched with `&`: the TERM trap below lives in that background
+# subshell, forwards the stop request to the supervised process so it gets a
+# chance to shut down gracefully, and ends the respawn loop.
+#
+# Arguments: $1 - label for log lines; $2.. - command (and arguments) to run.
+supervise() {
+  name="$1"
+  shift
+  backoff=1
+  child=""
+  trap '
+    if [ -n "$child" ]; then
+      kill "$child" 2>/dev/null
+      wait "$child" 2>/dev/null
+    fi
+    exit 0
+  ' TERM
+  while true; do
+    echo "[start.sh] starting ${name} ..."
+    start=$(date +%s)
+    # Background + wait instead of a foreground run: the shell defers traps
+    # until a foreground command completes, so a foreground child would delay
+    # TERM handling until the child exited on its own.
+    "$@" &
+    child=$!
+    wait "$child"
+    code=$?
+    if [ $(( $(date +%s) - start )) -ge 30 ]; then
+      backoff=1
+    fi
+    echo "[start.sh] ${name} exited (code=${code}); restarting in ${backoff}s ..."
+    # Same trick for the pause, so a stop request is not held up by the backoff.
+    sleep "$backoff" &
+    child=$!
+    wait "$child"
+    backoff=$(( backoff * 2 ))
+    if [ "$backoff" -gt 30 ]; then
+      backoff=30
+    fi
+  done
+}
 
-# Give uvicorn a moment to start (or crash)
-sleep 2
+supervise "FastAPI (uvicorn :8000)" \
+  uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 &
+API_PID=$!
 
-if ! kill -0 $UVICORN_PID 2>/dev/null; then
-  echo "[start.sh] ERROR: uvicorn failed to start!"
-  # Don't exit — let Node.js run so the UI is accessible for debugging
-fi
+supervise "Next.js (node :3000)" \
+  node server.js &
+WEB_PID=$!
 
-echo "[start.sh] Starting Next.js frontend on :3000 ..."
-node server.js
+# Ask both supervisors to stop and wait for them; each forwards TERM to its
+# process so uvicorn and node get a chance to exit gracefully. Errors are
+# silenced because a supervisor may already be gone.
+stop_supervisors() {
+  kill "$API_PID" "$WEB_PID" 2>/dev/null
+  wait "$API_PID" "$WEB_PID" 2>/dev/null
+}
+
+# Clean shutdown on redeploy/stop.
+trap 'echo "[start.sh] terminating ..."; stop_supervisors; exit 0' TERM INT
+
+# Both supervisors loop forever, so either one disappearing is unexpected.
+# `wait PID1 PID2` returns only after BOTH have exited and therefore cannot
+# notice a single dead supervisor; poll instead. The pause is backgrounded and
+# waited on so the trap above still fires immediately.
+while kill -0 "$API_PID" 2>/dev/null && kill -0 "$WEB_PID" 2>/dev/null; do
+  sleep 5 &
+  wait "$!"
+done
+echo "[start.sh] a supervisor exited unexpectedly; stopping container"
+stop_supervisors
+exit 1