Merge pull request 'fix(deploy): respawn supervisor ב-start.sh — הקונטיינר לא ייוותר חצי-חי אחרי reboot' (#181) from harden-entrypoint into main
This commit was merged in pull request #181.
This commit is contained in:
63
start.sh
63
start.sh
@@ -1,20 +1,53 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# Start FastAPI backend + Next.js frontend in the same container.
|
# Start FastAPI backend (:8000) + Next.js frontend (:3000) in one container,
|
||||||
# Both processes log to stdout/stderr so Docker captures everything.
|
# each under a respawn supervisor.
|
||||||
|
#
|
||||||
|
# Why a supervisor: a transient failure of either process — e.g. Postgres
|
||||||
|
# (:5433) not yet reachable in the seconds after a host reboot — must self-heal
|
||||||
|
# via capped-backoff restart, instead of leaving the container half-alive
|
||||||
|
# (Next.js up, FastAPI dead → /api/health returns 503, which is exactly the
|
||||||
|
# outage that followed the kernel update + reboot on 2026-06-10). Both processes
|
||||||
|
# log to stdout/stderr so Docker captures everything.
|
||||||
|
|
||||||
set -e
|
set -u
|
||||||
|
|
||||||
echo "[start.sh] Starting FastAPI backend on :8000 ..."
|
# Run "$@" forever, restarting it when it exits. Backoff doubles on a fast crash
|
||||||
uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 2>&1 &
|
# (cap 30s) and resets once the process has stayed up for >=30s, so a flapping
|
||||||
UVICORN_PID=$!
|
# dependency cannot spin the CPU while a genuinely-recovered process restarts
|
||||||
|
# promptly.
|
||||||
# Give uvicorn a moment to start (or crash)
|
supervise() {
|
||||||
sleep 2
|
name="$1"
|
||||||
|
shift
|
||||||
if ! kill -0 $UVICORN_PID 2>/dev/null; then
|
backoff=1
|
||||||
echo "[start.sh] ERROR: uvicorn failed to start!"
|
while true; do
|
||||||
# Don't exit — let Node.js run so the UI is accessible for debugging
|
echo "[start.sh] starting ${name} ..."
|
||||||
|
start=$(date +%s)
|
||||||
|
"$@"
|
||||||
|
code=$?
|
||||||
|
if [ $(( $(date +%s) - start )) -ge 30 ]; then
|
||||||
|
backoff=1
|
||||||
fi
|
fi
|
||||||
|
echo "[start.sh] ${name} exited (code=${code}); restarting in ${backoff}s ..."
|
||||||
|
sleep "$backoff"
|
||||||
|
backoff=$(( backoff * 2 ))
|
||||||
|
[ "$backoff" -gt 30 ] && backoff=30
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
echo "[start.sh] Starting Next.js frontend on :3000 ..."
|
supervise "FastAPI (uvicorn :8000)" \
|
||||||
node server.js
|
uvicorn web.app:app --host 127.0.0.1 --port 8000 --workers 1 &
|
||||||
|
API_PID=$!
|
||||||
|
|
||||||
|
supervise "Next.js (node :3000)" \
|
||||||
|
node server.js &
|
||||||
|
WEB_PID=$!
|
||||||
|
|
||||||
|
# Clean shutdown on redeploy/stop: forward the signal to both supervisors and
|
||||||
|
# let Docker tear the container down (children are reaped with the pid namespace).
|
||||||
|
trap 'echo "[start.sh] terminating ..."; kill "$API_PID" "$WEB_PID" 2>/dev/null; exit 0' TERM INT
|
||||||
|
|
||||||
|
# Both supervisors loop forever. Note that `wait` with two PIDs returns only after
|
||||||
|
# BOTH have exited, not when the first one dies; at that point exit non-zero so Docker restarts the whole container.
|
||||||
|
wait "$API_PID" "$WEB_PID"
|
||||||
|
echo "[start.sh] a supervisor exited unexpectedly; stopping container"
|
||||||
|
exit 1
|
||||||
|
|||||||
Reference in New Issue
Block a user