From 2fc695d806281093b7eb75b3f56877111df28a5d Mon Sep 17 00:00:00 2001 From: Dhruv Singal Date: Thu, 2 Oct 2025 15:05:03 -0700 Subject: [PATCH 1/4] custom server no replace supervisord with better shell script --- .../image_builder/serving_image_builder.py | 17 ++- .../templates/docker_server/server_wrapper.sh | 136 ++++++++++++++++++ .../templates/docker_server_requirements.txt | 2 +- truss/templates/server.Dockerfile.jinja | 13 +- 4 files changed, 154 insertions(+), 14 deletions(-) create mode 100644 truss/templates/docker_server/server_wrapper.sh diff --git a/truss/contexts/image_builder/serving_image_builder.py b/truss/contexts/image_builder/serving_image_builder.py index 8e7fad9db..064fd57d1 100644 --- a/truss/contexts/image_builder/serving_image_builder.py +++ b/truss/contexts/image_builder/serving_image_builder.py @@ -378,17 +378,20 @@ def generate_docker_server_nginx_config(build_dir, config): nginx_filepath.write_text(nginx_content) -def generate_docker_server_supervisord_config(build_dir, config): - supervisord_template = read_template_from_fs( - DOCKER_SERVER_TEMPLATES_DIR, "supervisord.conf.jinja" +def generate_docker_server_wrapper_script(build_dir, config): + wrapper_template = read_template_from_fs( + DOCKER_SERVER_TEMPLATES_DIR, "server_wrapper.sh" ) assert config.docker_server.start_command is not None, ( "docker_server.start_command is required to use custom server" ) start_command = config.docker_server.start_command - supervisord_contents = supervisord_template.render(start_command=start_command) - supervisord_filepath = build_dir / "supervisord.conf" - supervisord_filepath.write_text(supervisord_contents) + server_port = config.docker_server.server_port + wrapper_contents = wrapper_template.render( + start_command=start_command, server_port=server_port + ) + wrapper_filepath = build_dir / "server_wrapper.sh" + wrapper_filepath.write_text(wrapper_contents) class ServingImageBuilderContext(TrussContext): @@ -586,7 +589,7 @@ def prepare_image_build_dir( generate_docker_server_nginx_config(build_dir, config) - generate_docker_server_supervisord_config(build_dir, config) + generate_docker_server_wrapper_script(build_dir, config) # Override config.yml with (build_dir / CONFIG_FILE).open("w") as config_file: diff --git a/truss/templates/docker_server/server_wrapper.sh b/truss/templates/docker_server/server_wrapper.sh new file mode 100644 index 000000000..3a8a0f1f0 --- /dev/null +++ b/truss/templates/docker_server/server_wrapper.sh @@ -0,0 +1,136 @@ +#!/bin/bash +set -euo pipefail + +# Enhanced shell script to replace supervisord for custom servers +# Manages nginx and model server processes with proper signal handling + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 +} + +cleanup() { + log "Received shutdown signal, stopping processes..." + + # Stop model server if running + if [[ -n "${MODEL_SERVER_PID:-}" ]] && kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then + log "Stopping model server (PID: $MODEL_SERVER_PID)" + kill -TERM "$MODEL_SERVER_PID" 2>/dev/null || true + + # Wait for graceful shutdown (max 30 seconds) + for i in {1..30}; do + if ! kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then + log "Model server stopped gracefully" + break + fi + if [[ $i -eq 30 ]]; then + log "Force killing model server" + kill -KILL "$MODEL_SERVER_PID" 2>/dev/null || true + fi + sleep 1 + done + fi + + # Stop nginx if running + if [[ -n "${NGINX_PID:-}" ]] && kill -0 "$NGINX_PID" 2>/dev/null; then + log "Stopping nginx (PID: $NGINX_PID)" + kill -TERM "$NGINX_PID" 2>/dev/null || true + + # Wait for graceful shutdown (max 10 seconds) + for i in {1..10}; do + if ! kill -0 "$NGINX_PID" 2>/dev/null; then + log "Nginx stopped gracefully" + break + fi + if [[ $i -eq 10 ]]; then + log "Force killing nginx" + kill -KILL "$NGINX_PID" 2>/dev/null || true + fi + sleep 1 + done + fi + + log "Shutdown complete" + exit 0 +} + +# Set up signal handlers +trap cleanup SIGTERM SIGINT + +# Validate required environment variables +if [[ -z "${START_COMMAND:-}" ]]; then + log "ERROR: START_COMMAND environment variable is required" + exit 1 +fi + +if [[ -z "${SERVER_PORT:-}" ]]; then + log "ERROR: SERVER_PORT environment variable is required" + exit 1 +fi + +log "Starting custom server wrapper" +log "Model server command: $START_COMMAND" +log "Server port: $SERVER_PORT" + +# Start nginx in background (will run in foreground later) +log "Starting nginx..." +nginx -g "daemon off;" & +NGINX_PID=$! + +# Wait a moment for nginx to start +sleep 2 + +# Verify nginx started successfully +if ! kill -0 "$NGINX_PID" 2>/dev/null; then + log "ERROR: Failed to start nginx" + exit 1 +fi + +log "Nginx started successfully (PID: $NGINX_PID)" + +# Start model server in background +log "Starting model server..." +eval "$START_COMMAND" & +MODEL_SERVER_PID=$! + +# Wait for model server to be ready (similar to supervisord's startsecs=30) +log "Waiting for model server to be ready..." +for i in {1..30}; do + if ! kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then + log "ERROR: Model server failed to start" + cleanup + exit 1 + fi + + # Check if server is responding (basic health check) + if curl -s -f "http://localhost:${SERVER_PORT}/ready" >/dev/null 2>&1; then + log "Model server is ready" + break + fi + + if [[ $i -eq 30 ]]; then + log "WARNING: Model server readiness check timed out, continuing anyway" + fi + + sleep 1 +done + +# Monitor processes +log "Both services are running, monitoring processes..." +while true; do + # Check if nginx is still running + if ! kill -0 "$NGINX_PID" 2>/dev/null; then + log "ERROR: Nginx process died" + cleanup + exit 1 + fi + + # Check if model server is still running + if ! kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then + log "ERROR: Model server process died" + cleanup + exit 1 + fi + + # Sleep for a short interval before checking again + sleep 5 +done diff --git a/truss/templates/docker_server_requirements.txt b/truss/templates/docker_server_requirements.txt index 3131eb911..7e31bd68d 100644 --- a/truss/templates/docker_server_requirements.txt +++ b/truss/templates/docker_server_requirements.txt @@ -1 +1 @@ -supervisor==4.2.5 +# Shell script wrapper replaces supervisord for process management diff --git a/truss/templates/server.Dockerfile.jinja b/truss/templates/server.Dockerfile.jinja index cbe06cd51..4d1fa7d18 100644 --- a/truss/templates/server.Dockerfile.jinja +++ b/truss/templates/server.Dockerfile.jinja @@ -119,17 +119,18 @@ RUN uv python install {{ control_python_version }} RUN uv venv /docker_server/.venv --python {{ control_python_version }} RUN uv pip install --python /docker_server/.venv/bin/python -r /app/docker_server_requirements.txt --no-cache-dir {% set proxy_config_path = "/etc/nginx/conf.d/proxy.conf" %} -{% set supervisor_config_path = "/etc/supervisor/supervisord.conf" %} -{% set supervisor_server_url = "http://localhost:8080" %} +{% set server_wrapper_path = "/docker_server/server_wrapper.sh" %} COPY --chown={{ default_owner }} ./proxy.conf {{ proxy_config_path }} -COPY --chown={{ default_owner }} ./supervisord.conf {{ supervisor_config_path }} -ENV SUPERVISOR_SERVER_URL="{{ supervisor_server_url }}" -ENV SERVER_START_CMD="/docker_server/.venv/bin/supervisord -c {{ supervisor_config_path }}" +COPY --chown={{ default_owner }} ./server_wrapper.sh {{ server_wrapper_path }} +RUN chmod +x {{ server_wrapper_path }} +ENV START_COMMAND="{{ config.docker_server.start_command }}" +ENV SERVER_PORT="{{ config.docker_server.server_port }}" +ENV SERVER_START_CMD="{{ server_wrapper_path }}" {#- default configuration uses port 80, which requires root privileges, so we remove it #} RUN rm -f /etc/nginx/sites-enabled/default {#- nginx writes to /var/lib/nginx, /var/log/nginx, and /run directories #} {{ chown_and_switch_to_regular_user_if_enabled(["/var/lib/nginx", "/var/log/nginx", "/run"]) }} -ENTRYPOINT ["/docker_server/.venv/bin/supervisord", "-c", "{{ supervisor_config_path }}"] +ENTRYPOINT ["{{ server_wrapper_path }}"] {%- elif requires_live_reload %} {#- elif requires_live_reload #} ENV HASH_TRUSS="{{ truss_hash }}" From 1562f474a1b724c4e915af6eacb697e95c65dc1e Mon Sep 17 00:00:00 2001 From: Dhruv Singal Date: Thu, 2 Oct 2025 15:20:08 -0700 Subject: [PATCH 2/4] update escaping characters --- truss/templates/server.Dockerfile.jinja | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/truss/templates/server.Dockerfile.jinja b/truss/templates/server.Dockerfile.jinja index 4d1fa7d18..dfeeaf4e5 100644 --- a/truss/templates/server.Dockerfile.jinja +++ b/truss/templates/server.Dockerfile.jinja @@ -123,9 +123,9 @@ RUN uv pip install --python /docker_server/.venv/bin/python -r /app/docker_serve COPY --chown={{ default_owner }} ./proxy.conf {{ proxy_config_path }} COPY --chown={{ default_owner }} ./server_wrapper.sh {{ server_wrapper_path }} RUN chmod +x {{ server_wrapper_path }} -ENV START_COMMAND="{{ config.docker_server.start_command }}" -ENV SERVER_PORT="{{ config.docker_server.server_port }}" -ENV SERVER_START_CMD="{{ server_wrapper_path }}" +ENV START_COMMAND={{ config.docker_server.start_command | tojson }} +ENV SERVER_PORT={{ config.docker_server.server_port | tojson }} +ENV SERVER_START_CMD={{ server_wrapper_path | tojson }} {#- default configuration uses port 80, which requires root privileges, so we remove it #} RUN rm -f /etc/nginx/sites-enabled/default {#- nginx writes to /var/lib/nginx, /var/log/nginx, and /run directories #} From f93b0b26d2b0dc247a0719ff56675ee03a414a91 Mon Sep 17 00:00:00 2001 From: Dhruv Singal Date: Thu, 2 Oct 2025 16:20:50 -0700 Subject: [PATCH 3/4] =?UTF-8?q?=E2=80=A2=20=E2=9C=85=20Auto-restarts=20cra?= =?UTF-8?q?shed=20processes=20=E2=80=A2=20=E2=9C=85=20Captures=20all=20pro?= =?UTF-8?q?cess=20output=20to=20container=20logs=20=E2=80=A2=20=E2=9C=85?= =?UTF-8?q?=20Implements=20proper=20restart=20limits=20with=20backoff=20?= =?UTF-8?q?=E2=80=A2=20=E2=9C=85=20Handles=20graceful=20shutdowns=20?= =?UTF-8?q?=E2=80=A2=20=E2=9C=85=20Provides=20comprehensive=20logging?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../templates/docker_server/server_wrapper.sh | 242 +++++++++++++----- 1 file changed, 180 insertions(+), 62 deletions(-) diff --git a/truss/templates/docker_server/server_wrapper.sh b/truss/templates/docker_server/server_wrapper.sh index 3a8a0f1f0..0d434ed21 100644 --- a/truss/templates/docker_server/server_wrapper.sh +++ b/truss/templates/docker_server/server_wrapper.sh @@ -1,49 +1,187 @@ #!/bin/bash set -euo pipefail -# Enhanced shell script to replace supervisord for custom servers -# Manages nginx and model server processes with proper signal handling +# Enhanced shell script to match supervisord behavior for custom servers +# Manages nginx and model server processes with auto-restart and proper output handling + +# Global variables for process management +declare -A PROCESS_PIDS +declare -A RESTART_COUNTS +declare -A LAST_RESTART_TIME +NGINX_RESTART_COUNT=0 +MODEL_SERVER_RESTART_COUNT=0 +SHUTDOWN_REQUESTED=false log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 } +# Function to start nginx with output redirection +start_nginx() { + log "Starting nginx..." + + # Start nginx with output redirected to stdout (like supervisord's stdout_logfile=/dev/fd/1) + nginx -g "daemon off;" & + local nginx_pid=$! + + # Give nginx a moment to start + sleep 1 + + # Check if nginx started successfully + if kill -0 "$nginx_pid" 2>/dev/null; then + PROCESS_PIDS["nginx"]=$nginx_pid + log "Nginx started successfully (PID: $nginx_pid)" + return 0 + else + log "ERROR: Failed to start nginx" + return 1 + fi +} + +# Function to start model server with output redirection +start_model_server() { + log "Starting model server with command: $START_COMMAND" + + # Start model server with output redirected to stdout (like supervisord) + # Using eval to handle complex commands properly + eval "$START_COMMAND" & + local model_pid=$! + + # Wait for model server to be ready (similar to supervisord's startsecs=30) + log "Waiting for model server to be ready..." + for i in {1..30}; do + if ! kill -0 "$model_pid" 2>/dev/null; then + log "ERROR: Model server failed to start" + return 1 + fi + + # Check if server is responding (basic health check) + if curl -s -f "http://localhost:${SERVER_PORT}/ready" >/dev/null 2>&1; then + log "Model server is ready" + break + fi + + if [[ $i -eq 30 ]]; then + log "WARNING: Model server readiness check timed out, continuing anyway" + fi + + sleep 1 + done + + PROCESS_PIDS["model_server"]=$model_pid + log "Model server started successfully (PID: $model_pid)" + return 0 +} + +# Function to restart a process with backoff +restart_process() { + local process_name=$1 + local max_restarts=3 + local backoff_time=1 + + # Get current restart count + local restart_count_var="${process_name^^}_RESTART_COUNT" + local restart_count=${!restart_count_var} + + if [[ $restart_count -ge $max_restarts ]]; then + log "ERROR: $process_name has reached max restart limit ($max_restarts)" + return 1 + fi + + # Calculate backoff time (exponential backoff) + backoff_time=$((2 ** restart_count)) + + log "Restarting $process_name (attempt $((restart_count + 1))/$max_restarts) after ${backoff_time}s backoff" + sleep $backoff_time + + # Increment restart count + eval "$restart_count_var=$((restart_count + 1))" + + # Restart the process + if [[ "$process_name" == "nginx" ]]; then + start_nginx + elif [[ "$process_name" == "model_server" ]]; then + start_model_server + fi + + return $? +} + +# Function to check if a process is running +check_process() { + local process_name=$1 + local pid=${PROCESS_PIDS[$process_name]:-} + + if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then + return 0 # Process is running + else + return 1 # Process is not running + fi +} + +# Function to handle process failures +handle_process_failure() { + local process_name=$1 + + if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then + return 0 # Don't restart if shutdown was requested + fi + + log "WARNING: $process_name process has stopped" + + # Remove from process tracking + unset PROCESS_PIDS[$process_name] + + # Attempt to restart the process + if restart_process "$process_name"; then + log "Successfully restarted $process_name" + return 0 + else + log "ERROR: Failed to restart $process_name after max attempts" + return 1 + fi +} + +# Cleanup function for graceful shutdown cleanup() { log "Received shutdown signal, stopping processes..." + SHUTDOWN_REQUESTED=true # Stop model server if running - if [[ -n "${MODEL_SERVER_PID:-}" ]] && kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then - log "Stopping model server (PID: $MODEL_SERVER_PID)" - kill -TERM "$MODEL_SERVER_PID" 2>/dev/null || true + if check_process "model_server"; then + local model_pid=${PROCESS_PIDS["model_server"]} + log "Stopping model server (PID: $model_pid)" + kill -TERM "$model_pid" 2>/dev/null || true # Wait for graceful shutdown (max 30 seconds) for i in {1..30}; do - if ! kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then + if ! kill -0 "$model_pid" 2>/dev/null; then log "Model server stopped gracefully" break fi if [[ $i -eq 30 ]]; then log "Force killing model server" - kill -KILL "$MODEL_SERVER_PID" 2>/dev/null || true + kill -KILL "$model_pid" 2>/dev/null || true fi sleep 1 done fi # Stop nginx if running - if [[ -n "${NGINX_PID:-}" ]] && kill -0 "$NGINX_PID" 2>/dev/null; then - log "Stopping nginx (PID: $NGINX_PID)" - kill -TERM "$NGINX_PID" 2>/dev/null || true + if check_process "nginx"; then + local nginx_pid=${PROCESS_PIDS["nginx"]} + log "Stopping nginx (PID: $nginx_pid)" + kill -TERM "$nginx_pid" 2>/dev/null || true # Wait for graceful shutdown (max 10 seconds) for i in {1..10}; do - if ! kill -0 "$NGINX_PID" 2>/dev/null; then + if ! kill -0 "$nginx_pid" 2>/dev/null; then log "Nginx stopped gracefully" break fi if [[ $i -eq 10 ]]; then log "Force killing nginx" - kill -KILL "$NGINX_PID" 2>/dev/null || true + kill -KILL "$nginx_pid" 2>/dev/null || true fi sleep 1 done @@ -71,66 +209,46 @@ log "Starting custom server wrapper" log "Model server command: $START_COMMAND" log "Server port: $SERVER_PORT" -# Start nginx in background (will run in foreground later) -log "Starting nginx..." -nginx -g "daemon off;" & -NGINX_PID=$! +# Initialize restart counts +NGINX_RESTART_COUNT=0 +MODEL_SERVER_RESTART_COUNT=0 -# Wait a moment for nginx to start -sleep 2 - -# Verify nginx started successfully -if ! kill -0 "$NGINX_PID" 2>/dev/null; then - log "ERROR: Failed to start nginx" +# Start both processes +if ! start_nginx; then + log "ERROR: Failed to start nginx initially" exit 1 fi -log "Nginx started successfully (PID: $NGINX_PID)" - -# Start model server in background -log "Starting model server..." -eval "$START_COMMAND" & -MODEL_SERVER_PID=$! - -# Wait for model server to be ready (similar to supervisord's startsecs=30) -log "Waiting for model server to be ready..." -for i in {1..30}; do - if ! kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then - log "ERROR: Model server failed to start" - cleanup - exit 1 - fi - - # Check if server is responding (basic health check) - if curl -s -f "http://localhost:${SERVER_PORT}/ready" >/dev/null 2>&1; then - log "Model server is ready" - break - fi - - if [[ $i -eq 30 ]]; then - log "WARNING: Model server readiness check timed out, continuing anyway" - fi - - sleep 1 -done +if ! start_model_server; then + log "ERROR: Failed to start model server initially" + cleanup + exit 1 +fi -# Monitor processes log "Both services are running, monitoring processes..." + +# Main monitoring loop - matches supervisord's behavior while true; do - # Check if nginx is still running - if ! kill -0 "$NGINX_PID" 2>/dev/null; then - log "ERROR: Nginx process died" - cleanup - exit 1 + # Check nginx status + if ! check_process "nginx"; then + log "WARNING: Nginx process has stopped" + if ! handle_process_failure "nginx"; then + log "ERROR: Failed to restart nginx, shutting down" + cleanup + exit 1 + fi fi - # Check if model server is still running - if ! kill -0 "$MODEL_SERVER_PID" 2>/dev/null; then - log "ERROR: Model server process died" - cleanup - exit 1 + # Check model server status + if ! check_process "model_server"; then + log "WARNING: Model server process has stopped" + if ! handle_process_failure "model_server"; then + log "ERROR: Failed to restart model server, shutting down" + cleanup + exit 1 + fi fi - # Sleep for a short interval before checking again + # Sleep for a short interval before checking again (like supervisord) sleep 5 done From 444b6681c739bb2adc55a16c92c1f644944e32a1 Mon Sep 17 00:00:00 2001 From: Dhruv Singal Date: Thu, 2 Oct 2025 16:24:44 -0700 Subject: [PATCH 4/4] fix restart behavior --- .../templates/docker_server/server_wrapper.sh | 102 +++++++++++++----- 1 file changed, 74 insertions(+), 28 deletions(-) diff --git a/truss/templates/docker_server/server_wrapper.sh b/truss/templates/docker_server/server_wrapper.sh index 0d434ed21..d83629034 100644 --- a/truss/templates/docker_server/server_wrapper.sh +++ b/truss/templates/docker_server/server_wrapper.sh @@ -8,10 +8,15 @@ set -euo pipefail declare -A PROCESS_PIDS declare -A RESTART_COUNTS declare -A LAST_RESTART_TIME -NGINX_RESTART_COUNT=0 -MODEL_SERVER_RESTART_COUNT=0 +declare -A PROCESS_START_TIME SHUTDOWN_REQUESTED=false +# Configuration matching supervisord defaults +MAX_RESTART_ATTEMPTS=3 +RESTART_RESET_TIME=10 # Reset restart counter after 10 seconds of stable operation +FATAL_STATE_GRACE_PERIOD=5 # Wait 5 seconds before declaring fatal state +LINEAR_BACKOFF_INTERVAL=1 # supervisord uses linear backoff by default + log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2 } @@ -30,6 +35,7 @@ start_nginx() { # Check if nginx started successfully if kill -0 "$nginx_pid" 2>/dev/null; then PROCESS_PIDS["nginx"]=$nginx_pid + PROCESS_START_TIME["nginx"]=$(date +%s) log "Nginx started successfully (PID: $nginx_pid)" return 0 else @@ -69,33 +75,50 @@ start_model_server() { done PROCESS_PIDS["model_server"]=$model_pid + PROCESS_START_TIME["model_server"]=$(date +%s) log "Model server started successfully (PID: $model_pid)" return 0 } -# Function to restart a process with backoff +# Function to check if restart counter should be reset (like supervisord's startsecs behavior) +should_reset_restart_counter() { + local process_name=$1 + local current_time=$(date +%s) + local start_time=${PROCESS_START_TIME[$process_name]:-0} + + # Reset counter if process has been running stably for RESTART_RESET_TIME seconds + if [[ $((current_time - start_time)) -gt $RESTART_RESET_TIME ]]; then + return 0 + fi + return 1 +} + +# Function to restart a process with backoff (matching supervisord behavior) restart_process() { local process_name=$1 - local max_restarts=3 - local backoff_time=1 - # Get current restart count - local restart_count_var="${process_name^^}_RESTART_COUNT" - local restart_count=${!restart_count_var} + # Check if we should reset the restart counter (process ran successfully for a while) + if should_reset_restart_counter "$process_name"; then + RESTART_COUNTS[$process_name]=0 + log "Resetting restart counter for $process_name (process ran stably for $RESTART_RESET_TIME seconds)" + fi + + local restart_count=${RESTART_COUNTS[$process_name]:-0} - if [[ $restart_count -ge $max_restarts ]]; then - log "ERROR: $process_name has reached max restart limit ($max_restarts)" + if [[ $restart_count -ge $MAX_RESTART_ATTEMPTS ]]; then + log "ERROR: $process_name has reached max restart limit ($MAX_RESTART_ATTEMPTS)" return 1 fi - # Calculate backoff time (exponential backoff) - backoff_time=$((2 ** restart_count)) + # Use linear backoff like supervisord (not exponential) + local backoff_time=$((restart_count * LINEAR_BACKOFF_INTERVAL)) - log "Restarting $process_name (attempt $((restart_count + 1))/$max_restarts) after ${backoff_time}s backoff" + log "Restarting $process_name (attempt $((restart_count + 1))/$MAX_RESTART_ATTEMPTS) after ${backoff_time}s backoff" sleep $backoff_time - # Increment restart count - eval "$restart_count_var=$((restart_count + 1))" + # Increment restart count and record restart time + RESTART_COUNTS[$process_name]=$((restart_count + 1)) + LAST_RESTART_TIME[$process_name]=$(date +%s) # Restart the process if [[ "$process_name" == "nginx" ]]; then @@ -119,7 +142,7 @@ check_process() { fi } -# Function to handle process failures +# Function to handle process failures (implements supervisord's PROCESS_STATE_FATAL behavior) handle_process_failure() { local process_name=$1 @@ -138,6 +161,17 @@ handle_process_failure() { return 0 else log "ERROR: Failed to restart $process_name after max attempts" + + # Implement supervisord's PROCESS_STATE_FATAL behavior - wait before declaring fatal + log "Waiting $FATAL_STATE_GRACE_PERIOD seconds before declaring fatal state..." + sleep $FATAL_STATE_GRACE_PERIOD + + # Check if shutdown was requested during grace period + if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then + return 0 + fi + + log "ERROR: $process_name has entered FATAL state (exhausted restart attempts)" return 1 fi } @@ -209,9 +243,9 @@ log "Starting custom server wrapper" log "Model server command: $START_COMMAND" log "Server port: $SERVER_PORT" -# Initialize restart counts -NGINX_RESTART_COUNT=0 -MODEL_SERVER_RESTART_COUNT=0 +# Initialize restart counts using associative arrays (properly scoped) +RESTART_COUNTS["nginx"]=0 +RESTART_COUNTS["model_server"]=0 # Start both processes if ! start_nginx; then @@ -228,27 +262,39 @@ fi log "Both services are running, monitoring processes..." # Main monitoring loop - matches supervisord's behavior -while true; do +FATAL_STATE_REACHED=false + +while [[ "$FATAL_STATE_REACHED" == "false" && "$SHUTDOWN_REQUESTED" == "false" ]]; do # Check nginx status if ! check_process "nginx"; then log "WARNING: Nginx process has stopped" if ! handle_process_failure "nginx"; then - log "ERROR: Failed to restart nginx, shutting down" - cleanup - exit 1 + log "ERROR: Nginx has entered FATAL state" + FATAL_STATE_REACHED=true fi fi - # Check model server status - if ! check_process "model_server"; then + # Check model server status (only if we haven't reached fatal state) + if [[ "$FATAL_STATE_REACHED" == "false" ]] && ! check_process "model_server"; then log "WARNING: Model server process has stopped" if ! handle_process_failure "model_server"; then - log "ERROR: Failed to restart model server, shutting down" - cleanup - exit 1 + log "ERROR: Model server has entered FATAL state" + FATAL_STATE_REACHED=true fi fi # Sleep for a short interval before checking again (like supervisord) sleep 5 done + +# Handle fatal state (like supervisord's PROCESS_STATE_FATAL) +if [[ "$FATAL_STATE_REACHED" == "true" ]]; then + log "ERROR: One or more processes have entered FATAL state - shutting down" + cleanup + exit 1 +fi + +# Normal shutdown +if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then + cleanup +fi