diff --git a/truss/contexts/image_builder/serving_image_builder.py b/truss/contexts/image_builder/serving_image_builder.py
index 8e7fad9db..064fd57d1 100644
--- a/truss/contexts/image_builder/serving_image_builder.py
+++ b/truss/contexts/image_builder/serving_image_builder.py
@@ -378,17 +378,20 @@ def generate_docker_server_nginx_config(build_dir, config):
     nginx_filepath.write_text(nginx_content)
 
 
-def generate_docker_server_supervisord_config(build_dir, config):
-    supervisord_template = read_template_from_fs(
-        DOCKER_SERVER_TEMPLATES_DIR, "supervisord.conf.jinja"
+def generate_docker_server_wrapper_script(build_dir, config):
+    wrapper_template = read_template_from_fs(
+        DOCKER_SERVER_TEMPLATES_DIR, "server_wrapper.sh"
     )
     assert config.docker_server.start_command is not None, (
         "docker_server.start_command is required to use custom server"
     )
     start_command = config.docker_server.start_command
-    supervisord_contents = supervisord_template.render(start_command=start_command)
-    supervisord_filepath = build_dir / "supervisord.conf"
-    supervisord_filepath.write_text(supervisord_contents)
+    server_port = config.docker_server.server_port
+    wrapper_contents = wrapper_template.render(
+        start_command=start_command, server_port=server_port
+    )
+    wrapper_filepath = build_dir / "server_wrapper.sh"
+    wrapper_filepath.write_text(wrapper_contents)
 
 
 class ServingImageBuilderContext(TrussContext):
@@ -586,7 +589,7 @@ def prepare_image_build_dir(
 
             generate_docker_server_nginx_config(build_dir, config)
 
-            generate_docker_server_supervisord_config(build_dir, config)
+            generate_docker_server_wrapper_script(build_dir, config)
 
         # Override config.yml
         with (build_dir / CONFIG_FILE).open("w") as config_file:
diff --git a/truss/templates/docker_server/server_wrapper.sh b/truss/templates/docker_server/server_wrapper.sh
new file mode 100644
index 000000000..d83629034
--- /dev/null
+++ b/truss/templates/docker_server/server_wrapper.sh
@@ -0,0 +1,344 @@
+#!/bin/bash
+set -euo pipefail
+
+# Enhanced shell script to match supervisord behavior for custom servers
+# Manages nginx and model server processes with auto-restart and proper output handling
+#
+# Required environment variables:
+#   START_COMMAND - shell command that launches the model server (run via eval)
+#   SERVER_PORT   - port probed at http://localhost:SERVER_PORT/ready for readiness
+#
+# Exit status: 0 after a signal-initiated shutdown; 1 when a process cannot be
+# started initially or enters the FATAL state.
+#
+# NOTE(review): serving_image_builder.py passes this file through a template
+# render() call (presumably Jinja - confirm), so keep template delimiters such
+# as doubled curly braces out of the script body.
+
+# Global variables for process management
+declare -A PROCESS_PIDS
+declare -A RESTART_COUNTS
+declare -A LAST_RESTART_TIME
+declare -A PROCESS_START_TIME
+SHUTDOWN_REQUESTED=false
+
+# Configuration matching supervisord defaults
+MAX_RESTART_ATTEMPTS=3
+RESTART_RESET_TIME=10  # Reset restart counter after 10 seconds of stable operation
+FATAL_STATE_GRACE_PERIOD=5  # Wait 5 seconds before declaring fatal state
+LINEAR_BACKOFF_INTERVAL=1  # supervisord uses linear backoff by default
+MONITOR_INTERVAL=5  # Seconds between liveness checks in the monitoring loop
+
+# Write a timestamped message to stderr
+log() {
+    echo "[$(date '+%Y-%m-%d %H:%M:%S')] $*" >&2
+}
+
+# Sleep for $1 seconds without delaying signal handling: bash defers a trap
+# until the running foreground command finishes, whereas the wait builtin
+# returns as soon as a trapped signal arrives.
+interruptible_sleep() {
+    sleep "$1" &
+    wait "$!" || true
+}
+
+# Function to start nginx; returns 0 if it is still alive one second after launch
+start_nginx() {
+    log "Starting nginx..."
+
+    # nginx runs in the foreground ("daemon off") as a background job of this
+    # script and inherits its stdout/stderr
+    nginx -g "daemon off;" &
+    local nginx_pid=$!
+
+    # Give nginx a moment to start
+    sleep 1
+
+    # Check if nginx started successfully
+    if kill -0 "$nginx_pid" 2>/dev/null; then
+        PROCESS_PIDS["nginx"]=$nginx_pid
+        PROCESS_START_TIME["nginx"]=$(date +%s)
+        log "Nginx started successfully (PID: $nginx_pid)"
+        return 0
+    else
+        log "ERROR: Failed to start nginx"
+        return 1
+    fi
+}
+
+# Function to start the model server; returns 1 if the process exits during the
+# readiness wait, 0 otherwise (including when the readiness probe times out)
+start_model_server() {
+    log "Starting model server with command: $START_COMMAND"
+
+    # The model server inherits this script's stdout/stderr
+    # Using eval to handle complex commands properly
+    eval "$START_COMMAND" &
+    local model_pid=$!
+
+    # Wait for model server to be ready (similar to supervisord's startsecs=30)
+    log "Waiting for model server to be ready..."
+    for i in {1..30}; do
+        if ! kill -0 "$model_pid" 2>/dev/null; then
+            log "ERROR: Model server failed to start"
+            return 1
+        fi
+
+        # Check if server is responding (basic health check)
+        if curl -s -f "http://localhost:${SERVER_PORT}/ready" >/dev/null 2>&1; then
+            log "Model server is ready"
+            break
+        fi
+
+        if [[ $i -eq 30 ]]; then
+            log "WARNING: Model server readiness check timed out, continuing anyway"
+        fi
+
+        sleep 1
+    done
+
+    PROCESS_PIDS["model_server"]=$model_pid
+    PROCESS_START_TIME["model_server"]=$(date +%s)
+    log "Model server started successfully (PID: $model_pid)"
+    return 0
+}
+
+# Function to check if restart counter should be reset (like supervisord's startsecs behavior)
+should_reset_restart_counter() {
+    local process_name=$1
+    local current_time=$(date +%s)
+    local start_time=${PROCESS_START_TIME[$process_name]:-0}
+
+    # Reset counter if process has been running stably for RESTART_RESET_TIME seconds
+    if [[ $((current_time - start_time)) -gt $RESTART_RESET_TIME ]]; then
+        return 0
+    fi
+    return 1
+}
+
+# Function to restart a process with backoff (matching supervisord behavior).
+# Retries until the process starts or MAX_RESTART_ATTEMPTS is exhausted, so a
+# single failed start does not count as exhausting every attempt.
+# Returns 0 once the process is running, 1 on exhaustion or an unknown name.
+restart_process() {
+    local process_name=$1
+    local restart_count backoff_time
+
+    # Check if we should reset the restart counter (process ran successfully for a while)
+    if should_reset_restart_counter "$process_name"; then
+        RESTART_COUNTS[$process_name]=0
+        log "Resetting restart counter for $process_name (process ran stably for $RESTART_RESET_TIME seconds)"
+    fi
+
+    while true; do
+        restart_count=${RESTART_COUNTS[$process_name]:-0}
+
+        if [[ $restart_count -ge $MAX_RESTART_ATTEMPTS ]]; then
+            log "ERROR: $process_name has reached max restart limit ($MAX_RESTART_ATTEMPTS)"
+            return 1
+        fi
+
+        # Use linear backoff like supervisord (not exponential)
+        backoff_time=$((restart_count * LINEAR_BACKOFF_INTERVAL))
+
+        log "Restarting $process_name (attempt $((restart_count + 1))/$MAX_RESTART_ATTEMPTS) after ${backoff_time}s backoff"
+        sleep "$backoff_time"
+
+        # Increment restart count and record restart time
+        RESTART_COUNTS[$process_name]=$((restart_count + 1))
+        LAST_RESTART_TIME[$process_name]=$(date +%s)
+
+        # Restart the process
+        case "$process_name" in
+            nginx)
+                if start_nginx; then
+                    return 0
+                fi
+                ;;
+            model_server)
+                if start_model_server; then
+                    return 0
+                fi
+                ;;
+            *)
+                log "ERROR: Unknown process name: $process_name"
+                return 1
+                ;;
+        esac
+
+        log "WARNING: Restart attempt $((restart_count + 1)) for $process_name failed"
+    done
+}
+
+# Function to check if a process is running
+check_process() {
+    local process_name=$1
+    local pid=${PROCESS_PIDS[$process_name]:-}
+
+    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
+        return 0  # Process is running
+    else
+        return 1  # Process is not running
+    fi
+}
+
+# Function to handle process failures (implements supervisord's PROCESS_STATE_FATAL behavior)
+# Returns 0 if the process was restarted or a shutdown is in progress, 1 on FATAL
+handle_process_failure() {
+    local process_name=$1
+
+    if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then
+        return 0  # Don't restart if shutdown was requested
+    fi
+
+    log "WARNING: $process_name process has stopped"
+
+    # Remove from process tracking (quoted so the subscript is never glob-expanded)
+    unset "PROCESS_PIDS[$process_name]"
+
+    # Attempt to restart the process
+    if restart_process "$process_name"; then
+        log "Successfully restarted $process_name"
+        return 0
+    else
+        log "ERROR: Failed to restart $process_name after max attempts"
+
+        # Implement supervisord's PROCESS_STATE_FATAL behavior - wait before declaring fatal
+        log "Waiting $FATAL_STATE_GRACE_PERIOD seconds before declaring fatal state..."
+        sleep $FATAL_STATE_GRACE_PERIOD
+
+        # Check if shutdown was requested during grace period
+        if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then
+            return 0
+        fi
+
+        log "ERROR: $process_name has entered FATAL state (exhausted restart attempts)"
+        return 1
+    fi
+}
+
+# Cleanup function: stops both processes, then exits the script.
+# Arguments:
+#   $1 - exit status to terminate with (default 0). The signal trap calls this
+#        without arguments, so a requested shutdown exits 0; failure paths pass 1.
+cleanup() {
+    local exit_code=${1:-0}
+
+    log "Received shutdown signal, stopping processes..."
+    SHUTDOWN_REQUESTED=true
+
+    # Stop model server if running
+    if check_process "model_server"; then
+        local model_pid=${PROCESS_PIDS["model_server"]}
+        log "Stopping model server (PID: $model_pid)"
+        kill -TERM "$model_pid" 2>/dev/null || true
+
+        # Wait for graceful shutdown (max 30 seconds)
+        for i in {1..30}; do
+            if ! kill -0 "$model_pid" 2>/dev/null; then
+                log "Model server stopped gracefully"
+                break
+            fi
+            if [[ $i -eq 30 ]]; then
+                log "Force killing model server"
+                kill -KILL "$model_pid" 2>/dev/null || true
+            fi
+            sleep 1
+        done
+    fi
+
+    # Stop nginx if running
+    if check_process "nginx"; then
+        local nginx_pid=${PROCESS_PIDS["nginx"]}
+        log "Stopping nginx (PID: $nginx_pid)"
+        kill -TERM "$nginx_pid" 2>/dev/null || true
+
+        # Wait for graceful shutdown (max 10 seconds)
+        for i in {1..10}; do
+            if ! kill -0 "$nginx_pid" 2>/dev/null; then
+                log "Nginx stopped gracefully"
+                break
+            fi
+            if [[ $i -eq 10 ]]; then
+                log "Force killing nginx"
+                kill -KILL "$nginx_pid" 2>/dev/null || true
+            fi
+            sleep 1
+        done
+    fi
+
+    log "Shutdown complete"
+    exit "$exit_code"
+}
+
+# Set up signal handlers
+trap cleanup SIGTERM SIGINT
+
+# Validate required environment variables
+if [[ -z "${START_COMMAND:-}" ]]; then
+    log "ERROR: START_COMMAND environment variable is required"
+    exit 1
+fi
+
+if [[ -z "${SERVER_PORT:-}" ]]; then
+    log "ERROR: SERVER_PORT environment variable is required"
+    exit 1
+fi
+
+log "Starting custom server wrapper"
+log "Model server command: $START_COMMAND"
+log "Server port: $SERVER_PORT"
+
+# Initialize restart counts using associative arrays (properly scoped)
+RESTART_COUNTS["nginx"]=0
+RESTART_COUNTS["model_server"]=0
+
+# Start both processes
+if ! start_nginx; then
+    log "ERROR: Failed to start nginx initially"
+    exit 1
+fi
+
+if ! start_model_server; then
+    log "ERROR: Failed to start model server initially"
+    cleanup 1
+fi
+
+log "Both services are running, monitoring processes..."
+
+# Main monitoring loop - matches supervisord's behavior
+FATAL_STATE_REACHED=false
+
+while [[ "$FATAL_STATE_REACHED" == "false" && "$SHUTDOWN_REQUESTED" == "false" ]]; do
+    # Check nginx status
+    if ! check_process "nginx"; then
+        log "WARNING: Nginx process has stopped"
+        if ! handle_process_failure "nginx"; then
+            log "ERROR: Nginx has entered FATAL state"
+            FATAL_STATE_REACHED=true
+        fi
+    fi
+
+    # Check model server status (only if we haven't reached fatal state)
+    if [[ "$FATAL_STATE_REACHED" == "false" ]] && ! check_process "model_server"; then
+        log "WARNING: Model server process has stopped"
+        if ! handle_process_failure "model_server"; then
+            log "ERROR: Model server has entered FATAL state"
+            FATAL_STATE_REACHED=true
+        fi
+    fi
+
+    # Sleep before checking again; interruptible so SIGTERM/SIGINT are handled promptly
+    interruptible_sleep "$MONITOR_INTERVAL"
+done
+
+# Handle fatal state (like supervisord's PROCESS_STATE_FATAL)
+if [[ "$FATAL_STATE_REACHED" == "true" ]]; then
+    log "ERROR: One or more processes have entered FATAL state - shutting down"
+    cleanup 1
+fi
+
+# Normal shutdown
+if [[ "$SHUTDOWN_REQUESTED" == "true" ]]; then
+    cleanup
+fi
diff --git a/truss/templates/docker_server_requirements.txt b/truss/templates/docker_server_requirements.txt
index 3131eb911..7e31bd68d 100644
--- a/truss/templates/docker_server_requirements.txt
+++ b/truss/templates/docker_server_requirements.txt
@@ -1 +1 @@
-supervisor==4.2.5
+# Shell script wrapper replaces supervisord for process management
diff --git a/truss/templates/server.Dockerfile.jinja b/truss/templates/server.Dockerfile.jinja
index cbe06cd51..dfeeaf4e5 100644
--- a/truss/templates/server.Dockerfile.jinja
+++ b/truss/templates/server.Dockerfile.jinja
@@ -119,17 +119,18 @@ RUN uv python install {{ control_python_version }}
 RUN uv venv /docker_server/.venv --python {{ control_python_version }}
 RUN uv pip install --python /docker_server/.venv/bin/python -r /app/docker_server_requirements.txt --no-cache-dir
 {% set proxy_config_path = "/etc/nginx/conf.d/proxy.conf" %}
-{% set supervisor_config_path = "/etc/supervisor/supervisord.conf" %}
-{% set supervisor_server_url = "http://localhost:8080" %}
+{% set server_wrapper_path = "/docker_server/server_wrapper.sh" %}
 COPY --chown={{ default_owner }} ./proxy.conf {{ proxy_config_path }}
-COPY --chown={{ default_owner }} ./supervisord.conf {{ supervisor_config_path }}
-ENV SUPERVISOR_SERVER_URL="{{ supervisor_server_url }}"
-ENV SERVER_START_CMD="/docker_server/.venv/bin/supervisord -c {{ supervisor_config_path }}"
+COPY --chown={{ default_owner }} ./server_wrapper.sh {{ server_wrapper_path }}
+RUN chmod +x {{ server_wrapper_path }}
+ENV START_COMMAND={{ config.docker_server.start_command | tojson }}
+ENV SERVER_PORT={{ config.docker_server.server_port | tojson }}
+ENV SERVER_START_CMD={{ server_wrapper_path | tojson }}
 {#- default configuration uses port 80, which requires root privileges, so we remove it #}
 RUN rm -f /etc/nginx/sites-enabled/default
 {#- nginx writes to /var/lib/nginx, /var/log/nginx, and /run directories #}
 {{ chown_and_switch_to_regular_user_if_enabled(["/var/lib/nginx", "/var/log/nginx", "/run"]) }}
-ENTRYPOINT ["/docker_server/.venv/bin/supervisord", "-c", "{{ supervisor_config_path }}"]
+ENTRYPOINT ["{{ server_wrapper_path }}"]
 
 {%- elif requires_live_reload %} {#- elif requires_live_reload #}
 ENV HASH_TRUSS="{{ truss_hash }}"