From b06c83040b9e5d0c7df0d0a6067333ee35633e0c Mon Sep 17 00:00:00 2001 From: Rob Mulla Date: Mon, 13 Oct 2025 14:04:47 -0400 Subject: [PATCH] feat: Add docker-compose and make based recipes --- .../vLLM/docker-compose/Llama3.1/.env.example | 37 +++++++++ .../vLLM/docker-compose/Llama3.1/Makefile | 59 +++++++++++++ .../vLLM/docker-compose/Llama3.1/README.md | 83 +++++++++++++++++++ .../Llama3.1/docker-compose.yml | 27 ++++++ .../docker-compose/Qwen2.5-32B/.env.example | 28 +++++++ .../vLLM/docker-compose/Qwen2.5-32B/Makefile | 66 +++++++++++++++ .../vLLM/docker-compose/Qwen2.5-32B/README.md | 58 +++++++++++++ .../Qwen2.5-32B/docker-compose.yml | 25 ++++++ .../docker-compose/Qwen2.5-VL/.env.example | 28 +++++++ .../vLLM/docker-compose/Qwen2.5-VL/Makefile | 70 ++++++++++++++++ .../vLLM/docker-compose/Qwen2.5-VL/README.md | 58 +++++++++++++ .../Qwen2.5-VL/docker-compose.yml | 31 +++++++ .../vLLM/docker-compose/Qwen3/.env.example | 35 ++++++++ .../vLLM/docker-compose/Qwen3/Makefile | 66 +++++++++++++++ .../vLLM/docker-compose/Qwen3/README.md | 58 +++++++++++++ .../docker-compose/Qwen3/docker-compose.yml | 24 ++++++ .../trillium/vLLM/docker-compose/README.md | 12 +++ 17 files changed, 765 insertions(+) create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/README.md diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/.env.example b/inference/trillium/vLLM/docker-compose/Llama3.1/.env.example new file mode 100644 index 0000000..7a82bb4 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/.env.example @@ -0,0 +1,37 @@ +# This file provides the default configuration for running Llama3.1 models. +# Copy this file to .env and edit it with your specific values. + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# Name of the user to tag the container (optional, defaults to current user) +# USER=your-name + +# --- Server Configuration --- +# Seed for reproducibility +SEED=42 +# GPU memory utilization +GPU_MEMORY_UTILIZATION=0.98 +# Add '--disable-log-requests' to disable request logging, or leave blank to enable. 
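+# For example, to keep request logging enabled, leave the value empty:
+# DISABLE_LOG_REQUESTS=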
+DISABLE_LOG_REQUESTS=--disable-log-requests + +# --- Llama3.1-70B Configuration (Default) --- +SHM_SIZE=150gb +MODEL_NAME=meta-llama/Llama-3.1-70B-Instruct +MAX_MODEL_LEN=4096 +TP=8 +MAX_NUM_BATCHED_TOKENS=2048 +MAX_NUM_SEQS=256 + +# --- Llama3.1-8B Configuration --- +# Uncomment the lines below to switch to the 8B model. +# +# SHM_SIZE=17gb +# MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct +# MAX_MODEL_LEN=4096 +# TP=1 +# MAX_NUM_BATCHED_TOKENS=1024 +# MAX_NUM_SEQS=128 diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/Makefile b/inference/trillium/vLLM/docker-compose/Llama3.1/Makefile new file mode 100644 index 0000000..e3dd497 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/Makefile @@ -0,0 +1,59 @@ +.PHONY: help up down logs shell clean setup benchmark + +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " setup Create .env file from the example" + @echo " up Start the vLLM server in the background" + @echo " down Stop and remove the vLLM server" + @echo " logs Follow the server logs" + @echo " shell Get a bash shell inside the running container" + @echo " benchmark Run the benchmark test against the server" + @echo " clean Remove the .env file" + +# Creates the .env file if it doesn't exist +setup: + @if [ ! -f .env ]; then \ + echo "Creating .env file..."; \ + cp .env.example .env; \ + sed -i'' -e "s/your-username/$(USER)/g" .env; \ + echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \ + else \ + echo ".env file already exists."; \ + fi + +# The -E flag preserves environment variables for the container +up: setup + sudo -E docker compose up -d + +down: + sudo docker compose down + +logs: + sudo docker compose logs -f + +# Get a bash shell inside the running container +shell: + sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash + +benchmark: + @echo "Running benchmark inside the container..." + @sudo docker exec -it \ + -e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \ + -e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \ + $(shell sudo docker compose ps -q vllm) \ + bash -c ' \ + cd /workspace/vllm && \ + pip install -q datasets && \ + vllm bench serve \ + --model "$${MODEL_NAME}" \ + --dataset-name random \ + --num-prompts 1000 \ + --random-input-len=1800 \ + --random-output-len=128 \ + --seed=$${SEED} \ + ' + +clean: + rm -f .env diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/README.md b/inference/trillium/vLLM/docker-compose/Llama3.1/README.md new file mode 100644 index 0000000..f3bea39 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/README.md @@ -0,0 +1,83 @@ +# Serve Llama3.1 with vLLM using Docker Compose and Make + +This guide provides a streamlined workflow for serving Llama3.1 models using `docker-compose` and a `Makefile` to simplify all commands. + +## Step 1: Create and Access your TPU VM + +Follow the instructions in the [original recipe](../../Llama3.1/README.md) to provision a TPU VM and SSH into it. + +## Step 2: Setup and Configuration + +On your TPU VM, clone this repository (if you haven't already) and navigate to this directory. + +```bash +cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Llama3.1/ +``` + +Now, run the `setup` command. This will create a `.env` file for your configuration. + +```bash +make setup +``` + +After running, edit the new `.env` file to add your `HF_TOKEN`. You can also switch between the 70B and 8B models by commenting and uncommenting the relevant lines. 
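+
+For example, to serve the 8B variant on a single chip instead, comment out the 70B block and uncomment the 8B block so that the active values look roughly like this sketch (the values come from the commented section of `.env.example`):
+
+```bash
+SHM_SIZE=17gb
+MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
+MAX_MODEL_LEN=4096
+TP=1
+MAX_NUM_BATCHED_TOKENS=1024
+MAX_NUM_SEQS=128
+```
+
+Open the file in your editor of choice: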
+ +```bash +nano .env +``` + +## Step 3: Run the Server + +The `Makefile` provides simple commands to manage the server lifecycle. + +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the benchmark with a single command: + ```bash + make benchmark + ``` + This command will enter the container, install the necessary dependencies, and run the benchmark for you. + +* **Open a shell for manual testing:** + If you want to run `curl` commands manually or debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` + +--- + +## Advanced Usage + +### Forcing a Rebuild + +If you need to pull a newer version of the Docker image, you can use the `--build` flag with the `up` command: + +```bash +sudo -E docker compose up -d --build +``` + +### Removing the Docker Image + +To manually remove the Docker image from your TPU VM, first ensure the services are down, then use the `docker rmi` command. + +```bash +# Stop and remove the containers +make down + +# Remove the image +sudo docker rmi $(grep "^DOCKER_URI=" .env | cut -d '=' -f2) +``` diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml new file mode 100644 index 0000000..8cf8a4d --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml @@ -0,0 +1,27 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + # The 'ports' directive is ignored when network_mode is "host", but kept for clarity. + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + # The command to start the vLLM server. + # All variables are sourced from the .env file. + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} + --max-num-seqs ${MAX_NUM_SEQS} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example new file mode 100644 index 0000000..7164f1a --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example @@ -0,0 +1,28 @@ +# This file provides the default configuration for running the Qwen2.5-32B model. +# Copy this file to .env and edit it with your specific values. + +# Your username, used by the Makefile for container operations. +# This will be set automatically by 'make setup' +USER=your-username + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# --- Server Configuration --- +SEED=42 +GPU_MEMORY_UTILIZATION=0.98 +DISABLE_LOG_REQUESTS=--disable-log-requests +# This model requires the V1 API flag +VLLM_USE_V1=1 + +# --- Qwen2.5-32B Configuration --- +# Note: The original recipe specified 10gb, but 100gb is a more realistic value for a 32B model. 
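+# SHM_SIZE should be large enough to hold the downloaded weights, since the
+# container caches Hugging Face downloads in shared memory (HF_HOME=/dev/shm
+# in docker-compose.yml).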
+SHM_SIZE=100gb
+MODEL_NAME=Qwen/Qwen2.5-32B
+MAX_MODEL_LEN=4096
+TP=4
+MAX_NUM_BATCHED_TOKENS=2048
+MAX_NUM_SEQS=128
diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile
new file mode 100644
index 0000000..08a5dd5
--- /dev/null
+++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile
@@ -0,0 +1,66 @@
+# Load environment variables from .env file, if it exists
+-include .env
+export
+
+# Default to the 32B model if not set in .env
+MODEL_NAME ?= Qwen/Qwen2.5-32B
+
+.PHONY: help up down logs shell clean setup benchmark
+
+help:
+	@echo "Usage: make [target]"
+	@echo ""
+	@echo "Targets:"
+	@echo "  setup      Create .env file from the example"
+	@echo "  up         Start the vLLM server in the background"
+	@echo "  down       Stop and remove the vLLM server"
+	@echo "  logs       Follow the server logs"
+	@echo "  shell      Get a bash shell inside the running container"
+	@echo "  benchmark  Run the benchmark test against the server"
+	@echo "  clean      Remove the .env file"
+
+# Creates the .env file if it doesn't exist
+setup:
+	@if [ ! -f .env ]; then \
+		echo "Creating .env file..."; \
+		cp .env.example .env; \
+		sed -i'' -e "s/your-username/$(USER)/g" .env; \
+		echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \
+	else \
+		echo ".env file already exists."; \
+	fi
+
+# The -E flag preserves environment variables for the container
+up: setup
+	sudo -E docker compose up -d
+
+down:
+	sudo docker compose down
+
+logs:
+	sudo docker compose logs -f
+
+# Get a bash shell inside the running container
+shell:
+	sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash
+
+benchmark:
+	@echo "Running benchmark inside the container..."
+	@sudo docker exec -it \
+		-e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \
+		-e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \
+		$(shell sudo docker compose ps -q vllm) \
+		bash -c ' \
+			cd /workspace/vllm && \
+			pip install -q datasets && \
+			vllm bench serve \
+				--model "$${MODEL_NAME}" \
+				--dataset-name random \
+				--num-prompts 1000 \
+				--random-input-len=1800 \
+				--random-output-len=128 \
+				--seed=$${SEED} \
+		'
+
+clean:
+	rm -f .env
diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md
new file mode 100644
index 0000000..445e716
--- /dev/null
+++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md
@@ -0,0 +1,58 @@
+# Serve Qwen2.5-32B with vLLM using Docker Compose and Make
+
+This guide provides a streamlined workflow for serving the Qwen2.5-32B model using `docker-compose` and a `Makefile` to simplify all commands.
+
+## Step 1: Create and Access your TPU VM
+
+Follow the instructions in the [original recipe](../../Qwen2.5-32B/README.md) to provision a TPU VM and SSH into it.
+
+## Step 2: Setup and Configuration
+
+On your TPU VM, clone this repository (if you haven't already) and navigate to this directory.
+
+```bash
+cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/
+```
+
+Now, run the `setup` command. This will create a `.env` file for your configuration.
+
+```bash
+make setup
+```
+
+After running, edit the new `.env` file to add your Hugging Face token.
+
+```bash
+nano .env
+```
+
+## Step 3: Run the Server
+
+The `Makefile` provides simple commands to manage the server lifecycle.
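+
+Each target is a thin wrapper around `docker compose`; for example, `make up` is roughly equivalent to running the following yourself:
+
+```bash
+sudo -E docker compose up -d
+```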
+ +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the benchmark with a single command: + ```bash + make benchmark + ``` + +* **Open a shell for manual testing:** + If you want to run `curl` commands manually or debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml new file mode 100644 index 0000000..ea89544 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml @@ -0,0 +1,25 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + - VLLM_USE_V1=${VLLM_USE_V1} + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} + --max-num-seqs ${MAX_NUM_SEQS} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example new file mode 100644 index 0000000..6ebd028 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example @@ -0,0 +1,28 @@ +# This file provides the default configuration for running the Qwen2.5-VL-7B model. +# Copy this file to .env and edit it with your specific values. + +# Your username, used by the Makefile for container operations. 
+# This will be set automatically by 'make setup' +USER=your-username + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# --- Server Configuration --- +SEED=42 +GPU_MEMORY_UTILIZATION=0.98 +DISABLE_LOG_REQUESTS=--disable-log-requests + +# --- Qwen2.5-VL-7B Configuration --- +SHM_SIZE=17gb +MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct +MAX_MODEL_LEN=16384 +TP=1 +# Multi-modal parameters +LIMIT_MM_PER_PROMPT='{"image": 10, "video": 0}' +MM_PROCESSOR_KWARGS='{"max_pixels": 1003520}' +GUIDED_DECODING_BACKEND=xgrammar +DISABLE_CHUNKED_MM_INPUT=--disable-chunked-mm-input diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile new file mode 100644 index 0000000..10a4456 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile @@ -0,0 +1,70 @@ +# Load environment variables from .env file, if it exists +-include .env +export + +# Default to the 7B VL model if not set in .env +MODEL_NAME ?= Qwen/Qwen2.5-VL-7B-Instruct + +.PHONY: help up down logs shell clean setup benchmark + +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " setup Create .env file from the example" + @echo " up Start the vLLM server in the background" + @echo " down Stop and remove the vLLM server" + @echo " logs Follow the server logs" + @echo " shell Get a bash shell inside the running container" + @echo " benchmark Run the multi-modal benchmark test against the server" + @echo " clean Remove the .env file" + +# Creates the .env file if it doesn't exist +setup: + @if [ ! -f .env ]; then \ + echo "Creating .env file..."; \ + cp .env.example .env; \ + sed -i'' -e "s/your-username/$(USER)/g" .env; \ + echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \ + else \ + echo ".env file already exists."; \ + fi + +# The -E flag preserves environment variables for the container +up: setup + sudo -E docker compose up -d + +down: + sudo docker compose down + +logs: + sudo docker compose logs -f + +# Get a bash shell inside the running container +shell: + sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash + +benchmark: + @echo "Running multi-modal benchmark inside the container..." + @sudo docker exec -it \ + -e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \ + -e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \ + $(shell sudo docker compose ps -q vllm) \ + bash -c ' \ + cd /workspace/vllm && \ + pip install -q datasets && \ + vllm bench serve \ + --model "$${MODEL_NAME}" \ + --dataset-name random-mm \ + --num-prompts 128 \ + --backend openai-chat \ + --endpoint "/v1/chat/completions" \ + --random-mm-bucket-config "{(736, 736, 1): 1.0}" \ + --random-mm-base-items-per-request 6 \ + --random-mm-num-mm-items-range-ratio 0.67 \ + --random-mm-limit-mm-per-prompt "{\"image\": 10, \"video\": 0}" \ + --seed=$${SEED} \ + ' + +clean: + rm -f .env \ No newline at end of file diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md new file mode 100644 index 0000000..ee66e72 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md @@ -0,0 +1,58 @@ +# Serve Qwen2.5-VL with vLLM using Docker Compose and Make + +This guide provides a streamlined workflow for serving the multi-modal Qwen2.5-VL-7B model using `docker-compose` and a `Makefile`. 
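+
+Compared to the text-only recipes, this one also passes the multi-modal flags from `.env` (`--limit-mm-per-prompt`, `--mm-processor-kwargs`, `--guided-decoding-backend`, `--disable-chunked-mm-input`). Once the server is running (Step 3), you can exercise the OpenAI-compatible chat endpoint with an image request along these lines (a sketch; the image URL is a placeholder):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+    "messages": [{
+      "role": "user",
+      "content": [
+        {"type": "text", "text": "Describe this image."},
+        {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}}
+      ]
+    }],
+    "max_tokens": 128
+  }'
+```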
+ +## Step 1: Create and Access your TPU VM + +Follow the instructions in the [original recipe](../../Qwen2.5-VL/README.md) to provision a TPU VM and SSH into it. + +## Step 2: Setup and Configuration + +On your TPU VM, clone this repository (if you haven't already) and navigate to this directory. + +```bash +cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/ +``` + +Run the `setup` command to create your `.env` file. + +```bash +make setup +``` + +After running, edit the new `.env` file to add your Hugging Face token. + +```bash +nano .env +``` + +## Step 3: Run the Server + +The `Makefile` provides simple commands to manage the server lifecycle. + +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the multi-modal benchmark with a single command: + ```bash + make benchmark + ``` + +* **Open a shell for manual testing:** + If you want to debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml new file mode 100644 index 0000000..9dc8526 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml @@ -0,0 +1,31 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + # Pass multi-modal vars to the container for the benchmark command + - LIMIT_MM_PER_PROMPT=${LIMIT_MM_PER_PROMPT} + - MM_PROCESSOR_KWARGS=${MM_PROCESSOR_KWARGS} + - GUIDED_DECODING_BACKEND=${GUIDED_DECODING_BACKEND} + - DISABLE_CHUNKED_MM_INPUT=${DISABLE_CHUNKED_MM_INPUT} + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} + --limit-mm-per-prompt ${LIMIT_MM_PER_PROMPT} + --mm-processor-kwargs ${MM_PROCESSOR_KWARGS} + --guided-decoding-backend ${GUIDED_DECODING_BACKEND} + ${DISABLE_CHUNKED_MM_INPUT} diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/.env.example b/inference/trillium/vLLM/docker-compose/Qwen3/.env.example new file mode 100644 index 0000000..8ef63b7 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/.env.example @@ -0,0 +1,35 @@ +# This file provides the default configuration for running Qwen3 models. +# Copy this file to .env and edit it with your specific values. + +# Your username, used by the Makefile for container operations. +# This will be set automatically by 'make setup' +USER=your-username + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# --- Server Configuration --- +SEED=42 +GPU_MEMORY_UTILIZATION=0.98 +DISABLE_LOG_REQUESTS=--disable-log-requests + +# --- Qwen3-32B Configuration (Default) --- +SHM_SIZE=100gb +MODEL_NAME=Qwen/Qwen3-32B +MAX_MODEL_LEN=4096 +TP=4 +MAX_NUM_BATCHED_TOKENS=2048 +MAX_NUM_SEQS=256 + +# --- Qwen3-4B Configuration --- +# Uncomment the lines below to switch to the 4B model. 
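+# (The 4B model is small enough to run with TP=1 on a single chip, which is
+# why the commented values below also use a much smaller SHM_SIZE.)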
+# +# SHM_SIZE=10gb +# MODEL_NAME=Qwen/Qwen3-4B +# MAX_MODEL_LEN=4096 +# TP=1 +# MAX_NUM_BATCHED_TOKENS=1024 +# MAX_NUM_SEQS=128 diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/Makefile b/inference/trillium/vLLM/docker-compose/Qwen3/Makefile new file mode 100644 index 0000000..a9953dc --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/Makefile @@ -0,0 +1,66 @@ +# Load environment variables from .env file, if it exists +-include .env +export + +# Default to the 4B model if not set in .env +MODEL_NAME ?= Qwen/Qwen3-4B + +.PHONY: help up down logs shell clean setup benchmark + +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " setup Create .env file from the example" + @echo " up Start the vLLM server in the background" + @echo " down Stop and remove the vLLM server" + @echo " logs Follow the server logs" + @echo " shell Get a bash shell inside the running container" + @echo " benchmark Run the benchmark test against the server" + @echo " clean Remove the .env file" + +# Creates the .env file if it doesn't exist +setup: + @if [ ! -f .env ]; then \ + echo "Creating .env file..."; \ + cp .env.example .env; \ + sed -i'' -e "s/your-username/$(USER)/g" .env; \ + echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \ + else \ + echo ".env file already exists."; \ + fi + +# The -E flag preserves environment variables for the container +up: setup + sudo -E docker compose up -d + +down: + sudo docker compose down + +logs: + sudo docker compose logs -f + +# Get a bash shell inside the running container +shell: + sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash + +benchmark: + @echo "Running benchmark inside the container..." + @sudo docker exec -it \ + -e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \ + -e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \ + $(shell sudo docker compose ps -q vllm) \ + bash -c ' \ + cd /workspace/vllm && \ + pip install -q datasets && \ + vllm bench serve \ + --model "$${MODEL_NAME}" \ + --dataset-name random \ + --num-prompts 1000 \ + --random-input-len=1800 \ + --random-output-len=128 \ + --seed=$${SEED} \ + ' + +clean: + rm -f .env diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/README.md b/inference/trillium/vLLM/docker-compose/Qwen3/README.md new file mode 100644 index 0000000..d53c8fc --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/README.md @@ -0,0 +1,58 @@ +# Serve Qwen3 with vLLM using Docker Compose and Make + +This guide provides a streamlined workflow for serving Qwen3 models using `docker-compose` and a `Makefile` to simplify all commands. + +## Step 1: Create and Access your TPU VM + +Follow the instructions in the [original recipe](../../Qwen3/README.md) to provision a TPU VM and SSH into it. + +## Step 2: Setup and Configuration + +On your TPU VM, clone this repository (if you haven't already) and navigate to this directory. + +```bash +cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen3/ +``` + +Now, run the `setup` command. This will create a `.env` file for your configuration. + +```bash +make setup +``` + +After running, edit the new `.env` file to add your Hugging Face token. You can also switch between the 32B and 4B models by commenting and uncommenting the relevant lines. + +```bash +nano .env +``` + +## Step 3: Run the Server + +The `Makefile` provides simple commands to manage the server lifecycle. 
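+
+Once the server is up (see `make logs` below for the readiness message), you can send a quick request from the VM to the OpenAI-compatible API. A minimal sketch, assuming the default `MODEL_NAME` from `.env.example`:
+
+```bash
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "Qwen/Qwen3-32B", "prompt": "The capital of France is", "max_tokens": 16}'
+```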
+ +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the benchmark with a single command: + ```bash + make benchmark + ``` + +* **Open a shell for manual testing:** + If you want to run `curl` commands manually or debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml new file mode 100644 index 0000000..115274b --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml @@ -0,0 +1,24 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} + --max-num-seqs ${MAX_NUM_SEQS} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} diff --git a/inference/trillium/vLLM/docker-compose/README.md b/inference/trillium/vLLM/docker-compose/README.md new file mode 100644 index 0000000..e8ec309 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/README.md @@ -0,0 +1,12 @@ +# vLLM Recipes with Docker Compose + +This directory provides an alternative workflow for running the vLLM recipes using `docker-compose`. + +Using `docker-compose` simplifies the process by abstracting away the long, complex `docker run` commands into declarative `docker-compose.yml` files. Configuration is managed via `.env` files, making it easier to switch between models and settings. + +## Available Recipes + +* [Llama3.1](./Llama3.1/README.md) +* [Qwen3](./Qwen3/README.md) +* [Qwen2.5-32B](./Qwen2.5-32B/README.md) +* [Qwen2.5-VL](./Qwen2.5-VL/README.md)
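+
+## Typical Workflow
+
+Each recipe follows the same pattern. For example, with Qwen3 (assuming the repository is cloned to `~/tpu-recipes`, as in the recipe READMEs):
+
+```bash
+cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen3
+make setup      # create .env from .env.example
+nano .env       # add your HF_TOKEN and adjust the model settings
+make up         # start the vLLM server in the background
+make logs       # wait for "Application startup complete."
+make benchmark  # optional: run the benchmark against the server
+make down       # stop and remove the server
+```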