From b06c83040b9e5d0c7df0d0a6067333ee35633e0c Mon Sep 17 00:00:00 2001 From: Rob Mulla Date: Mon, 13 Oct 2025 14:04:47 -0400 Subject: [PATCH] feat: Add docker-compose and make based recipes --- .../vLLM/docker-compose/Llama3.1/.env.example | 37 +++++++++ .../vLLM/docker-compose/Llama3.1/Makefile | 59 +++++++++++++ .../vLLM/docker-compose/Llama3.1/README.md | 83 +++++++++++++++++++ .../Llama3.1/docker-compose.yml | 27 ++++++ .../docker-compose/Qwen2.5-32B/.env.example | 28 +++++++ .../vLLM/docker-compose/Qwen2.5-32B/Makefile | 66 +++++++++++++++ .../vLLM/docker-compose/Qwen2.5-32B/README.md | 58 +++++++++++++ .../Qwen2.5-32B/docker-compose.yml | 25 ++++++ .../docker-compose/Qwen2.5-VL/.env.example | 28 +++++++ .../vLLM/docker-compose/Qwen2.5-VL/Makefile | 70 ++++++++++++++++ .../vLLM/docker-compose/Qwen2.5-VL/README.md | 58 +++++++++++++ .../Qwen2.5-VL/docker-compose.yml | 31 +++++++ .../vLLM/docker-compose/Qwen3/.env.example | 35 ++++++++ .../vLLM/docker-compose/Qwen3/Makefile | 66 +++++++++++++++ .../vLLM/docker-compose/Qwen3/README.md | 58 +++++++++++++ .../docker-compose/Qwen3/docker-compose.yml | 24 ++++++ .../trillium/vLLM/docker-compose/README.md | 12 +++ 17 files changed, 765 insertions(+) create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/.env.example create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/Makefile create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/README.md create mode 100644 inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml create mode 100644 inference/trillium/vLLM/docker-compose/README.md diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/.env.example b/inference/trillium/vLLM/docker-compose/Llama3.1/.env.example new file mode 100644 index 0000000..7a82bb4 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/.env.example @@ -0,0 +1,37 @@ +# This file provides the default configuration for running Llama3.1 models. +# Copy this file to .env and edit it with your specific values. + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# Name of the user to tag the container (optional, defaults to current user) +# USER=your-name + +# --- Server Configuration --- +# Seed for reproducibility +SEED=42 +# GPU memory utilization +GPU_MEMORY_UTILIZATION=0.98 +# Add '--disable-log-requests' to disable request logging, or leave blank to enable. 
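+# For example, to keep request logging enabled, leave the value empty:
+# DISABLE_LOG_REQUESTS=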
+DISABLE_LOG_REQUESTS=--disable-log-requests + +# --- Llama3.1-70B Configuration (Default) --- +SHM_SIZE=150gb +MODEL_NAME=meta-llama/Llama-3.1-70B-Instruct +MAX_MODEL_LEN=4096 +TP=8 +MAX_NUM_BATCHED_TOKENS=2048 +MAX_NUM_SEQS=256 + +# --- Llama3.1-8B Configuration --- +# Uncomment the lines below to switch to the 8B model. +# +# SHM_SIZE=17gb +# MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct +# MAX_MODEL_LEN=4096 +# TP=1 +# MAX_NUM_BATCHED_TOKENS=1024 +# MAX_NUM_SEQS=128 diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/Makefile b/inference/trillium/vLLM/docker-compose/Llama3.1/Makefile new file mode 100644 index 0000000..e3dd497 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/Makefile @@ -0,0 +1,59 @@ +.PHONY: help up down logs shell clean setup benchmark + +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " setup Create .env file from the example" + @echo " up Start the vLLM server in the background" + @echo " down Stop and remove the vLLM server" + @echo " logs Follow the server logs" + @echo " shell Get a bash shell inside the running container" + @echo " benchmark Run the benchmark test against the server" + @echo " clean Remove the .env file" + +# Creates the .env file if it doesn't exist +setup: + @if [ ! -f .env ]; then \ + echo "Creating .env file..."; \ + cp .env.example .env; \ + sed -i'' -e "s/your-username/$(USER)/g" .env; \ + echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \ + else \ + echo ".env file already exists."; \ + fi + +# The -E flag preserves environment variables for the container +up: setup + sudo -E docker compose up -d + +down: + sudo docker compose down + +logs: + sudo docker compose logs -f + +# Get a bash shell inside the running container +shell: + sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash + +benchmark: + @echo "Running benchmark inside the container..." + @sudo docker exec -it \ + -e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \ + -e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \ + $(shell sudo docker compose ps -q vllm) \ + bash -c ' \ + cd /workspace/vllm && \ + pip install -q datasets && \ + vllm bench serve \ + --model "$${MODEL_NAME}" \ + --dataset-name random \ + --num-prompts 1000 \ + --random-input-len=1800 \ + --random-output-len=128 \ + --seed=$${SEED} \ + ' + +clean: + rm -f .env diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/README.md b/inference/trillium/vLLM/docker-compose/Llama3.1/README.md new file mode 100644 index 0000000..f3bea39 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/README.md @@ -0,0 +1,83 @@ +# Serve Llama3.1 with vLLM using Docker Compose and Make + +This guide provides a streamlined workflow for serving Llama3.1 models using `docker-compose` and a `Makefile` to simplify all commands. + +## Step 1: Create and Access your TPU VM + +Follow the instructions in the [original recipe](../../Llama3.1/README.md) to provision a TPU VM and SSH into it. + +## Step 2: Setup and Configuration + +On your TPU VM, clone this repository (if you haven't already) and navigate to this directory. + +```bash +cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Llama3.1/ +``` + +Now, run the `setup` command. This will create a `.env` file for your configuration. + +```bash +make setup +``` + +After running, edit the new `.env` file to add your `HF_TOKEN`. You can also switch between the 70B and 8B models by commenting and uncommenting the relevant lines. 
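+
+For example, to serve the 8B variant on a single chip instead, comment out the 70B block and uncomment the 8B block so that the active values look roughly like this sketch (the values come from the commented section of `.env.example`):
+
+```bash
+SHM_SIZE=17gb
+MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
+MAX_MODEL_LEN=4096
+TP=1
+MAX_NUM_BATCHED_TOKENS=1024
+MAX_NUM_SEQS=128
+```
+
+Open the file in your editor of choice: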
+ +```bash +nano .env +``` + +## Step 3: Run the Server + +The `Makefile` provides simple commands to manage the server lifecycle. + +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the benchmark with a single command: + ```bash + make benchmark + ``` + This command will enter the container, install the necessary dependencies, and run the benchmark for you. + +* **Open a shell for manual testing:** + If you want to run `curl` commands manually or debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` + +--- + +## Advanced Usage + +### Forcing a Rebuild + +If you need to pull a newer version of the Docker image, you can use the `--build` flag with the `up` command: + +```bash +sudo -E docker compose up -d --build +``` + +### Removing the Docker Image + +To manually remove the Docker image from your TPU VM, first ensure the services are down, then use the `docker rmi` command. + +```bash +# Stop and remove the containers +make down + +# Remove the image +sudo docker rmi $(grep "^DOCKER_URI=" .env | cut -d '=' -f2) +``` diff --git a/inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml new file mode 100644 index 0000000..8cf8a4d --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml @@ -0,0 +1,27 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + # The 'ports' directive is ignored when network_mode is "host", but kept for clarity. + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + # The command to start the vLLM server. + # All variables are sourced from the .env file. + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} + --max-num-seqs ${MAX_NUM_SEQS} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example new file mode 100644 index 0000000..7164f1a --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example @@ -0,0 +1,28 @@ +# This file provides the default configuration for running the Qwen2.5-32B model. +# Copy this file to .env and edit it with your specific values. + +# Your username, used by the Makefile for container operations. +# This will be set automatically by 'make setup' +USER=your-username + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# --- Server Configuration --- +SEED=42 +GPU_MEMORY_UTILIZATION=0.98 +DISABLE_LOG_REQUESTS=--disable-log-requests +# This model requires the V1 API flag +VLLM_USE_V1=1 + +# --- Qwen2.5-32B Configuration --- +# Note: The original recipe specified 10gb, but 100gb is a more realistic value for a 32B model. 
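+# SHM_SIZE should be large enough to hold the downloaded weights, since the
+# container caches Hugging Face downloads in shared memory (HF_HOME=/dev/shm
+# in docker-compose.yml).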
+SHM_SIZE=100gb
+MODEL_NAME=Qwen/Qwen2.5-32B
+MAX_MODEL_LEN=4096
+TP=4
+MAX_NUM_BATCHED_TOKENS=2048
+MAX_NUM_SEQS=128
diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile
new file mode 100644
index 0000000..08a5dd5
--- /dev/null
+++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile
@@ -0,0 +1,66 @@
+# Load environment variables from .env file, if it exists
+-include .env
+export
+
+# Default to the 32B model if not set in .env
+MODEL_NAME ?= Qwen/Qwen2.5-32B
+
+.PHONY: help up down logs shell clean setup benchmark
+
+help:
+	@echo "Usage: make [target]"
+	@echo ""
+	@echo "Targets:"
+	@echo "  setup      Create .env file from the example"
+	@echo "  up         Start the vLLM server in the background"
+	@echo "  down       Stop and remove the vLLM server"
+	@echo "  logs       Follow the server logs"
+	@echo "  shell      Get a bash shell inside the running container"
+	@echo "  benchmark  Run the benchmark test against the server"
+	@echo "  clean      Remove the .env file"
+
+# Creates the .env file if it doesn't exist
+setup:
+	@if [ ! -f .env ]; then \
+		echo "Creating .env file..."; \
+		cp .env.example .env; \
+		sed -i'' -e "s/your-username/$(USER)/g" .env; \
+		echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \
+	else \
+		echo ".env file already exists."; \
+	fi
+
+# The -E flag preserves environment variables for the container
+up: setup
+	sudo -E docker compose up -d
+
+down:
+	sudo docker compose down
+
+logs:
+	sudo docker compose logs -f
+
+# Get a bash shell inside the running container
+shell:
+	sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash
+
+benchmark:
+	@echo "Running benchmark inside the container..."
+	@sudo docker exec -it \
+		-e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \
+		-e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \
+		$(shell sudo docker compose ps -q vllm) \
+		bash -c ' \
+			cd /workspace/vllm && \
+			pip install -q datasets && \
+			vllm bench serve \
+				--model "$${MODEL_NAME}" \
+				--dataset-name random \
+				--num-prompts 1000 \
+				--random-input-len=1800 \
+				--random-output-len=128 \
+				--seed=$${SEED} \
+		'
+
+clean:
+	rm -f .env
diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md
new file mode 100644
index 0000000..445e716
--- /dev/null
+++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md
@@ -0,0 +1,58 @@
+# Serve Qwen2.5-32B with vLLM using Docker Compose and Make
+
+This guide provides a streamlined workflow for serving the Qwen2.5-32B model using `docker-compose` and a `Makefile` to simplify all commands.
+
+## Step 1: Create and Access your TPU VM
+
+Follow the instructions in the [original recipe](../../Qwen2.5-32B/README.md) to provision a TPU VM and SSH into it.
+
+## Step 2: Setup and Configuration
+
+On your TPU VM, clone this repository (if you haven't already) and navigate to this directory.
+
+```bash
+cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/
+```
+
+Now, run the `setup` command. This will create a `.env` file for your configuration.
+
+```bash
+make setup
+```
+
+After running, edit the new `.env` file to add your Hugging Face token.
+
+```bash
+nano .env
+```
+
+## Step 3: Run the Server
+
+The `Makefile` provides simple commands to manage the server lifecycle.
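+
+Each target is a thin wrapper around `docker compose`; for example, `make up` is roughly equivalent to running the following yourself:
+
+```bash
+sudo -E docker compose up -d
+```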
+ +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the benchmark with a single command: + ```bash + make benchmark + ``` + +* **Open a shell for manual testing:** + If you want to run `curl` commands manually or debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml new file mode 100644 index 0000000..ea89544 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml @@ -0,0 +1,25 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + - VLLM_USE_V1=${VLLM_USE_V1} + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} + --max-num-seqs ${MAX_NUM_SEQS} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example new file mode 100644 index 0000000..6ebd028 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/.env.example @@ -0,0 +1,28 @@ +# This file provides the default configuration for running the Qwen2.5-VL-7B model. +# Copy this file to .env and edit it with your specific values. + +# Your username, used by the Makefile for container operations. 
+# This will be set automatically by 'make setup' +USER=your-username + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# --- Server Configuration --- +SEED=42 +GPU_MEMORY_UTILIZATION=0.98 +DISABLE_LOG_REQUESTS=--disable-log-requests + +# --- Qwen2.5-VL-7B Configuration --- +SHM_SIZE=17gb +MODEL_NAME=Qwen/Qwen2.5-VL-7B-Instruct +MAX_MODEL_LEN=16384 +TP=1 +# Multi-modal parameters +LIMIT_MM_PER_PROMPT='{"image": 10, "video": 0}' +MM_PROCESSOR_KWARGS='{"max_pixels": 1003520}' +GUIDED_DECODING_BACKEND=xgrammar +DISABLE_CHUNKED_MM_INPUT=--disable-chunked-mm-input diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile new file mode 100644 index 0000000..10a4456 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/Makefile @@ -0,0 +1,70 @@ +# Load environment variables from .env file, if it exists +-include .env +export + +# Default to the 7B VL model if not set in .env +MODEL_NAME ?= Qwen/Qwen2.5-VL-7B-Instruct + +.PHONY: help up down logs shell clean setup benchmark + +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " setup Create .env file from the example" + @echo " up Start the vLLM server in the background" + @echo " down Stop and remove the vLLM server" + @echo " logs Follow the server logs" + @echo " shell Get a bash shell inside the running container" + @echo " benchmark Run the multi-modal benchmark test against the server" + @echo " clean Remove the .env file" + +# Creates the .env file if it doesn't exist +setup: + @if [ ! -f .env ]; then \ + echo "Creating .env file..."; \ + cp .env.example .env; \ + sed -i'' -e "s/your-username/$(USER)/g" .env; \ + echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \ + else \ + echo ".env file already exists."; \ + fi + +# The -E flag preserves environment variables for the container +up: setup + sudo -E docker compose up -d + +down: + sudo docker compose down + +logs: + sudo docker compose logs -f + +# Get a bash shell inside the running container +shell: + sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash + +benchmark: + @echo "Running multi-modal benchmark inside the container..." + @sudo docker exec -it \ + -e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \ + -e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \ + $(shell sudo docker compose ps -q vllm) \ + bash -c ' \ + cd /workspace/vllm && \ + pip install -q datasets && \ + vllm bench serve \ + --model "$${MODEL_NAME}" \ + --dataset-name random-mm \ + --num-prompts 128 \ + --backend openai-chat \ + --endpoint "/v1/chat/completions" \ + --random-mm-bucket-config "{(736, 736, 1): 1.0}" \ + --random-mm-base-items-per-request 6 \ + --random-mm-num-mm-items-range-ratio 0.67 \ + --random-mm-limit-mm-per-prompt "{\"image\": 10, \"video\": 0}" \ + --seed=$${SEED} \ + ' + +clean: + rm -f .env \ No newline at end of file diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md new file mode 100644 index 0000000..ee66e72 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/README.md @@ -0,0 +1,58 @@ +# Serve Qwen2.5-VL with vLLM using Docker Compose and Make + +This guide provides a streamlined workflow for serving the multi-modal Qwen2.5-VL-7B model using `docker-compose` and a `Makefile`. 
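+
+Compared to the text-only recipes, this one also passes the multi-modal flags from `.env` (`--limit-mm-per-prompt`, `--mm-processor-kwargs`, `--guided-decoding-backend`, `--disable-chunked-mm-input`). Once the server is running (Step 3), you can exercise the OpenAI-compatible chat endpoint with an image request along these lines (a sketch; the image URL is a placeholder):
+
+```bash
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
+    "messages": [{
+      "role": "user",
+      "content": [
+        {"type": "text", "text": "Describe this image."},
+        {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}}
+      ]
+    }],
+    "max_tokens": 128
+  }'
+```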
+ +## Step 1: Create and Access your TPU VM + +Follow the instructions in the [original recipe](../../Qwen2.5-VL/README.md) to provision a TPU VM and SSH into it. + +## Step 2: Setup and Configuration + +On your TPU VM, clone this repository (if you haven't already) and navigate to this directory. + +```bash +cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/ +``` + +Run the `setup` command to create your `.env` file. + +```bash +make setup +``` + +After running, edit the new `.env` file to add your Hugging Face token. + +```bash +nano .env +``` + +## Step 3: Run the Server + +The `Makefile` provides simple commands to manage the server lifecycle. + +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the multi-modal benchmark with a single command: + ```bash + make benchmark + ``` + +* **Open a shell for manual testing:** + If you want to debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` diff --git a/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml new file mode 100644 index 0000000..9dc8526 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen2.5-VL/docker-compose.yml @@ -0,0 +1,31 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + # Pass multi-modal vars to the container for the benchmark command + - LIMIT_MM_PER_PROMPT=${LIMIT_MM_PER_PROMPT} + - MM_PROCESSOR_KWARGS=${MM_PROCESSOR_KWARGS} + - GUIDED_DECODING_BACKEND=${GUIDED_DECODING_BACKEND} + - DISABLE_CHUNKED_MM_INPUT=${DISABLE_CHUNKED_MM_INPUT} + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} + --limit-mm-per-prompt ${LIMIT_MM_PER_PROMPT} + --mm-processor-kwargs ${MM_PROCESSOR_KWARGS} + --guided-decoding-backend ${GUIDED_DECODING_BACKEND} + ${DISABLE_CHUNKED_MM_INPUT} diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/.env.example b/inference/trillium/vLLM/docker-compose/Qwen3/.env.example new file mode 100644 index 0000000..8ef63b7 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/.env.example @@ -0,0 +1,35 @@ +# This file provides the default configuration for running Qwen3 models. +# Copy this file to .env and edit it with your specific values. + +# Your username, used by the Makefile for container operations. +# This will be set automatically by 'make setup' +USER=your-username + +# Docker image to use +DOCKER_URI=vllm/vllm-tpu:nightly + +# Your Hugging Face token for downloading models +HF_TOKEN= + +# --- Server Configuration --- +SEED=42 +GPU_MEMORY_UTILIZATION=0.98 +DISABLE_LOG_REQUESTS=--disable-log-requests + +# --- Qwen3-32B Configuration (Default) --- +SHM_SIZE=100gb +MODEL_NAME=Qwen/Qwen3-32B +MAX_MODEL_LEN=4096 +TP=4 +MAX_NUM_BATCHED_TOKENS=2048 +MAX_NUM_SEQS=256 + +# --- Qwen3-4B Configuration --- +# Uncomment the lines below to switch to the 4B model. 
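+# (The 4B model is small enough to run with TP=1 on a single chip, which is
+# why the commented values below also use a much smaller SHM_SIZE.)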
+# +# SHM_SIZE=10gb +# MODEL_NAME=Qwen/Qwen3-4B +# MAX_MODEL_LEN=4096 +# TP=1 +# MAX_NUM_BATCHED_TOKENS=1024 +# MAX_NUM_SEQS=128 diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/Makefile b/inference/trillium/vLLM/docker-compose/Qwen3/Makefile new file mode 100644 index 0000000..a9953dc --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/Makefile @@ -0,0 +1,66 @@ +# Load environment variables from .env file, if it exists +-include .env +export + +# Default to the 4B model if not set in .env +MODEL_NAME ?= Qwen/Qwen3-4B + +.PHONY: help up down logs shell clean setup benchmark + +help: + @echo "Usage: make [target]" + @echo "" + @echo "Targets:" + @echo " setup Create .env file from the example" + @echo " up Start the vLLM server in the background" + @echo " down Stop and remove the vLLM server" + @echo " logs Follow the server logs" + @echo " shell Get a bash shell inside the running container" + @echo " benchmark Run the benchmark test against the server" + @echo " clean Remove the .env file" + +# Creates the .env file if it doesn't exist +setup: + @if [ ! -f .env ]; then \ + echo "Creating .env file..."; \ + cp .env.example .env; \ + sed -i'' -e "s/your-username/$(USER)/g" .env; \ + echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \ + else \ + echo ".env file already exists."; \ + fi + +# The -E flag preserves environment variables for the container +up: setup + sudo -E docker compose up -d + +down: + sudo docker compose down + +logs: + sudo docker compose logs -f + +# Get a bash shell inside the running container +shell: + sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash + +benchmark: + @echo "Running benchmark inside the container..." + @sudo docker exec -it \ + -e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \ + -e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \ + $(shell sudo docker compose ps -q vllm) \ + bash -c ' \ + cd /workspace/vllm && \ + pip install -q datasets && \ + vllm bench serve \ + --model "$${MODEL_NAME}" \ + --dataset-name random \ + --num-prompts 1000 \ + --random-input-len=1800 \ + --random-output-len=128 \ + --seed=$${SEED} \ + ' + +clean: + rm -f .env diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/README.md b/inference/trillium/vLLM/docker-compose/Qwen3/README.md new file mode 100644 index 0000000..d53c8fc --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/README.md @@ -0,0 +1,58 @@ +# Serve Qwen3 with vLLM using Docker Compose and Make + +This guide provides a streamlined workflow for serving Qwen3 models using `docker-compose` and a `Makefile` to simplify all commands. + +## Step 1: Create and Access your TPU VM + +Follow the instructions in the [original recipe](../../Qwen3/README.md) to provision a TPU VM and SSH into it. + +## Step 2: Setup and Configuration + +On your TPU VM, clone this repository (if you haven't already) and navigate to this directory. + +```bash +cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen3/ +``` + +Now, run the `setup` command. This will create a `.env` file for your configuration. + +```bash +make setup +``` + +After running, edit the new `.env` file to add your Hugging Face token. You can also switch between the 32B and 4B models by commenting and uncommenting the relevant lines. + +```bash +nano .env +``` + +## Step 3: Run the Server + +The `Makefile` provides simple commands to manage the server lifecycle. 
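+
+Once the server is up (see `make logs` below for the readiness message), you can send a quick request from the VM to the OpenAI-compatible API. A minimal sketch, assuming the default `MODEL_NAME` from `.env.example`:
+
+```bash
+curl http://localhost:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model": "Qwen/Qwen3-32B", "prompt": "The capital of France is", "max_tokens": 16}'
+```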
+ +* **Start the server:** + ```bash + make up + ``` +* **Follow the logs:** + ```bash + make logs + ``` + The server is ready when you see the message: `Application startup complete.` + +* **Run the benchmark:** + Once the server is ready, you can run the benchmark with a single command: + ```bash + make benchmark + ``` + +* **Open a shell for manual testing:** + If you want to run `curl` commands manually or debug inside the container, you can use: + ```bash + make shell + ``` + +* **Stop the server:** + ```bash + make down + ``` diff --git a/inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml b/inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml new file mode 100644 index 0000000..115274b --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/Qwen3/docker-compose.yml @@ -0,0 +1,24 @@ +services: + vllm: + image: ${DOCKER_URI:-vllm/vllm-tpu:nightly} + privileged: true + network_mode: "host" + volumes: + - /dev/shm:/dev/shm + shm_size: ${SHM_SIZE} + ports: + - "8000:8000" + environment: + - HF_HOME=/dev/shm + - HF_TOKEN=${HF_TOKEN} + - SEED=${SEED} + - MODEL_NAME=${MODEL_NAME} + command: > + vllm serve ${MODEL_NAME} + --seed ${SEED} + ${DISABLE_LOG_REQUESTS} + --gpu-memory-utilization ${GPU_MEMORY_UTILIZATION} + --max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS} + --max-num-seqs ${MAX_NUM_SEQS} + --tensor-parallel-size ${TP} + --max-model-len ${MAX_MODEL_LEN} diff --git a/inference/trillium/vLLM/docker-compose/README.md b/inference/trillium/vLLM/docker-compose/README.md new file mode 100644 index 0000000..e8ec309 --- /dev/null +++ b/inference/trillium/vLLM/docker-compose/README.md @@ -0,0 +1,12 @@ +# vLLM Recipes with Docker Compose + +This directory provides an alternative workflow for running the vLLM recipes using `docker-compose`. + +Using `docker-compose` simplifies the process by abstracting away the long, complex `docker run` commands into declarative `docker-compose.yml` files. Configuration is managed via `.env` files, making it easier to switch between models and settings. + +## Available Recipes + +* [Llama3.1](./Llama3.1/README.md) +* [Qwen3](./Qwen3/README.md) +* [Qwen2.5-32B](./Qwen2.5-32B/README.md) +* [Qwen2.5-VL](./Qwen2.5-VL/README.md)
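+
+## Typical Workflow
+
+Each recipe follows the same pattern. For example, with Qwen3 (assuming the repository is cloned to `~/tpu-recipes`, as in the recipe READMEs):
+
+```bash
+cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen3
+make setup      # create .env from .env.example
+nano .env       # add your HF_TOKEN and adjust the model settings
+make up         # start the vLLM server in the background
+make logs       # wait for "Application startup complete."
+make benchmark  # optional: run the benchmark against the server
+make down       # stop and remove the server
+```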