37 changes: 37 additions & 0 deletions inference/trillium/vLLM/docker-compose/Llama3.1/.env.example
@@ -0,0 +1,37 @@
# This file provides the default configuration for running Llama3.1 models.
# Copy this file to .env and edit it with your specific values.

# Docker image to use
DOCKER_URI=vllm/vllm-tpu:nightly

# Your Hugging Face token for downloading models
HF_TOKEN=<your-hugging-face-token-here>

# Your username, used to tag the container; 'make setup' fills this in automatically.
USER=your-username

# --- Server Configuration ---
# Seed for reproducibility
SEED=42
# GPU memory utilization
GPU_MEMORY_UTILIZATION=0.98
# Add '--disable-log-requests' to disable request logging, or leave blank to enable.
DISABLE_LOG_REQUESTS=--disable-log-requests

# --- Llama3.1-70B Configuration (Default) ---
SHM_SIZE=150gb
MODEL_NAME=meta-llama/Llama-3.1-70B-Instruct
MAX_MODEL_LEN=4096
TP=8
MAX_NUM_BATCHED_TOKENS=2048
MAX_NUM_SEQS=256

# --- Llama3.1-8B Configuration ---
# Uncomment the lines below to switch to the 8B model.
#
# SHM_SIZE=17gb
# MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
# MAX_MODEL_LEN=4096
# TP=1
# MAX_NUM_BATCHED_TOKENS=1024
# MAX_NUM_SEQS=128
59 changes: 59 additions & 0 deletions inference/trillium/vLLM/docker-compose/Llama3.1/Makefile
@@ -0,0 +1,59 @@
.PHONY: help up down logs shell clean setup benchmark

help:
@echo "Usage: make [target]"
@echo ""
@echo "Targets:"
@echo " setup Create .env file from the example"
@echo " up Start the vLLM server in the background"
@echo " down Stop and remove the vLLM server"
@echo " logs Follow the server logs"
@echo " shell Get a bash shell inside the running container"
@echo " benchmark Run the benchmark test against the server"
@echo " clean Remove the .env file"

# Creates the .env file if it doesn't exist
setup:
@if [ ! -f .env ]; then \
echo "Creating .env file..."; \
cp .env.example .env; \
sed -i'' -e "s/your-username/$(USER)/g" .env; \
echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \
else \
echo ".env file already exists."; \
fi

# sudo -E preserves your shell environment so docker compose can read variables you have exported
up: setup
sudo -E docker compose up -d

down:
sudo docker compose down

logs:
sudo docker compose logs -f

# Get a bash shell inside the running container
shell:
sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash

benchmark:
@echo "Running benchmark inside the container..."
@sudo docker exec -it \
-e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \
-e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \
$(shell sudo docker compose ps -q vllm) \
bash -c ' \
cd /workspace/vllm && \
pip install -q datasets && \
vllm bench serve \
--model "$${MODEL_NAME}" \
--dataset-name random \
--num-prompts 1000 \
--random-input-len=1800 \
--random-output-len=128 \
--seed=$${SEED} \
'

clean:
rm -f .env
83 changes: 83 additions & 0 deletions inference/trillium/vLLM/docker-compose/Llama3.1/README.md
@@ -0,0 +1,83 @@
# Serve Llama3.1 with vLLM using Docker Compose and Make

This guide provides a streamlined workflow for serving Llama3.1 models using `docker-compose` and a `Makefile` to simplify all commands.

## Step 1: Create and Access your TPU VM

Follow the instructions in the [original recipe](../../Llama3.1/README.md) to provision a TPU VM and SSH into it.

## Step 2: Setup and Configuration

On your TPU VM, clone this repository (if you haven't already) and navigate to this directory.

```bash
cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Llama3.1/
```

Now, run the `setup` command. This will create a `.env` file for your configuration.

```bash
make setup
```

After running, edit the new `.env` file to add your `HF_TOKEN`. You can also switch between the 70B and 8B models by commenting and uncommenting the relevant lines.

```bash
nano .env
```
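
For example, to serve the 8B model instead of the default 70B, the edited file would have your token filled in and the 8B block uncommented (these values simply mirror the defaults shipped in `.env.example`):

```bash
HF_TOKEN=<your-hugging-face-token-here>

# 8B configuration enabled (comment out the 70B block above it):
SHM_SIZE=17gb
MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct
MAX_MODEL_LEN=4096
TP=1
MAX_NUM_BATCHED_TOKENS=1024
MAX_NUM_SEQS=128
```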

## Step 3: Run the Server

The `Makefile` provides simple commands to manage the server lifecycle.

* **Start the server:**
```bash
make up
```
* **Follow the logs:**
```bash
make logs
```
The server is ready when you see the message: `Application startup complete.`

* **Run the benchmark:**
Once the server is ready, you can run the benchmark with a single command:
```bash
make benchmark
```
This command will enter the container, install the necessary dependencies, and run the benchmark for you.

* **Open a shell for manual testing:**
If you want to run `curl` commands manually (see the example request after this list) or debug inside the container, you can use:
```bash
make shell
```

* **Stop the server:**
```bash
make down
```
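
As a quick manual check, the server exposes vLLM's OpenAI-compatible API on port 8000, and because the container uses host networking this works from the TPU VM as well as from inside the container. The prompt and token count below are only illustrative; change the model name if you switched to the 8B configuration:

```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3.1-70B-Instruct",
    "prompt": "San Francisco is a",
    "max_tokens": 32
  }'
```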

---

## Advanced Usage

### Pulling a Newer Image

The service uses a prebuilt image rather than a local `build:` section, so `--build` will not refresh it. To pick up a newer version of the nightly image, pull it explicitly and then recreate the container:

```bash
sudo docker compose pull
sudo -E docker compose up -d
```

### Removing the Docker Image

To manually remove the Docker image from your TPU VM, first ensure the services are down, then use the `docker rmi` command.

```bash
# Stop and remove the containers
make down

# Remove the image
sudo docker rmi $(grep "^DOCKER_URI=" .env | cut -d '=' -f2)
```
27 changes: 27 additions & 0 deletions inference/trillium/vLLM/docker-compose/Llama3.1/docker-compose.yml
@@ -0,0 +1,27 @@
services:
vllm:
image: ${DOCKER_URI:-vllm/vllm-tpu:nightly}
privileged: true
network_mode: "host"
volumes:
- /dev/shm:/dev/shm
shm_size: ${SHM_SIZE}
# The 'ports' directive is ignored when network_mode is "host", but kept for clarity.
ports:
- "8000:8000"
environment:
- HF_HOME=/dev/shm
- HF_TOKEN=${HF_TOKEN}
- SEED=${SEED}
- MODEL_NAME=${MODEL_NAME}
# The command to start the vLLM server.
# All variables are sourced from the .env file.
command: >
vllm serve ${MODEL_NAME}
--seed ${SEED}
${DISABLE_LOG_REQUESTS}
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION}
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}
--max-num-seqs ${MAX_NUM_SEQS}
--tensor-parallel-size ${TP}
--max-model-len ${MAX_MODEL_LEN}
28 changes: 28 additions & 0 deletions inference/trillium/vLLM/docker-compose/Qwen2.5-32B/.env.example
@@ -0,0 +1,28 @@
# This file provides the default configuration for running the Qwen2.5-32B model.
# Copy this file to .env and edit it with your specific values.

# Your username, used by the Makefile for container operations.
# This will be set automatically by 'make setup'
USER=your-username

# Docker image to use
DOCKER_URI=vllm/vllm-tpu:nightly

# Your Hugging Face token for downloading models
HF_TOKEN=<your-hugging-face-token-here>

# --- Server Configuration ---
SEED=42
GPU_MEMORY_UTILIZATION=0.98
DISABLE_LOG_REQUESTS=--disable-log-requests
# This model requires the V1 API flag
VLLM_USE_V1=1

# --- Qwen2.5-32B Configuration ---
# Note: The original recipe specified 10gb, but 100gb is a more realistic value for a 32B model.
SHM_SIZE=100gb
MODEL_NAME=Qwen/Qwen2.5-32B
MAX_MODEL_LEN=4096
TP=4
MAX_NUM_BATCHED_TOKENS=2048
MAX_NUM_SEQS=128
66 changes: 66 additions & 0 deletions inference/trillium/vLLM/docker-compose/Qwen2.5-32B/Makefile
@@ -0,0 +1,66 @@
# Load environment variables from .env file, if it exists
-include .env
export

# Default to the 32B model if not set in .env
MODEL_NAME ?= Qwen/Qwen2.5-32B

.PHONY: help up down logs shell clean setup benchmark

help:
@echo "Usage: make [target]"
@echo ""
@echo "Targets:"
@echo " setup Create .env file from the example"
@echo " up Start the vLLM server in the background"
@echo " down Stop and remove the vLLM server"
@echo " logs Follow the server logs"
@echo " shell Get a bash shell inside the running container"
@echo " benchmark Run the benchmark test against the server"
@echo " clean Remove the .env file"

# Creates the .env file if it doesn't exist
setup:
@if [ ! -f .env ]; then \
echo "Creating .env file..."; \
cp .env.example .env; \
sed -i'' -e "s/your-username/$(USER)/g" .env; \
echo "Created .env and set USER to '$(USER)'. Please edit it to add your Hugging Face token."; \
else \
echo ".env file already exists."; \
fi

# sudo -E preserves your shell environment so docker compose can read variables you have exported
up: setup
sudo -E docker compose up -d

down:
sudo docker compose down

logs:
sudo docker compose logs -f

# Get a bash shell inside the running container
shell:
sudo docker exec -it $(shell sudo docker compose ps -q vllm) bash

benchmark:
@echo "Running benchmark inside the container..."
@sudo docker exec -it \
-e MODEL_NAME=$(shell grep '^MODEL_NAME=' .env | cut -d= -f2) \
-e SEED=$(shell grep '^SEED=' .env | cut -d= -f2) \
$(shell sudo docker compose ps -q vllm) \
bash -c ' \
cd /workspace/vllm && \
pip install -q datasets && \
vllm bench serve \
--model "$${MODEL_NAME}" \
--dataset-name random \
--num-prompts 1000 \
--random-input-len=1800 \
--random-output-len=128 \
--seed=$${SEED} \
'

clean:
rm -f .env
58 changes: 58 additions & 0 deletions inference/trillium/vLLM/docker-compose/Qwen2.5-32B/README.md
@@ -0,0 +1,58 @@
# Serve Qwen2.5-32B with vLLM using Docker Compose and Make

This guide provides a streamlined workflow for serving the Qwen2.5-32B model using `docker-compose` and a `Makefile` to simplify all commands.

## Step 1: Create and Access your TPU VM

Follow the instructions in the [original recipe](../../Qwen2.5-32B/README.md) to provision a TPU VM and SSH into it.

## Step 2: Setup and Configuration

On your TPU VM, clone this repository (if you haven't already) and navigate to this directory.

```bash
cd ~/tpu-recipes/inference/trillium/vLLM/docker-compose/Qwen2.5-32B/
```

Now, run the `setup` command. This will create a `.env` file for your configuration.

```bash
make setup
```

After running, edit the new `.env` file to add your Hugging Face token.

```bash
nano .env
```
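
Only the token normally needs to change; the rest of the defaults from `.env.example` can stay as they are. After editing, the key lines look like:

```bash
HF_TOKEN=<your-hugging-face-token-here>
MODEL_NAME=Qwen/Qwen2.5-32B
TP=4
MAX_MODEL_LEN=4096
```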

## Step 3: Run the Server

The `Makefile` provides simple commands to manage the server lifecycle.

* **Start the server:**
```bash
make up
```
* **Follow the logs:**
```bash
make logs
```
The server is ready when you see the message: `Application startup complete.`

* **Run the benchmark:**
Once the server is ready, you can run the benchmark with a single command:
```bash
make benchmark
```

* **Open a shell for manual testing:**
If you want to run `curl` commands manually (see the example request after this list) or debug inside the container, you can use:
```bash
make shell
```

* **Stop the server:**
```bash
make down
```
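
As a quick manual check, the server exposes vLLM's OpenAI-compatible API on port 8000, and because the container uses host networking this works from the TPU VM as well as from inside the container. The prompt and token count below are only illustrative:

```bash
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2.5-32B",
    "prompt": "San Francisco is a",
    "max_tokens": 32
  }'
```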
25 changes: 25 additions & 0 deletions inference/trillium/vLLM/docker-compose/Qwen2.5-32B/docker-compose.yml
@@ -0,0 +1,25 @@
services:
vllm:
image: ${DOCKER_URI:-vllm/vllm-tpu:nightly}
privileged: true
network_mode: "host"
volumes:
- /dev/shm:/dev/shm
shm_size: ${SHM_SIZE}
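# The 'ports' directive is ignored when network_mode is "host", but kept for clarity.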
ports:
- "8000:8000"
environment:
- HF_HOME=/dev/shm
- HF_TOKEN=${HF_TOKEN}
- SEED=${SEED}
- MODEL_NAME=${MODEL_NAME}
- VLLM_USE_V1=${VLLM_USE_V1}
command: >
vllm serve ${MODEL_NAME}
--seed ${SEED}
${DISABLE_LOG_REQUESTS}
--gpu-memory-utilization ${GPU_MEMORY_UTILIZATION}
--max-num-batched-tokens ${MAX_NUM_BATCHED_TOKENS}
--max-num-seqs ${MAX_NUM_SEQS}
--tensor-parallel-size ${TP}
--max-model-len ${MAX_MODEL_LEN}