From cecdde8ec9562b69a441e14896d0f325b472dc2c Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:09:10 -0400 Subject: [PATCH 1/9] docs for offline inference --- .../offline-inference-infra/README.md | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/README.md diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md new file mode 100644 index 0000000..803e977 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -0,0 +1,120 @@ +Here’s your rewritten `README.md`, styled similarly to the CPU inference blueprint but focused on **offline GPU inference using the SGLang backend**. + +# Offline Inference Blueprint - Infra (SGLang + vLLM) + +This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. + +This blueprint enables you to: +- Run inference locally on GPU nodes using pre-loaded models +- Benchmark token throughput, latency, and request performance +- Push results to MLflow for comparison and analysis + +--- + +## Pre-Filled Samples + +| Title | Description | +|------------------------------|-----------------------------------------------------------------------------| +|Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | + +You can access these pre-filled samples from the OCI AI Blueprint portal. + +--- +## When to use Offline inference + +Offline inference is ideal for: +- Accurate performance benchmarking (no API or network bottlenecks) +- Comparing GPU hardware performance (A10, A100, H100, MI300X) +- Evaluating backend frameworks like vLLM and SGLang + +--- + +## Supported Backends + +| Backend | Description | +|----------|--------------------------------------------------------------| +| sglang | Fast multi-modal LLM backend with optimized throughput | +| vllm | Token streaming inference engine for LLMs with speculative decoding | + +--- + +## Running the Benchmark + +This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. 
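To make that flow concrete, here is a minimal, hypothetical sketch of what such an offline run does when the vLLM backend is selected: read the mounted YAML config, generate completions for a batch of synthetic prompts, and push throughput numbers to MLflow. It is not the blueprint's container code; the vLLM Python API usage and the config keys (taken from the sample vLLM config in this document) are assumptions, and the config path is read from the command line the way the recipe passes it via `recipe_container_command_args`.

```python
# Illustrative sketch only -- not the blueprint's container code.
# Mimics the job's flow for the vLLM backend: read the mounted YAML config,
# run offline generation, and log throughput metrics to MLflow.
import sys
import time

import mlflow
import yaml
from vllm import LLM, SamplingParams

cfg_path = sys.argv[1]  # the recipe passes the config path as the container command arg
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)

llm = LLM(
    model=cfg["model"],  # e.g. /models/NousResearch/Meta-Llama-3.1-8B
    tokenizer=cfg.get("tokenizer", cfg["model"]),
    dtype=cfg.get("dtype", "auto"),
    gpu_memory_utilization=cfg.get("gpu_memory_utilization", 0.9),
    tensor_parallel_size=cfg.get("tensor_parallel_size", 1),
)
params = SamplingParams(
    max_tokens=cfg["output_len"],
    temperature=cfg.get("temperature", 0.7),
    top_p=cfg.get("top_p", 0.9),
)
# Crude synthetic prompts; a real run would size these to exactly input_len tokens.
prompts = ["benchmark prompt " * cfg["input_len"]] * cfg["num_prompts"]

start = time.perf_counter()
outputs = llm.generate(prompts, params)
elapsed = time.perf_counter() - start

total_in = sum(len(o.prompt_token_ids) for o in outputs)
total_out = sum(len(o.outputs[0].token_ids) for o in outputs)

mlflow.set_tracking_uri(cfg["mlflow_uri"])
mlflow.set_experiment(cfg["experiment_name"])
with mlflow.start_run(run_name=cfg["run_name"]):
    mlflow.log_metrics({
        "requests_per_second": len(prompts) / elapsed,
        "input_tokens_per_second": total_in / elapsed,
        "output_tokens_per_second": total_out / elapsed,
        "total_tokens_per_second": (total_in + total_out) / elapsed,
        "elapsed_time": elapsed,
        "total_input_tokens": total_in,
        "total_output_tokens": total_out,
    })
```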
+ +--- + +### Sample Recipe (Job Mode for Offline SGLang Inference) + +```json +{ + "recipe_id": "offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "recipe_container_command_args": [ + "/models/example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 +} +``` + +--- + +## Sample Config File (`example_sglang.yaml`) + +```yaml +benchmark_type: offline +offline_backend: sglang + +model_path: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B +trust_remote_code: true +conv_template: llama-2 + +input_len: 128 +output_len: 128 +num_prompts: 64 +max_seq_len: 4096 +max_batch_size: 8 +dtype: auto +temperature: 0.7 +top_p: 0.9 + +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: "sglang-bench-doc-test-new" +run_name: "llama3-8b-sglang-test" +``` + +--- + +## Metrics Logged + +- `requests_per_second` +- `input_tokens_per_second` +- `output_tokens_per_second` +- `total_tokens_per_second` +- `elapsed_time` +- `total_input_tokens` +- `total_output_tokens` + +If a dataset is provided: +- `accuracy` From 011f7fe29817e79e1c2f4477082f7a1c2160e0e4 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:18:56 -0400 Subject: [PATCH 2/9] removed edit line --- docs/sample_blueprints/offline-inference-infra/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index 803e977..d0bb6d0 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -1,5 +1,3 @@ -Here’s your rewritten `README.md`, styled similarly to the CPU inference blueprint but focused on **offline GPU inference using the SGLang backend**. - # Offline Inference Blueprint - Infra (SGLang + vLLM) This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. 
From 66ecb28faf285bbe2f06c7fe633dc504394a4134 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:22:37 -0400 Subject: [PATCH 3/9] online inference readme --- .../online-inference-infra/README.md | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docs/sample_blueprints/online-inference-infra/README.md diff --git a/docs/sample_blueprints/online-inference-infra/README.md b/docs/sample_blueprints/online-inference-infra/README.md new file mode 100644 index 0000000..8b1f4bf --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/README.md @@ -0,0 +1,104 @@ +# Online Inference Blueprint (LLMPerf) + +This blueprint benchmarks **online inference performance** of large language models using **LLMPerf**, a standardized benchmarking tool. It is designed to evaluate LLM APIs served via platforms such as OpenAI-compatible interfaces, including self-hosted LLM inference endpoints. + +This blueprint helps: +- Simulate real-time request load on a running model server +- Measure end-to-end latency, throughput, and completion performance +- Push results to MLflow for visibility and tracking + +--- + +## Pre-Filled Samples + +| Title | Description | +|----------------------------------------|-----------------------------------------------------------------------------| +|Online inference on LLaMA 3 using LLMPerf|Benchmark of meta/llama3-8b-instruct via a local OpenAI-compatible endpoint | + +These can be accessed directly from the OCI AI Blueprint portal. + +--- + +## Prerequisites + +Before running this blueprint: +- You **must have an inference server already running**, compatible with the OpenAI API format. +- Ensure the endpoint and model name match what’s defined in the config. + +--- + +## Supported Scenarios + +| Use Case | Description | +|-----------------------|-------------------------------------------------------| +| Local LLM APIs | Benchmark your own self-hosted models (e.g., vLLM) | +| Remote OpenAI API | Benchmark OpenAI deployments for throughput analysis | +| Multi-model endpoints | Test latency/throughput across different configurations | + +--- + +### Sample Recipe (Job Mode for Online Benchmarking) + +```json +{ + "recipe_id": "online_inference_benchmark", + "recipe_mode": "job", + "deployment_name": "Online Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 100, + "include": [ + "example_online.yaml" + ] + } + ], + "recipe_container_command_args": [ + "/models/example_online.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100 +} +``` + +--- + +## Sample Config File (`example_online.yaml`) + +```yaml +benchmark_type: online + +model: meta/llama3-8b-instruct +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /workspace/results_on +llm_api: openai +llm_api_key: dummy-key +llm_api_base: http://localhost:8001/v1 + +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=localhost +``` + +--- + +## Metrics 
Logged + +- `output_tokens_per_second` +- `requests_per_minute` +- `overall_output_throughput` +- All raw metrics from the `_summary.json` output of LLMPerf + +--- From 8ec78b4a67955e9a7ba1170c3a4ea0a2465f3864 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 23:16:40 -0400 Subject: [PATCH 4/9] better readme with extra pre-filled samples for offline inference --- .../offline-inference-infra/README.md | 192 +++++++++++++++--- 1 file changed, 165 insertions(+), 27 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index d0bb6d0..f67861b 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -14,6 +14,7 @@ This blueprint enables you to: | Title | Description | |------------------------------|-----------------------------------------------------------------------------| |Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | +|Offline inference with LLAMA 3- vLLM| Benchmarks Meta-Llama-3.1-8B model using vLLM on VM.GPU.A10.2 with 2 GPUs.| You can access these pre-filled samples from the OCI AI Blueprint portal. @@ -46,33 +47,41 @@ This blueprint supports benchmark execution via a job-mode recipe using a YAML c ```json { - "recipe_id": "offline_inference_sglang", - "recipe_mode": "job", - "deployment_name": "Offline Inference Benchmark", - "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", - "recipe_node_shape": "VM.GPU.A10.2", - "input_object_storage": [ - { - "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", - "mount_location": "/models", - "volume_size_in_gbs": 500, - "include": [ - "example_sglang.yaml", - "NousResearch/Meta-Llama-3.1-8B" - ] - } - ], - "recipe_container_command_args": [ - "/models/example_sglang.yaml" - ], - "recipe_replica_count": 1, - "recipe_container_port": "8000", - "recipe_nvidia_gpu_count": 2, - "recipe_node_pool_size": 1, - "recipe_node_boot_volume_size_in_gbs": 200, - "recipe_ephemeral_storage_size": 100, - "recipe_shared_memory_volume_size_limit_in_mb": 200 -} + "recipe_id": "offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/new_example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + ``` --- @@ -100,6 +109,43 @@ top_p: 0.9 mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 experiment_name: 
"sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" + + +save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json + +``` + +```yaml +benchmark_type: offline +model: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer: /models/NousResearch/Meta-Llama-3.1-8B + +input_len: 12 +output_len: 12 +num_prompts: 2 +seed: 42 +tensor_parallel_size: 8 + +# vLLM-specific +#quantization: awq +dtype: half +gpu_memory_utilization: 0.99 +num_scheduler_steps: 10 +device: cuda +enforce_eager: true +kv_cache_dtype: auto +enable_prefix_caching: true +distributed_executor_backend: mp + +# Output +#output_json: ./128_128.json + +# MLflow +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: test-bm-suite-doc +run_name: llama3-vllm-test +save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json + ``` --- @@ -116,3 +162,95 @@ run_name: "llama3-8b-sglang-test" If a dataset is provided: - `accuracy` + + +### Top-level Deployment Keys + +| Key | Description | +|-----|-------------| +| `recipe_id` | Identifier of the recipe to run; here, it's an offline SGLang benchmark job. | +| `recipe_mode` | Specifies this is a `job`, meaning it runs to completion and exits. | +| `deployment_name` | Human-readable name for the job. | +| `recipe_image_uri` | Docker image containing the benchmark code and dependencies. | +| `recipe_node_shape` | Shape of the VM or GPU node to run the job (e.g., VM.GPU.A10.2). | + +### Input Object Storage + +| Key | Description | +|-----|-------------| +| `input_object_storage` | List of inputs to mount from Object Storage. | +| `par` | Pre-Authenticated Request (PAR) link to a bucket/folder. | +| `mount_location` | Files are mounted to this path inside the container. | +| `volume_size_in_gbs` | Size of the mount volume. | +| `include` | Only these files/folders from the bucket are mounted (e.g., model + config). | + +### Output Object Storage + +| Key | Description | +|-----|-------------| +| `output_object_storage` | Where to store outputs like benchmark logs or results. | +| `bucket_name` | Name of the output bucket in OCI Object Storage. | +| `mount_location` | Mount point inside container where outputs are written. | +| `volume_size_in_gbs` | Size of this volume in GBs. | + +### Runtime & Infra Settings + +| Key | Description | +|-----|-------------| +| `recipe_container_command_args` | Path to the YAML config that defines benchmark parameters. | +| `recipe_replica_count` | Number of job replicas to run (usually 1 for inference). | +| `recipe_container_port` | Port (optional for offline mode; required if API is exposed). | +| `recipe_nvidia_gpu_count` | Number of GPUs allocated to this job. | +| `recipe_node_pool_size` | Number of nodes in the pool (1 means 1 VM). | +| `recipe_node_boot_volume_size_in_gbs` | Disk size for OS + dependencies. | +| `recipe_ephemeral_storage_size` | Local scratch space in GBs. | +| `recipe_shared_memory_volume_size_limit_in_mb` | Shared memory (used by some inference engines). | + +--- + +## **Sample Config File (`example_sglang.yaml`)** + +This file is consumed by the container during execution to configure the benchmark run. + +### Inference Setup + +| Key | Description | +|-----|-------------| +| `benchmark_type` | Set to `offline` to indicate local execution with no HTTP server. | +| `offline_backend` | Backend engine to use (`sglang` or `vllm`). | +| `model_path` | Path to the model directory (already mounted via Object Storage). 
| +| `tokenizer_path` | Path to the tokenizer (usually same as model path). | +| `trust_remote_code` | Enables loading models that require custom code (Hugging Face). | +| `conv_template` | Prompt formatting template to use (e.g., `llama-2`). | + +### Benchmark Parameters + +| Key | Description | +|-----|-------------| +| `input_len` | Number of tokens in the input prompt. | +| `output_len` | Number of tokens to generate. | +| `num_prompts` | Number of total prompts to run (e.g., 64 prompts x 128 output tokens). | +| `max_seq_len` | Max sequence length supported by the model (e.g., 4096). | +| `max_batch_size` | Max batch size per inference run (depends on GPU memory). | +| `dtype` | Precision (e.g., float16, bfloat16, auto). | + +### Sampling Settings + +| Key | Description | +|-----|-------------| +| `temperature` | Controls randomness in generation (lower = more deterministic). | +| `top_p` | Top-p sampling for diversity (0.9 keeps most probable tokens). | + +### MLflow Logging + +| Key | Description | +|-----|-------------| +| `mlflow_uri` | MLflow server to log performance metrics. | +| `experiment_name` | Experiment name to group runs in MLflow UI. | +| `run_name` | Custom name to identify this particular run. | + +### Output + +| Key | Description | +|-----|-------------| +| `save_metrics_path` | Path inside the container where metrics will be saved as JSON. | From 14f45a01f0163923c42c95628b74945a50dfc2f6 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 12:14:21 -0400 Subject: [PATCH 5/9] added sample json files --- .../offline-inference-infra/README.md | 1 + .../new_example_sglang.yaml | 24 +++++++++++++++ .../offline_vllm_example.yaml | 29 +++++++++++++++++++ .../online_example.yaml | 16 ++++++++++ 4 files changed, 70 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml create mode 100644 docs/sample_blueprints/online-inference-infra/online_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index f67861b..256f63d 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -41,6 +41,7 @@ Offline inference is ideal for: This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. +Notes : Make sure your output object storage is in the same tenancy as your stack. 
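Before submitting the deployment, it can also help to sanity-check the config file locally so the job does not fail after the node is provisioned. The snippet below is a hypothetical pre-flight check, not part of the blueprint; the required keys mirror the sample YAMLs and the key tables above.

```python
# Hypothetical pre-flight check for a benchmark config before deploying the job.
import sys

import yaml

# Keys common to both the SGLang and vLLM sample configs in this document.
REQUIRED = {
    "benchmark_type", "input_len", "output_len", "num_prompts",
    "mlflow_uri", "experiment_name", "run_name", "save_metrics_path",
}

def check(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    missing = REQUIRED - cfg.keys()
    if missing:
        sys.exit(f"{path}: missing keys: {sorted(missing)}")
    if cfg["benchmark_type"] != "offline":
        sys.exit(f"{path}: expected benchmark_type: offline")
    # save_metrics_path should sit under the mount_location declared in
    # output_object_storage in the deployment JSON, or the results are lost.
    print(f"{path}: looks OK (backend: {cfg.get('offline_backend', 'unspecified')})")

if __name__ == "__main__":
    check(sys.argv[1])
```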
--- ### Sample Recipe (Job Mode for Offline SGLang Inference) diff --git a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml b/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml new file mode 100644 index 0000000..1649e7a --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml @@ -0,0 +1,24 @@ +benchmark_type: offline +offline_backend: sglang + +model_path: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B +trust_remote_code: true +conv_template: llama-2 + +input_len: 128 +output_len: 128 +num_prompts: 64 +max_seq_len: 4096 +max_batch_size: 8 +dtype: auto +temperature: 0.7 +top_p: 0.9 + +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: "sglang-bench-doc-test-new" +run_name: "llama3-8b-sglang-test" + + +save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json + diff --git a/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml b/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml new file mode 100644 index 0000000..7734c14 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml @@ -0,0 +1,29 @@ +benchmark_type: offline +model: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer: /models/NousResearch/Meta-Llama-3.1-8B + +input_len: 12 +output_len: 12 +num_prompts: 2 +seed: 42 +tensor_parallel_size: 8 + +# vLLM-specific +#quantization: awq +dtype: half +gpu_memory_utilization: 0.99 +num_scheduler_steps: 10 +device: cuda +enforce_eager: true +kv_cache_dtype: auto +enable_prefix_caching: true +distributed_executor_backend: mp + +# Output +#output_json: ./128_128.json + +# MLflow +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: test-bm-suite-doc +run_name: llama3-vllm-test +save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json diff --git a/docs/sample_blueprints/online-inference-infra/online_example.yaml b/docs/sample_blueprints/online-inference-infra/online_example.yaml new file mode 100644 index 0000000..d4d0fe3 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/online_example.yaml @@ -0,0 +1,16 @@ +benchmark_type: online +model: meta/llama3-8b-instruct +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /workspace/results_on +llm_api: openai +llm_api_key: dummy-key +llm_api_base: http://localhost:8001/v1 +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=localhost From da66e9502b5bae4f304f138ef1fd0545f501e5b1 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 12:18:56 -0400 Subject: [PATCH 6/9] added deployment json files --- .../offline_deployment_sglang.json | 36 +++++++++++++++++++ .../offline_deployment_vllm.json | 36 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json b/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json new file mode 100644 index 0000000..e3b988a --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json @@ -0,0 +1,36 @@ +{ + "recipe_id": 
"offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/new_example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + \ No newline at end of file diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json b/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json new file mode 100644 index 0000000..e920f38 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json @@ -0,0 +1,36 @@ +{ + "recipe_id": "offline_inference_vllm", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark vllm", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/offline_vllm_example.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + \ No newline at end of file From 13e490cb427daa2e03d243df7c1651eb043b9979 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 22:57:36 -0400 Subject: [PATCH 7/9] addressed PR comments --- .../offline-inference-infra/README.md | 55 ++++++++++++++++++- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index 256f63d..bb3b882 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -38,13 +38,20 @@ Offline inference is ideal for: --- ## Running the Benchmark +- Things need to run the benchmark + - Model checkpoints pre-downloaded and stored in an object storage. + - Make sure to get a PAR for the object storage where the models are saved. With listing, write and read perimissions + - A Bucket to save the outputs. 
This does not take a PAR, so should be a bucket in the same tenancy as to where you have your OCI blueprints stack + - Config `.yaml` file that has all the parameters required to run the benhcmark. This includes input_len, output_len, gpu_utilization value etc. + - Deployment `.json` to deploy your blueprint. + - Sample deployment and config files are provided below along with links. This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. Notes : Make sure your output object storage is in the same tenancy as your stack. --- -### Sample Recipe (Job Mode for Offline SGLang Inference) +### [Sample Blueprint (Job Mode for Offline SGLang Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) ```json { @@ -86,8 +93,50 @@ Notes : Make sure your output object storage is in the same tenancy as your stac ``` --- +### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) -## Sample Config File (`example_sglang.yaml`) +```json +{ + "recipe_id": "offline_inference_vllm", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark vllm", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "offline_vllm_example.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/offline_vllm_example.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + +``` + +--- + +## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml) ```yaml benchmark_type: offline @@ -115,7 +164,7 @@ run_name: "llama3-8b-sglang-test" save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json ``` - +## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) ```yaml benchmark_type: offline model: /models/NousResearch/Meta-Llama-3.1-8B From e701cc46b067f1ebc201e204187498e9a9c3cacf Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Thu, 24 Apr 2025 10:05:14 -0400 Subject: [PATCH 8/9] changed file names to indiciate the workload, addressed comments on the PR for offline inference --- .../offline-inference-infra/README.md | 14 ++++---- ...glang.yaml => offline_sglang_example.yaml} | 2 +- 
.../llama3_public_online.yaml | 17 +++++++++ .../online_deployment.json | 35 +++++++++++++++++++ .../online_example.yaml | 16 --------- 5 files changed, 60 insertions(+), 24 deletions(-) rename docs/sample_blueprints/offline-inference-infra/{new_example_sglang.yaml => offline_sglang_example.yaml} (86%) create mode 100644 docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml create mode 100644 docs/sample_blueprints/online-inference-infra/online_deployment.json delete mode 100644 docs/sample_blueprints/online-inference-infra/online_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index bb3b882..b7b91b4 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -66,7 +66,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "mount_location": "/models", "volume_size_in_gbs": 500, "include": [ - "new_example_sglang.yaml", + "offline_sglang_example.yaml", "NousResearch/Meta-Llama-3.1-8B" ] } @@ -74,12 +74,12 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "output_object_storage": [ { "bucket_name": "inference_output", - "mount_location": "/mlcommons_output", + "mount_location": "/benchmarking_output", "volume_size_in_gbs": 200 } ], "recipe_container_command_args": [ - "/models/new_example_sglang.yaml" + "/models/offline_sglang_example.yaml" ], "recipe_replica_count": 1, "recipe_container_port": "8000", @@ -93,7 +93,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac ``` --- -### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) +### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json) ```json { @@ -116,7 +116,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "output_object_storage": [ { "bucket_name": "inference_output", - "mount_location": "/mlcommons_output", + "mount_location": "/benchmarking_output", "volume_size_in_gbs": 200 } ], @@ -161,7 +161,7 @@ experiment_name: "sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" -save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json ``` ## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) @@ -194,7 +194,7 @@ distributed_executor_backend: mp mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 experiment_name: test-bm-suite-doc run_name: llama3-vllm-test -save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_vllm.json ``` diff --git a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml b/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml similarity index 86% rename from docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml rename to 
docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml index 1649e7a..a1ccf27 100644 --- a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml +++ b/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml @@ -20,5 +20,5 @@ experiment_name: "sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" -save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json diff --git a/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml b/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml new file mode 100644 index 0000000..967b5c8 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml @@ -0,0 +1,17 @@ +benchmark_type: online +model: /models/NousResearch/Meta-Llama-3.1-8B-Instruct # Updated model path +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /online_output +llm_api: openai +llm_api_key: dummy-key +llm_api_base: https://llama8bobjvllm.129-80-16-111.nip.io/v1 # Updated to HTTPS +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=public-endpoint +save_metrics_path: /online_output/benchmark_output_llama3_online_public.json \ No newline at end of file diff --git a/docs/sample_blueprints/online-inference-infra/online_deployment.json b/docs/sample_blueprints/online-inference-infra/online_deployment.json new file mode 100644 index 0000000..daeca81 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/online_deployment.json @@ -0,0 +1,35 @@ +{ + "recipe_id": "online_infernece_llmperf", + "recipe_mode": "job", + "deployment_name": "a1", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.Standard.E4.Flex", + "recipe_node_pool_size": 1, + "recipe_flex_shape_ocpu_count": 32, + "recipe_flex_shape_memory_size_in_gbs": 256, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 150, + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "llama3_public_online.yaml" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/online_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/llama3_public_online.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "5678" + } + \ No newline at end of file diff --git a/docs/sample_blueprints/online-inference-infra/online_example.yaml b/docs/sample_blueprints/online-inference-infra/online_example.yaml deleted file mode 100644 index d4d0fe3..0000000 --- a/docs/sample_blueprints/online-inference-infra/online_example.yaml +++ /dev/null @@ -1,16 +0,0 @@ -benchmark_type: online -model: meta/llama3-8b-instruct -input_len: 64 -output_len: 32 -max_requests: 5 -timeout: 300 -num_concurrent: 1 -results_dir: /workspace/results_on -llm_api: openai -llm_api_key: dummy-key -llm_api_base: http://localhost:8001/v1 -experiment_name: local-bench -run_name: llama3-test -mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 -llmperf_path: /opt/llmperf-src -metadata: test=localhost From 
7720110438e2f196819abdf9742ca7a3bd791ba8 Mon Sep 17 00:00:00 2001
From: ssraghavan-oci
Date: Thu, 24 Apr 2025 10:18:58 -0400
Subject: [PATCH 9/9] minor edit - offline readme

---
 docs/sample_blueprints/offline-inference-infra/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md
index b7b91b4..a45c426 100644
--- a/docs/sample_blueprints/offline-inference-infra/README.md
+++ b/docs/sample_blueprints/offline-inference-infra/README.md
@@ -136,7 +136,7 @@
 
 ---
 
-## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml)
+## [Sample Config File SGLang - 1 (`offline_sglang_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml)
 
 ```yaml
 benchmark_type: offline