From cecdde8ec9562b69a441e14896d0f325b472dc2c Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:09:10 -0400 Subject: [PATCH 1/9] docs for offline inference --- .../offline-inference-infra/README.md | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/README.md diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md new file mode 100644 index 0000000..803e977 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -0,0 +1,120 @@ +Here’s your rewritten `README.md`, styled similarly to the CPU inference blueprint but focused on **offline GPU inference using the SGLang backend**. + +# Offline Inference Blueprint - Infra (SGLang + vLLM) + +This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. + +This blueprint enables you to: +- Run inference locally on GPU nodes using pre-loaded models +- Benchmark token throughput, latency, and request performance +- Push results to MLflow for comparison and analysis + +--- + +## Pre-Filled Samples + +| Title | Description | +|------------------------------|-----------------------------------------------------------------------------| +|Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | + +You can access these pre-filled samples from the OCI AI Blueprint portal. + +--- +## When to use Offline inference + +Offline inference is ideal for: +- Accurate performance benchmarking (no API or network bottlenecks) +- Comparing GPU hardware performance (A10, A100, H100, MI300X) +- Evaluating backend frameworks like vLLM and SGLang + +--- + +## Supported Backends + +| Backend | Description | +|----------|--------------------------------------------------------------| +| sglang | Fast multi-modal LLM backend with optimized throughput | +| vllm | Token streaming inference engine for LLMs with speculative decoding | + +--- + +## Running the Benchmark + +This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. 
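To make that flow concrete, here is a minimal, hypothetical sketch of what such an offline run does when the vLLM backend is selected: read the mounted YAML config, generate completions for a batch of synthetic prompts, and push throughput numbers to MLflow. It is not the blueprint's container code; the vLLM Python API usage and the config keys (taken from the sample vLLM config in this document) are assumptions, and the config path is read from the command line the way the recipe passes it via `recipe_container_command_args`.

```python
# Illustrative sketch only -- not the blueprint's container code.
# Mimics the job's flow for the vLLM backend: read the mounted YAML config,
# run offline generation, and log throughput metrics to MLflow.
import sys
import time

import mlflow
import yaml
from vllm import LLM, SamplingParams

cfg_path = sys.argv[1]  # the recipe passes the config path as the container command arg
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)

llm = LLM(
    model=cfg["model"],  # e.g. /models/NousResearch/Meta-Llama-3.1-8B
    tokenizer=cfg.get("tokenizer", cfg["model"]),
    dtype=cfg.get("dtype", "auto"),
    gpu_memory_utilization=cfg.get("gpu_memory_utilization", 0.9),
    tensor_parallel_size=cfg.get("tensor_parallel_size", 1),
)
params = SamplingParams(
    max_tokens=cfg["output_len"],
    temperature=cfg.get("temperature", 0.7),
    top_p=cfg.get("top_p", 0.9),
)
# Crude synthetic prompts; a real run would size these to exactly input_len tokens.
prompts = ["benchmark prompt " * cfg["input_len"]] * cfg["num_prompts"]

start = time.perf_counter()
outputs = llm.generate(prompts, params)
elapsed = time.perf_counter() - start

total_in = sum(len(o.prompt_token_ids) for o in outputs)
total_out = sum(len(o.outputs[0].token_ids) for o in outputs)

mlflow.set_tracking_uri(cfg["mlflow_uri"])
mlflow.set_experiment(cfg["experiment_name"])
with mlflow.start_run(run_name=cfg["run_name"]):
    mlflow.log_metrics({
        "requests_per_second": len(prompts) / elapsed,
        "input_tokens_per_second": total_in / elapsed,
        "output_tokens_per_second": total_out / elapsed,
        "total_tokens_per_second": (total_in + total_out) / elapsed,
        "elapsed_time": elapsed,
        "total_input_tokens": total_in,
        "total_output_tokens": total_out,
    })
```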
+ +--- + +### Sample Recipe (Job Mode for Offline SGLang Inference) + +```json +{ + "recipe_id": "offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "recipe_container_command_args": [ + "/models/example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 +} +``` + +--- + +## Sample Config File (`example_sglang.yaml`) + +```yaml +benchmark_type: offline +offline_backend: sglang + +model_path: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B +trust_remote_code: true +conv_template: llama-2 + +input_len: 128 +output_len: 128 +num_prompts: 64 +max_seq_len: 4096 +max_batch_size: 8 +dtype: auto +temperature: 0.7 +top_p: 0.9 + +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: "sglang-bench-doc-test-new" +run_name: "llama3-8b-sglang-test" +``` + +--- + +## Metrics Logged + +- `requests_per_second` +- `input_tokens_per_second` +- `output_tokens_per_second` +- `total_tokens_per_second` +- `elapsed_time` +- `total_input_tokens` +- `total_output_tokens` + +If a dataset is provided: +- `accuracy` From 011f7fe29817e79e1c2f4477082f7a1c2160e0e4 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:18:56 -0400 Subject: [PATCH 2/9] removed edit line --- docs/sample_blueprints/offline-inference-infra/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index 803e977..d0bb6d0 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -1,5 +1,3 @@ -Here’s your rewritten `README.md`, styled similarly to the CPU inference blueprint but focused on **offline GPU inference using the SGLang backend**. - # Offline Inference Blueprint - Infra (SGLang + vLLM) This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. 
From 66ecb28faf285bbe2f06c7fe633dc504394a4134 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:22:37 -0400 Subject: [PATCH 3/9] online inference readme --- .../online-inference-infra/README.md | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docs/sample_blueprints/online-inference-infra/README.md diff --git a/docs/sample_blueprints/online-inference-infra/README.md b/docs/sample_blueprints/online-inference-infra/README.md new file mode 100644 index 0000000..8b1f4bf --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/README.md @@ -0,0 +1,104 @@ +# Online Inference Blueprint (LLMPerf) + +This blueprint benchmarks **online inference performance** of large language models using **LLMPerf**, a standardized benchmarking tool. It is designed to evaluate LLM APIs served via platforms such as OpenAI-compatible interfaces, including self-hosted LLM inference endpoints. + +This blueprint helps: +- Simulate real-time request load on a running model server +- Measure end-to-end latency, throughput, and completion performance +- Push results to MLflow for visibility and tracking + +--- + +## Pre-Filled Samples + +| Title | Description | +|----------------------------------------|-----------------------------------------------------------------------------| +|Online inference on LLaMA 3 using LLMPerf|Benchmark of meta/llama3-8b-instruct via a local OpenAI-compatible endpoint | + +These can be accessed directly from the OCI AI Blueprint portal. + +--- + +## Prerequisites + +Before running this blueprint: +- You **must have an inference server already running**, compatible with the OpenAI API format. +- Ensure the endpoint and model name match what’s defined in the config. + +--- + +## Supported Scenarios + +| Use Case | Description | +|-----------------------|-------------------------------------------------------| +| Local LLM APIs | Benchmark your own self-hosted models (e.g., vLLM) | +| Remote OpenAI API | Benchmark OpenAI deployments for throughput analysis | +| Multi-model endpoints | Test latency/throughput across different configurations | + +--- + +### Sample Recipe (Job Mode for Online Benchmarking) + +```json +{ + "recipe_id": "online_inference_benchmark", + "recipe_mode": "job", + "deployment_name": "Online Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 100, + "include": [ + "example_online.yaml" + ] + } + ], + "recipe_container_command_args": [ + "/models/example_online.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100 +} +``` + +--- + +## Sample Config File (`example_online.yaml`) + +```yaml +benchmark_type: online + +model: meta/llama3-8b-instruct +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /workspace/results_on +llm_api: openai +llm_api_key: dummy-key +llm_api_base: http://localhost:8001/v1 + +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=localhost +``` + +--- + +## Metrics 
Logged + +- `output_tokens_per_second` +- `requests_per_minute` +- `overall_output_throughput` +- All raw metrics from the `_summary.json` output of LLMPerf + +--- From 8ec78b4a67955e9a7ba1170c3a4ea0a2465f3864 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 23:16:40 -0400 Subject: [PATCH 4/9] better readme with extra pre-filled samples for offline inference --- .../offline-inference-infra/README.md | 192 +++++++++++++++--- 1 file changed, 165 insertions(+), 27 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index d0bb6d0..f67861b 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -14,6 +14,7 @@ This blueprint enables you to: | Title | Description | |------------------------------|-----------------------------------------------------------------------------| |Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | +|Offline inference with LLAMA 3- vLLM| Benchmarks Meta-Llama-3.1-8B model using vLLM on VM.GPU.A10.2 with 2 GPUs.| You can access these pre-filled samples from the OCI AI Blueprint portal. @@ -46,33 +47,41 @@ This blueprint supports benchmark execution via a job-mode recipe using a YAML c ```json { - "recipe_id": "offline_inference_sglang", - "recipe_mode": "job", - "deployment_name": "Offline Inference Benchmark", - "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", - "recipe_node_shape": "VM.GPU.A10.2", - "input_object_storage": [ - { - "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", - "mount_location": "/models", - "volume_size_in_gbs": 500, - "include": [ - "example_sglang.yaml", - "NousResearch/Meta-Llama-3.1-8B" - ] - } - ], - "recipe_container_command_args": [ - "/models/example_sglang.yaml" - ], - "recipe_replica_count": 1, - "recipe_container_port": "8000", - "recipe_nvidia_gpu_count": 2, - "recipe_node_pool_size": 1, - "recipe_node_boot_volume_size_in_gbs": 200, - "recipe_ephemeral_storage_size": 100, - "recipe_shared_memory_volume_size_limit_in_mb": 200 -} + "recipe_id": "offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/new_example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + ``` --- @@ -100,6 +109,43 @@ top_p: 0.9 mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 experiment_name: 
"sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" + + +save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json + +``` + +```yaml +benchmark_type: offline +model: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer: /models/NousResearch/Meta-Llama-3.1-8B + +input_len: 12 +output_len: 12 +num_prompts: 2 +seed: 42 +tensor_parallel_size: 8 + +# vLLM-specific +#quantization: awq +dtype: half +gpu_memory_utilization: 0.99 +num_scheduler_steps: 10 +device: cuda +enforce_eager: true +kv_cache_dtype: auto +enable_prefix_caching: true +distributed_executor_backend: mp + +# Output +#output_json: ./128_128.json + +# MLflow +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: test-bm-suite-doc +run_name: llama3-vllm-test +save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json + ``` --- @@ -116,3 +162,95 @@ run_name: "llama3-8b-sglang-test" If a dataset is provided: - `accuracy` + + +### Top-level Deployment Keys + +| Key | Description | +|-----|-------------| +| `recipe_id` | Identifier of the recipe to run; here, it's an offline SGLang benchmark job. | +| `recipe_mode` | Specifies this is a `job`, meaning it runs to completion and exits. | +| `deployment_name` | Human-readable name for the job. | +| `recipe_image_uri` | Docker image containing the benchmark code and dependencies. | +| `recipe_node_shape` | Shape of the VM or GPU node to run the job (e.g., VM.GPU.A10.2). | + +### Input Object Storage + +| Key | Description | +|-----|-------------| +| `input_object_storage` | List of inputs to mount from Object Storage. | +| `par` | Pre-Authenticated Request (PAR) link to a bucket/folder. | +| `mount_location` | Files are mounted to this path inside the container. | +| `volume_size_in_gbs` | Size of the mount volume. | +| `include` | Only these files/folders from the bucket are mounted (e.g., model + config). | + +### Output Object Storage + +| Key | Description | +|-----|-------------| +| `output_object_storage` | Where to store outputs like benchmark logs or results. | +| `bucket_name` | Name of the output bucket in OCI Object Storage. | +| `mount_location` | Mount point inside container where outputs are written. | +| `volume_size_in_gbs` | Size of this volume in GBs. | + +### Runtime & Infra Settings + +| Key | Description | +|-----|-------------| +| `recipe_container_command_args` | Path to the YAML config that defines benchmark parameters. | +| `recipe_replica_count` | Number of job replicas to run (usually 1 for inference). | +| `recipe_container_port` | Port (optional for offline mode; required if API is exposed). | +| `recipe_nvidia_gpu_count` | Number of GPUs allocated to this job. | +| `recipe_node_pool_size` | Number of nodes in the pool (1 means 1 VM). | +| `recipe_node_boot_volume_size_in_gbs` | Disk size for OS + dependencies. | +| `recipe_ephemeral_storage_size` | Local scratch space in GBs. | +| `recipe_shared_memory_volume_size_limit_in_mb` | Shared memory (used by some inference engines). | + +--- + +## **Sample Config File (`example_sglang.yaml`)** + +This file is consumed by the container during execution to configure the benchmark run. + +### Inference Setup + +| Key | Description | +|-----|-------------| +| `benchmark_type` | Set to `offline` to indicate local execution with no HTTP server. | +| `offline_backend` | Backend engine to use (`sglang` or `vllm`). | +| `model_path` | Path to the model directory (already mounted via Object Storage). 
| +| `tokenizer_path` | Path to the tokenizer (usually same as model path). | +| `trust_remote_code` | Enables loading models that require custom code (Hugging Face). | +| `conv_template` | Prompt formatting template to use (e.g., `llama-2`). | + +### Benchmark Parameters + +| Key | Description | +|-----|-------------| +| `input_len` | Number of tokens in the input prompt. | +| `output_len` | Number of tokens to generate. | +| `num_prompts` | Number of total prompts to run (e.g., 64 prompts x 128 output tokens). | +| `max_seq_len` | Max sequence length supported by the model (e.g., 4096). | +| `max_batch_size` | Max batch size per inference run (depends on GPU memory). | +| `dtype` | Precision (e.g., float16, bfloat16, auto). | + +### Sampling Settings + +| Key | Description | +|-----|-------------| +| `temperature` | Controls randomness in generation (lower = more deterministic). | +| `top_p` | Top-p sampling for diversity (0.9 keeps most probable tokens). | + +### MLflow Logging + +| Key | Description | +|-----|-------------| +| `mlflow_uri` | MLflow server to log performance metrics. | +| `experiment_name` | Experiment name to group runs in MLflow UI. | +| `run_name` | Custom name to identify this particular run. | + +### Output + +| Key | Description | +|-----|-------------| +| `save_metrics_path` | Path inside the container where metrics will be saved as JSON. | From 14f45a01f0163923c42c95628b74945a50dfc2f6 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 12:14:21 -0400 Subject: [PATCH 5/9] added sample json files --- .../offline-inference-infra/README.md | 1 + .../new_example_sglang.yaml | 24 +++++++++++++++ .../offline_vllm_example.yaml | 29 +++++++++++++++++++ .../online_example.yaml | 16 ++++++++++ 4 files changed, 70 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml create mode 100644 docs/sample_blueprints/online-inference-infra/online_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index f67861b..256f63d 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -41,6 +41,7 @@ Offline inference is ideal for: This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. +Notes : Make sure your output object storage is in the same tenancy as your stack. 
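Before submitting the deployment, it can also help to sanity-check the config file locally so the job does not fail after the node is provisioned. The snippet below is a hypothetical pre-flight check, not part of the blueprint; the required keys mirror the sample YAMLs and the key tables above.

```python
# Hypothetical pre-flight check for a benchmark config before deploying the job.
import sys

import yaml

# Keys common to both the SGLang and vLLM sample configs in this document.
REQUIRED = {
    "benchmark_type", "input_len", "output_len", "num_prompts",
    "mlflow_uri", "experiment_name", "run_name", "save_metrics_path",
}

def check(path: str) -> None:
    with open(path) as f:
        cfg = yaml.safe_load(f)
    missing = REQUIRED - cfg.keys()
    if missing:
        sys.exit(f"{path}: missing keys: {sorted(missing)}")
    if cfg["benchmark_type"] != "offline":
        sys.exit(f"{path}: expected benchmark_type: offline")
    # save_metrics_path should sit under the mount_location declared in
    # output_object_storage in the deployment JSON, or the results are lost.
    print(f"{path}: looks OK (backend: {cfg.get('offline_backend', 'unspecified')})")

if __name__ == "__main__":
    check(sys.argv[1])
```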
--- ### Sample Recipe (Job Mode for Offline SGLang Inference) diff --git a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml b/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml new file mode 100644 index 0000000..1649e7a --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml @@ -0,0 +1,24 @@ +benchmark_type: offline +offline_backend: sglang + +model_path: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B +trust_remote_code: true +conv_template: llama-2 + +input_len: 128 +output_len: 128 +num_prompts: 64 +max_seq_len: 4096 +max_batch_size: 8 +dtype: auto +temperature: 0.7 +top_p: 0.9 + +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: "sglang-bench-doc-test-new" +run_name: "llama3-8b-sglang-test" + + +save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json + diff --git a/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml b/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml new file mode 100644 index 0000000..7734c14 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml @@ -0,0 +1,29 @@ +benchmark_type: offline +model: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer: /models/NousResearch/Meta-Llama-3.1-8B + +input_len: 12 +output_len: 12 +num_prompts: 2 +seed: 42 +tensor_parallel_size: 8 + +# vLLM-specific +#quantization: awq +dtype: half +gpu_memory_utilization: 0.99 +num_scheduler_steps: 10 +device: cuda +enforce_eager: true +kv_cache_dtype: auto +enable_prefix_caching: true +distributed_executor_backend: mp + +# Output +#output_json: ./128_128.json + +# MLflow +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: test-bm-suite-doc +run_name: llama3-vllm-test +save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json diff --git a/docs/sample_blueprints/online-inference-infra/online_example.yaml b/docs/sample_blueprints/online-inference-infra/online_example.yaml new file mode 100644 index 0000000..d4d0fe3 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/online_example.yaml @@ -0,0 +1,16 @@ +benchmark_type: online +model: meta/llama3-8b-instruct +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /workspace/results_on +llm_api: openai +llm_api_key: dummy-key +llm_api_base: http://localhost:8001/v1 +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=localhost From da66e9502b5bae4f304f138ef1fd0545f501e5b1 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 12:18:56 -0400 Subject: [PATCH 6/9] added deployment json files --- .../offline_deployment_sglang.json | 36 +++++++++++++++++++ .../offline_deployment_vllm.json | 36 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json b/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json new file mode 100644 index 0000000..e3b988a --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json @@ -0,0 +1,36 @@ +{ + "recipe_id": 
"offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/new_example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + \ No newline at end of file diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json b/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json new file mode 100644 index 0000000..e920f38 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json @@ -0,0 +1,36 @@ +{ + "recipe_id": "offline_inference_vllm", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark vllm", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/offline_vllm_example.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + \ No newline at end of file From 13e490cb427daa2e03d243df7c1651eb043b9979 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 22:57:36 -0400 Subject: [PATCH 7/9] addressed PR comments --- .../offline-inference-infra/README.md | 55 ++++++++++++++++++- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index 256f63d..bb3b882 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -38,13 +38,20 @@ Offline inference is ideal for: --- ## Running the Benchmark +- Things need to run the benchmark + - Model checkpoints pre-downloaded and stored in an object storage. + - Make sure to get a PAR for the object storage where the models are saved. With listing, write and read perimissions + - A Bucket to save the outputs. 
This does not take a PAR, so should be a bucket in the same tenancy as to where you have your OCI blueprints stack + - Config `.yaml` file that has all the parameters required to run the benhcmark. This includes input_len, output_len, gpu_utilization value etc. + - Deployment `.json` to deploy your blueprint. + - Sample deployment and config files are provided below along with links. This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. Notes : Make sure your output object storage is in the same tenancy as your stack. --- -### Sample Recipe (Job Mode for Offline SGLang Inference) +### [Sample Blueprint (Job Mode for Offline SGLang Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) ```json { @@ -86,8 +93,50 @@ Notes : Make sure your output object storage is in the same tenancy as your stac ``` --- +### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) -## Sample Config File (`example_sglang.yaml`) +```json +{ + "recipe_id": "offline_inference_vllm", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark vllm", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "offline_vllm_example.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/offline_vllm_example.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + +``` + +--- + +## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml) ```yaml benchmark_type: offline @@ -115,7 +164,7 @@ run_name: "llama3-8b-sglang-test" save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json ``` - +## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) ```yaml benchmark_type: offline model: /models/NousResearch/Meta-Llama-3.1-8B From e701cc46b067f1ebc201e204187498e9a9c3cacf Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Thu, 24 Apr 2025 10:05:14 -0400 Subject: [PATCH 8/9] changed file names to indiciate the workload, addressed comments on the PR for offline inference --- .../offline-inference-infra/README.md | 14 ++++---- ...glang.yaml => offline_sglang_example.yaml} | 2 +- 
.../llama3_public_online.yaml | 17 +++++++++ .../online_deployment.json | 35 +++++++++++++++++++ .../online_example.yaml | 16 --------- 5 files changed, 60 insertions(+), 24 deletions(-) rename docs/sample_blueprints/offline-inference-infra/{new_example_sglang.yaml => offline_sglang_example.yaml} (86%) create mode 100644 docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml create mode 100644 docs/sample_blueprints/online-inference-infra/online_deployment.json delete mode 100644 docs/sample_blueprints/online-inference-infra/online_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index bb3b882..b7b91b4 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -66,7 +66,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "mount_location": "/models", "volume_size_in_gbs": 500, "include": [ - "new_example_sglang.yaml", + "offline_sglang_example.yaml", "NousResearch/Meta-Llama-3.1-8B" ] } @@ -74,12 +74,12 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "output_object_storage": [ { "bucket_name": "inference_output", - "mount_location": "/mlcommons_output", + "mount_location": "/benchmarking_output", "volume_size_in_gbs": 200 } ], "recipe_container_command_args": [ - "/models/new_example_sglang.yaml" + "/models/offline_sglang_example.yaml" ], "recipe_replica_count": 1, "recipe_container_port": "8000", @@ -93,7 +93,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac ``` --- -### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) +### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json) ```json { @@ -116,7 +116,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "output_object_storage": [ { "bucket_name": "inference_output", - "mount_location": "/mlcommons_output", + "mount_location": "/benchmarking_output", "volume_size_in_gbs": 200 } ], @@ -161,7 +161,7 @@ experiment_name: "sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" -save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json ``` ## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) @@ -194,7 +194,7 @@ distributed_executor_backend: mp mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 experiment_name: test-bm-suite-doc run_name: llama3-vllm-test -save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_vllm.json ``` diff --git a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml b/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml similarity index 86% rename from docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml rename to 
docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml index 1649e7a..a1ccf27 100644 --- a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml +++ b/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml @@ -20,5 +20,5 @@ experiment_name: "sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" -save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json diff --git a/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml b/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml new file mode 100644 index 0000000..967b5c8 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml @@ -0,0 +1,17 @@ +benchmark_type: online +model: /models/NousResearch/Meta-Llama-3.1-8B-Instruct # Updated model path +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /online_output +llm_api: openai +llm_api_key: dummy-key +llm_api_base: https://llama8bobjvllm.129-80-16-111.nip.io/v1 # Updated to HTTPS +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=public-endpoint +save_metrics_path: /online_output/benchmark_output_llama3_online_public.json \ No newline at end of file diff --git a/docs/sample_blueprints/online-inference-infra/online_deployment.json b/docs/sample_blueprints/online-inference-infra/online_deployment.json new file mode 100644 index 0000000..daeca81 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/online_deployment.json @@ -0,0 +1,35 @@ +{ + "recipe_id": "online_infernece_llmperf", + "recipe_mode": "job", + "deployment_name": "a1", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.Standard.E4.Flex", + "recipe_node_pool_size": 1, + "recipe_flex_shape_ocpu_count": 32, + "recipe_flex_shape_memory_size_in_gbs": 256, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 150, + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "llama3_public_online.yaml" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/online_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/llama3_public_online.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "5678" + } + \ No newline at end of file diff --git a/docs/sample_blueprints/online-inference-infra/online_example.yaml b/docs/sample_blueprints/online-inference-infra/online_example.yaml deleted file mode 100644 index d4d0fe3..0000000 --- a/docs/sample_blueprints/online-inference-infra/online_example.yaml +++ /dev/null @@ -1,16 +0,0 @@ -benchmark_type: online -model: meta/llama3-8b-instruct -input_len: 64 -output_len: 32 -max_requests: 5 -timeout: 300 -num_concurrent: 1 -results_dir: /workspace/results_on -llm_api: openai -llm_api_key: dummy-key -llm_api_base: http://localhost:8001/v1 -experiment_name: local-bench -run_name: llama3-test -mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 -llmperf_path: /opt/llmperf-src -metadata: test=localhost From 
7720110438e2f196819abdf9742ca7a3bd791ba8 Mon Sep 17 00:00:00 2001
From: ssraghavan-oci
Date: Thu, 24 Apr 2025 10:18:58 -0400
Subject: [PATCH 9/9] minor edit - offline readme

---
 docs/sample_blueprints/offline-inference-infra/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md
index b7b91b4..a45c426 100644
--- a/docs/sample_blueprints/offline-inference-infra/README.md
+++ b/docs/sample_blueprints/offline-inference-infra/README.md
@@ -136,7 +136,7 @@
 
 ---
 
-## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml)
+## [Sample Config File SGLang - 1 (`offline_sglang_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml)
 
 ```yaml
 benchmark_type: offline