From 1ef3eeb74818d92692d96e613d8beb9bda93ea63 Mon Sep 17 00:00:00 2001
From: Nathan Price
Date: Wed, 23 Apr 2025 21:28:04 -0500
Subject: [PATCH 1/5] Add support for priority

---
 src/model.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/model.py b/src/model.py
index d201244c..2c79aafa 100644
--- a/src/model.py
+++ b/src/model.py
@@ -128,6 +128,12 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
                 "data_type": "TYPE_BOOL",
                 "dims": [1],
                 "optional": True,
+            }, 
+            {
+                "name": "priority",
+                "data_type": "TYPE_INT32",
+                "dims": [1],
+                "optional": True,
             },
         ]
         # Outputs expected by the backend.
@@ -426,6 +432,7 @@ async def _generate(self, request):
             prepend_input,
             parameters,
             additional_outputs,
+            priority,
         ) = self._get_input_tensors(request)
 
         sampling_params = TritonSamplingParams.from_dict(parameters, self.logger)
@@ -438,7 +445,7 @@ async def _generate(self, request):
             lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path)
 
         response_iterator = self._llm_engine.generate(
-            prompt, sampling_params, request_id, lora_request=lora_request
+            prompt, sampling_params, request_id, lora_request=lora_request, priority=priority
         )
 
         request_output_state = {}

From e38a8189f046dd0e21cf5b3b54acea6d0fffb51b Mon Sep 17 00:00:00 2001
From: Nathan Price
Date: Wed, 23 Apr 2025 21:34:37 -0500
Subject: [PATCH 2/5] Updated to set default value on priority

---
 src/model.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/model.py b/src/model.py
index 2c79aafa..ca6683ef 100644
--- a/src/model.py
+++ b/src/model.py
@@ -444,6 +444,9 @@ async def _generate(self, request):
             lora_local_path = self.lora_repository[lora_name]
             lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path)
 
+        if not priority:
+            priority = 0
+
         response_iterator = self._llm_engine.generate(
             prompt, sampling_params, request_id, lora_request=lora_request, priority=priority
         )

From 800a5029770f715ed0c5451257fc26432785e6bb Mon Sep 17 00:00:00 2001
From: Nathan Price
Date: Wed, 23 Apr 2025 21:39:28 -0500
Subject: [PATCH 3/5] Updated README.md

---
 README.md | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/README.md b/README.md
index 00669888..433415cb 100644
--- a/README.md
+++ b/README.md
@@ -208,6 +208,43 @@ for more information.
 Additional vLLM outputs may be requested optionally on a per-request basis. See
 [this docs](docs/additional_outputs.md) for more information.
 
+## Priority Requests
+
+The vLLM backend supports priority-based request scheduling when the engine is configured with a scheduler policy set to `priority`. This allows you to prioritize certain requests over others, with lower priority numbers being processed first.
+
+### Configuration
+
+To enable priority scheduling, set the `scheduler_policy` parameter to `priority` in your `model.json`:
+
+```json
+{
+  "scheduler_policy": "priority",
+  // ... other engine args ...
+}
+```
+
+### Usage
+
+You can specify the priority of a request using the optional `priority` input tensor:
+
+```python
+inputs = []
+inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
+inputs[-1].set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))
+
+# Add priority input (optional)
+inputs.append(grpcclient.InferInput("priority", [1], "INT32"))
+inputs[-1].set_data_from_numpy(np.array([priority_value], dtype=np.int32))
+```
+
+If the priority input is not provided, it defaults to 0. Lower priority numbers are processed first.
+
+### Example Use Cases
+
+- Prioritize real-time user requests over background tasks
+- Implement different service level agreements (SLAs)
+- Manage system resources by processing high-priority requests first
+
 ## Triton Metrics
 
 Starting with the 24.08 release of Triton, users can now obtain specific
 vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics

From 920c693f521a3687f983872c08c4f9070057d0fe Mon Sep 17 00:00:00 2001
From: Nathan Price
Date: Wed, 23 Apr 2025 21:45:46 -0500
Subject: [PATCH 4/5] Ran pre-commit and black formatter

---
 src/model.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/model.py b/src/model.py
index ca6683ef..24c2f55f 100644
--- a/src/model.py
+++ b/src/model.py
@@ -128,7 +128,7 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
                 "data_type": "TYPE_BOOL",
                 "dims": [1],
                 "optional": True,
-            }, 
+            },
             {
                 "name": "priority",
                 "data_type": "TYPE_INT32",
@@ -448,7 +448,11 @@ async def _generate(self, request):
             priority = 0
 
         response_iterator = self._llm_engine.generate(
-            prompt, sampling_params, request_id, lora_request=lora_request, priority=priority
+            prompt,
+            sampling_params,
+            request_id,
+            lora_request=lora_request,
+            priority=priority,
         )
 
         request_output_state = {}

From 9324920de8eeadaf37d6d6b2ebab66c75dbcdae1 Mon Sep 17 00:00:00 2001
From: Nathan Price
Date: Thu, 24 Apr 2025 10:42:40 -0500
Subject: [PATCH 5/5] Updated to handle optional input

---
 src/model.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/model.py b/src/model.py
index 24c2f55f..5b1efd45 100644
--- a/src/model.py
+++ b/src/model.py
@@ -444,9 +444,6 @@ async def _generate(self, request):
             lora_local_path = self.lora_repository[lora_name]
             lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path)
 
-        if not priority:
-            priority = 0
-
         response_iterator = self._llm_engine.generate(
             prompt,
             sampling_params,
             request_id,
@@ -601,7 +598,14 @@ def _get_input_tensors(self, request):
                 tensor = False
             additional_outputs[tensor_name] = tensor
 
-        return prompt, stream, prepend_input, parameters, additional_outputs
+        # priority
+        priority = pb_utils.get_input_tensor_by_name(request, "priority")
+        if priority:
+            priority = int(priority.as_numpy()[0])
+        else:
+            priority = 0
+
+        return prompt, stream, prepend_input, parameters, additional_outputs, priority
 
     def _create_response(
         self, request_output_state, request_output, prepend_input, additional_outputs
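
For completeness, below is a minimal client-side sketch that exercises the new `priority` input end to end over the streaming gRPC API. It is only an illustration: the model name `vllm_model`, the endpoint `localhost:8001`, the prompts, request IDs, and priority values are placeholders, and the engine must be launched with `"scheduler_policy": "priority"` for the priorities to have any effect.

```python
import queue
from functools import partial

import numpy as np
import tritonclient.grpc as grpcclient


def build_inputs(prompt, priority):
    # Same input layout as the README snippet above, plus an explicit stream=False.
    inputs = [grpcclient.InferInput("text_input", [1], "BYTES")]
    inputs[-1].set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))
    inputs.append(grpcclient.InferInput("stream", [1], "BOOL"))
    inputs[-1].set_data_from_numpy(np.array([False], dtype=bool))
    inputs.append(grpcclient.InferInput("priority", [1], "INT32"))
    inputs[-1].set_data_from_numpy(np.array([priority], dtype=np.int32))
    return inputs


def collect(responses, result, error):
    # Stream callback: stash either the result or the error for later inspection.
    responses.put(error if error is not None else result)


responses = queue.Queue()
with grpcclient.InferenceServerClient("localhost:8001") as client:
    client.start_stream(callback=partial(collect, responses))
    # Lower numbers are scheduled first, so the "urgent" request should be
    # picked up before the "background" one when the engine is saturated.
    client.async_stream_infer("vllm_model", build_inputs("Summarize paged attention.", 0), request_id="urgent")
    client.async_stream_infer("vllm_model", build_inputs("Write a limerick about GPUs.", 10), request_id="background")
    client.stop_stream()  # waits for the remaining responses to arrive

while not responses.empty():
    item = responses.get()
    if isinstance(item, Exception):
        raise item
    print(item.get_response().id, item.as_numpy("text_output")[0].decode("utf-8", errors="replace"))
```

On an otherwise idle server both requests finish almost immediately, so the effect of the priority values is easiest to observe when the engine is already saturated with work.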