Add support for priority in vllm backend #88

Open · wants to merge 5 commits into main

37 changes: 37 additions & 0 deletions README.md
@@ -208,6 +208,43 @@ for more information.
Additional vLLM outputs may be requested optionally on a per-request basis. See
[these docs](docs/additional_outputs.md) for more information.

## Priority Requests

The vLLM backend supports priority-based request scheduling when the engine's scheduler policy is set to `priority`. This lets you prioritize some requests over others: requests with a lower priority value are processed first.

### Configuration

To enable priority scheduling, set the `scheduler_policy` parameter to `priority` in your `model.json`:

```json
{
"scheduler_policy": "priority",
// ... other engine args ...
}
```
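
For reference, a complete `model.json` might look like the sketch below. It is modeled on the sample configuration shipped with this backend; every field other than `scheduler_policy` is an ordinary vLLM engine argument, and the specific model and memory values are placeholders:

```json
{
    "model": "facebook/opt-125m",
    "disable_log_requests": true,
    "gpu_memory_utilization": 0.5,
    "enforce_eager": true,
    "scheduler_policy": "priority"
}
```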

### Usage

You can specify the priority of a request using the optional `priority` input tensor:

```python
import numpy as np
import tritonclient.grpc as grpcclient

inputs = []
inputs.append(grpcclient.InferInput("text_input", [1], "BYTES"))
inputs[-1].set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))

# Add priority input (optional); lower values are scheduled first
inputs.append(grpcclient.InferInput("priority", [1], "INT32"))
inputs[-1].set_data_from_numpy(np.array([priority_value], dtype=np.int32))
```

If the priority input is not provided, it defaults to 0. Lower priority numbers are processed first.
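
As a fuller illustration, the following is a minimal sketch of an end-to-end client that sends one latency-sensitive request at priority 0 and one background request at priority 10, so that the scheduler favors the former. It uses the `tritonclient` gRPC streaming API; the model name `vllm_model` and the server address are assumptions, so adjust them for your deployment:

```python
import queue

import numpy as np
import tritonclient.grpc as grpcclient

MODEL_NAME = "vllm_model"  # assumed deployment name; adjust to your model repository

results = queue.Queue()

def callback(result, error):
    # Streamed responses (or errors) land here; collect them for later inspection.
    results.put(error if error is not None else result)

def build_inputs(prompt, priority):
    text = grpcclient.InferInput("text_input", [1], "BYTES")
    text.set_data_from_numpy(np.array([prompt.encode("utf-8")], dtype=np.object_))
    stream = grpcclient.InferInput("stream", [1], "BOOL")
    stream.set_data_from_numpy(np.array([False], dtype=np.bool_))
    prio = grpcclient.InferInput("priority", [1], "INT32")
    prio.set_data_from_numpy(np.array([priority], dtype=np.int32))
    return [text, stream, prio]

client = grpcclient.InferenceServerClient("localhost:8001")
client.start_stream(callback=callback)

# Lower priority value = scheduled first.
client.async_stream_infer(MODEL_NAME, build_inputs("What is Triton?", 0), request_id="interactive")
client.async_stream_infer(MODEL_NAME, build_inputs("Summarize yesterday's logs.", 10), request_id="background")

client.stop_stream()  # blocks until responses for all enqueued requests have arrived

while not results.empty():
    item = results.get()
    if isinstance(item, Exception):
        raise item
    print(item.as_numpy("text_output")[0].decode("utf-8"))
```

The streaming API is used because this backend typically runs in decoupled mode; each request here still produces a single final response since `stream` is set to `False`.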

### Example Use Cases

- Prioritize real-time user requests over background tasks
- Implement different service level agreements (SLAs)
- Manage system resources by processing high-priority requests first

## Triton Metrics
Starting with the 24.08 release of Triton, users can now obtain specific
vLLM metrics by querying the Triton metrics endpoint (see complete vLLM metrics
22 changes: 20 additions & 2 deletions src/model.py
@@ -129,6 +129,12 @@ def _auto_complete_inputs_and_outputs(auto_complete_model_config):
"dims": [1],
"optional": True,
},
{
"name": "priority",
"data_type": "TYPE_INT32",
"dims": [1],
"optional": True,
},
]
# Outputs expected by the backend.
outputs = [
@@ -426,6 +432,7 @@ async def _generate(self, request):
prepend_input,
parameters,
additional_outputs,
priority,
) = self._get_input_tensors(request)

sampling_params = TritonSamplingParams.from_dict(parameters, self.logger)
@@ -438,7 +445,11 @@
lora_request = LoRARequest(lora_id, lora_int_id, lora_local_path)

response_iterator = self._llm_engine.generate(
prompt, sampling_params, request_id, lora_request=lora_request
prompt,
sampling_params,
request_id,
lora_request=lora_request,
priority=priority,
)

request_output_state = {}
@@ -587,7 +598,14 @@ def _get_input_tensors(self, request):
tensor = False
additional_outputs[tensor_name] = tensor

return prompt, stream, prepend_input, parameters, additional_outputs
# priority
priority = pb_utils.get_input_tensor_by_name(request, "priority")
if priority:
priority = int(priority.as_numpy()[0])
else:
priority = 0

return prompt, stream, prepend_input, parameters, additional_outputs, priority

def _create_response(
self, request_output_state, request_output, prepend_input, additional_outputs