16 changes: 12 additions & 4 deletions python/openai/tests/conftest.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -31,8 +31,10 @@
from fastapi.testclient import TestClient
from tests.utils import OpenAIServer, setup_fastapi_app, setup_server


### TEST ENVIRONMENT SETUP ###
LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
Contributor

Would be nice to clean up some of the if/else everywhere for the llmapi vs. trtllm handling, but hopefully we can consolidate the supported backend implementations once it's a bit more mature.
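A minimal sketch of what that consolidation could look like, purely illustrative (the fixture below is hypothetical and not part of this PR):

    import os

    import pytest


    @pytest.fixture(scope="session")
    def llmapi_setup() -> bool:
        # Read the LLMAPI_SETUP flag once per session so tests can branch or
        # skip on it instead of re-reading the environment in every if/else.
        return os.environ.get("LLMAPI_SETUP") == "1"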



def infer_test_environment():
    # Infer the test environment for simplicity in local dev/testing.
    try:
@@ -48,7 +50,10 @@ def infer_test_environment():
        import tensorrt_llm as _

        backend = "tensorrtllm"
        model = "tensorrt_llm_bls"
        if LLMAPI_SETUP:
            model = "tensorrt_llm"
        else:
            model = "tensorrt_llm_bls"
Contributor
@rmccorm4 Feb 27, 2025

Does it make sense to set backend = "llmapi" and use the backend fixture to check for llmapi in the tests, following similar patterns for vllm/trtllm in the existing tests?

ex:

    if backend != "tensorrtllm":
        pytest.skip(
            reason="Only used to test TRT-LLM-specific temperature behavior"
        )

Not sure what the config.pbtxt backend field will be for LLM API though, so let me know what you're thinking.

Contributor Author

The config.pbtxt backend field will be python for the LLM API backend.

I thought of the LLM API backend as one of the tensorrtllm backends, so I didn't add a separate "llmapi" backend fixture.

I think avoiding an env var would make more sense; however, right now the logic that determines which backend to use is to import the tensorrt_llm or vllm module, check for an ImportError, and then set the backend and model name accordingly.

    try:
        import tensorrt_llm as _
        backend = "tensorrtllm"
        model = "tensorrt_llm_bls"
        return backend, model
    except ImportError:
        print("No tensorrt_llm installation found.")

Both the general tensorrtllm and llmapi backend cases will import the tensorrt_llm module successfully, so I think some sort of flag would still be required to know whether this is a general trtllm model or an LLM API one.
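A minimal sketch of how that flag could be folded into the import-based detection; the model names and env var mirror this PR, but the standalone helper itself is illustrative:

    import os


    def infer_trtllm_model() -> str:
        # Both the classic TRT-LLM setup and the LLM API setup can import
        # tensorrt_llm, so an explicit flag decides which model to target.
        import tensorrt_llm as _  # raises ImportError when TRT-LLM is absent

        if os.environ.get("LLMAPI_SETUP") == "1":
            return "tensorrt_llm"  # single LLM API model (python backend)
        return "tensorrt_llm_bls"  # classic BLS entry point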

Contributor

> The config.pbtxt backend field will be python for the LLM API backend.

Do we plan to add a python-based backend like the vllm one?

Contributor Author

Do you mean making it backend: llmapi in the config.pbtxt? I don't think there's a plan to do so right now. I could see it being less confusing to have that once the pivot is more finalized; right now it's more like adding another Python model just like the original one.
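For reference, a minimal sketch of the config.pbtxt being discussed; only the backend field is confirmed in this thread, and the path/model name just follow the repo layout added by this PR:

    # tests/tensorrtllm_llmapi_models/tensorrt_llm/config.pbtxt (illustrative)
    name: "tensorrt_llm"
    backend: "python"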

Contributor

Oh right, because they're not using the runtime config.pbtxt feature currently; they just swapped the backend to python instead. Gotcha, makes sense.

Will the llmapi model.py replace the existing model.py in the trtllm backend? Hopefully we're not planning to support three implementations for now 😅

Contributor Author

Agreed, confirming with the TRT-LLM team on this.

        return backend, model
    except ImportError:
        print("No tensorrt_llm installation found.")
@@ -57,7 +62,10 @@ def infer_test_environment():


def infer_test_model_repository(backend):
    model_repository = str(Path(__file__).parent / f"{backend}_models")
    if LLMAPI_SETUP:
        model_repository = str(Path(__file__).parent / f"{backend}_llmapi_models")
    else:
        model_repository = str(Path(__file__).parent / f"{backend}_models")
    return model_repository


8 changes: 7 additions & 1 deletion python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -25,6 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import copy
import os
import subprocess
from pathlib import Path
from typing import List
@@ -368,6 +369,11 @@ def test_chat_completions_temperature_tensorrtllm(
        assert response1_text == response2_text
        assert response1_text != response3_text

    # TODO: Remove xfail for LLM API when it supports seed
    @pytest.mark.xfail(
        condition=os.getenv("LLMAPI_SETUP") == "1",
        reason="Didn't see any difference in responses with different seeds when using LLM API. Skipping for now.",
    )
    # Simple tests to verify random seed roughly behaves as expected
    def test_chat_completions_seed(self, client, model: str, messages: List[dict]):
        responses = []
10 changes: 8 additions & 2 deletions python/openai/tests/test_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -25,6 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import copy
import os

import pytest

@@ -238,6 +239,11 @@ def test_completions_temperature_tensorrtllm(
        assert response1_text == response2_text
        assert response1_text != response3_text

    # TODO: Remove xfail for LLM API when it supports seed
    @pytest.mark.xfail(
        condition=os.getenv("LLMAPI_SETUP") == "1",
        reason="Didn't see any difference in responses with different seeds when using LLM API. Skipping for now.",
    )
    # Simple tests to verify seed roughly behaves as expected
    def test_completions_seed(self, client, model: str, prompt: str):
        responses = []
@@ -258,7 +264,7 @@ def test_completions_seed(self, client, model: str, prompt: str):
                json=payload1,
            )
        )
        # Third response should differ with different temperature in payload
        # Third response should differ with different seed in payload
        responses.append(
            client.post(
                "/v1/completions",
28 changes: 21 additions & 7 deletions python/openai/tests/test_openai_client.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -39,9 +39,16 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
        models = list(client.models.list())
        print(f"Models: {models}")
        if backend == "tensorrtllm":
            # tensorrt_llm_bls +
            # preprocess -> tensorrt_llm -> postprocess
            assert len(models) == 4
            import os

            LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
            if LLMAPI_SETUP:
                # LLM API setup only has the tensorrt_llm model
                assert len(models) == 1
            else:
                # tensorrt_llm_bls +
                # preprocess -> tensorrt_llm -> postprocess
                assert len(models) == 4
        elif backend == "vllm":
            assert len(models) == 1
        else:
@@ -105,9 +112,16 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s
        models = [model async for model in async_models]
        print(f"Models: {models}")
        if backend == "tensorrtllm":
            # tensorrt_llm_bls +
            # preprocess -> tensorrt_llm -> postprocess
            assert len(models) == 4
            import os

            LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
            if LLMAPI_SETUP:
                # LLM API setup only has the tensorrt_llm model
                assert len(models) == 1
            else:
                # tensorrt_llm_bls +
                # preprocess -> tensorrt_llm -> postprocess
                assert len(models) == 4
        elif backend == "vllm":
            assert len(models) == 1
        else:
44 changes: 39 additions & 5 deletions qa/L0_openai/test.sh
@@ -85,6 +85,12 @@ function prepare_tensorrtllm() {
    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
    python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True

    # Prepare LLM API setup
    LLMAPI_MODEL_REPO="tests/tensorrtllm_llmapi_models"
    mkdir -p ${LLMAPI_MODEL_REPO}
    cp -r /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}"
    sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json
}

function pre_test() {
@@ -103,11 +109,39 @@ function run_test() {

    # Capture error code without exiting to allow log collection
    set +e
    pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG}
    if [ $? -ne 0 ]; then
        cat ${TEST_LOG}
        echo -e "\n***\n*** Test Failed\n***"
        RET=1

    if [ "${IMAGE_KIND}" == "TRTLLM" ]; then
        echo "Running TensorRT-LLM tests..."

        # First run with default model setup
        echo "Running tests with default model setup..."
        pytest -s -v --junitxml=test_openai_default.xml tests/ 2>&1 > test_openai_default.log
        DEFAULT_RESULT=$?

        # Then run with LLM API setup
        echo "Running tests with LLM API setup..."
        LLMAPI_SETUP=1 pytest -s -v --junitxml=test_openai_llmapi.xml tests/ 2>&1 > test_openai_llmapi.log
        LLMAPI_RESULT=$?

        # Combine results
        if [ $DEFAULT_RESULT -ne 0 ]; then
            cat test_openai_default.log
            echo -e "\n***\n*** Test Failed with default model setup\n***"
            RET=1
        fi
        if [ $LLMAPI_RESULT -ne 0 ]; then
            cat test_openai_llmapi.log
            echo -e "\n***\n*** Test Failed with LLM API setup\n***"
            RET=1
        fi
    else
        echo "Running vLLM tests..."
        pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG}
        if [ $? -ne 0 ]; then
            cat ${TEST_LOG}
            echo -e "\n***\n*** Test Failed\n***"
            RET=1
        fi
    fi
    set -e
