26 changes: 21 additions & 5 deletions python/openai/tests/conftest.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -32,7 +32,6 @@
from tests.utils import OpenAIServer, setup_fastapi_app, setup_server


### TEST ENVIRONMENT SETUP ###
def infer_test_environment():
# Infer the test environment for simplicity in local dev/testing.
try:
@@ -47,8 +46,15 @@ def infer_test_environment():
try:
import tensorrt_llm as _

backend = "tensorrtllm"
model = "tensorrt_llm_bls"
# TODO: Refactor away from environment variables
LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", "0") != "0"

if LLMAPI_SETUP:
backend = "llmapi"
model = "tensorrt_llm"
else:
backend = "tensorrtllm"
model = "tensorrt_llm_bls"
return backend, model
except ImportError:
print("No tensorrt_llm installation found.")
@@ -84,13 +90,23 @@ def infer_test_model_repository(backend):
# only once for all the tests below.
@pytest.fixture(scope="module")
def server():
# TODO: The tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for the OpenAI server.
# In the future, if the backends are consolidated, this check can be updated or removed.
# key: the TEST_BACKEND value
# value: the corresponding backend flag for OpenAI server
backend_map = {
"tensorrtllm": "tensorrtllm",
"llmapi": "tensorrtllm",
"vllm": "vllm",
}

args = [
"--model-repository",
TEST_MODEL_REPOSITORY,
"--tokenizer",
TEST_TOKENIZER,
"--backend",
TEST_BACKEND,
backend_map[TEST_BACKEND],
]
# TODO: Incorporate kserve frontend binding smoke tests to catch any
# breakage with default values or slight cli arg variations
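
For context, the environment-driven selection added above can be summarized by the following minimal sketch (the helper name, the standalone form, and the strict "0"/"1" parsing are illustrative, not part of the change):

import os

# Backend flags the OpenAI frontend accepts; the LLM API test setup still
# drives the frontend with the "tensorrtllm" flag.
BACKEND_FLAG = {"tensorrtllm": "tensorrtllm", "llmapi": "tensorrtllm", "vllm": "vllm"}


def infer_trtllm_test_setup():
    # LLMAPI_SETUP=1 selects the single-model LLM API repository; anything
    # else keeps the default BLS model setup.
    use_llmapi = os.environ.get("LLMAPI_SETUP", "0") != "0"
    backend = "llmapi" if use_llmapi else "tensorrtllm"
    model = "tensorrt_llm" if use_llmapi else "tensorrt_llm_bls"
    return backend, model, BACKEND_FLAG[backend]
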
9 changes: 7 additions & 2 deletions python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -310,7 +310,7 @@ def test_chat_completions_temperature_vllm(
def test_chat_completions_temperature_tensorrtllm(
self, client, backend: str, model: str, messages: List[dict]
):
if backend != "tensorrtllm":
if backend != "tensorrtllm" and backend != "llmapi":
pytest.skip(
reason="Only used to test TRT-LLM-specific temperature behavior"
)
@@ -368,6 +368,11 @@ def test_chat_completions_temperature_tensorrtllm(
assert response1_text == response2_text
assert response1_text != response3_text

# TODO: Remove this xfail once seed support is verified for the LLM API workflow.
@pytest.mark.xfail(
condition=lambda backend: backend == "llmapi",
reason="Seed parameter is not supported in LLM API PyTorch workflow yet",
)
# Simple tests to verify random seed roughly behaves as expected
def test_chat_completions_seed(self, client, model: str, messages: List[dict]):
responses = []
13 changes: 9 additions & 4 deletions python/openai/tests/test_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -191,8 +191,8 @@ def test_completions_temperature_vllm(
def test_completions_temperature_tensorrtllm(
self, client, backend: str, model: str, prompt: str
):
if backend != "tensorrtllm":
pytest.skip(reason="Only used to test vLLM-specific temperature behavior")
if backend != "tensorrtllm" and backend != "llmapi":
pytest.skip(reason="Only used to test TRTLLM-specific temperature behavior")

responses = []
payload1 = {
@@ -238,6 +238,11 @@ def test_completions_temperature_tensorrtllm(
assert response1_text == response2_text
assert response1_text != response3_text

# TODO: Remove this xfail once seed support is verified for the LLM API workflow.
@pytest.mark.xfail(
condition=lambda backend: backend == "llmapi",
reason="Seed parameter is not supported in LLM API PyTorch workflow yet",
)
# Simple tests to verify seed roughly behaves as expected
def test_completions_seed(self, client, model: str, prompt: str):
responses = []
@@ -258,7 +263,7 @@ def test_completions_seed(self, client, model: str, prompt: str):
json=payload1,
)
)
# Third response should differ with different temperature in payload
# Third response should differ with different seed in payload
responses.append(
client.post(
"/v1/completions",
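
For reference, the determinism checks in this file reduce to request payloads along these lines (a sketch only; the prompt, model name, and max_tokens values are placeholders):

# Request shapes exercised by the tests above (values are placeholders).
greedy_payload = {"model": "tensorrt_llm", "prompt": "Hello, my name is", "temperature": 0.0, "max_tokens": 16}
seeded_payload = {"model": "tensorrt_llm", "prompt": "Hello, my name is", "seed": 42, "max_tokens": 16}
# Posting greedy_payload (or the same seeded_payload) twice to /v1/completions
# should return identical "choices"[0]["text"]; changing only the seed should not.
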
10 changes: 8 additions & 2 deletions python/openai/tests/test_openai_client.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -42,6 +42,9 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
# tensorrt_llm_bls +
# preprocess -> tensorrt_llm -> postprocess
assert len(models) == 4
elif backend == "llmapi":
# Only has one tensorrt_llm model.
assert len(models) == 1
elif backend == "vllm":
assert len(models) == 1
else:
@@ -75,7 +78,7 @@ def test_openai_client_chat_completion(
def test_openai_client_completion_echo(
self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str
):
if backend == "tensorrtllm":
if backend == "tensorrtllm" or backend == "llmapi":
pytest.skip(
reason="TRT-LLM backend currently only supports setting this parameter at model load time",
)
@@ -108,6 +111,9 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: str):
# tensorrt_llm_bls +
# preprocess -> tensorrt_llm -> postprocess
assert len(models) == 4
elif backend == "llmapi":
# Only has one tensorrt_llm model.
assert len(models) == 1
elif backend == "vllm":
assert len(models) == 1
else:
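
As a usage sketch of the model-count expectations above (the base URL and API key are assumptions for a locally running OpenAI frontend):

import openai

# Assumed local endpoint for a running frontend; adjust host/port as needed.
client = openai.OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")
models = list(client.models.list())
# The BLS TRT-LLM repository exposes four models (bls + preprocess ->
# tensorrt_llm -> postprocess); the LLM API and vLLM repositories expose one.
print([m.id for m in models])
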
13 changes: 11 additions & 2 deletions python/openai/tests/utils.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -53,8 +53,17 @@ def setup_server(model_repository: str):


def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
# TODO: The tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for the OpenAI server.
# In the future if the backends are consolidated, this check can be updated or removed.
# key: the backend value
# value: the corresponding backend flag for OpenAI server
backend_map = {
"tensorrtllm": "tensorrtllm",
"llmapi": "tensorrtllm",
"vllm": "vllm",
}
engine: TritonLLMEngine = TritonLLMEngine(
server=server, tokenizer=tokenizer, backend=backend
server=server, tokenizer=tokenizer, backend=backend_map[backend]
)
frontend: FastApiFrontend = FastApiFrontend(engine=engine)
return frontend.app
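
As a usage sketch of the mapping above, an "llmapi" model repository is still served with the "tensorrtllm" frontend flag (the repository path and tokenizer are taken from the test script changes below; the wiring itself is illustrative):

from tests.utils import setup_fastapi_app, setup_server

# Hypothetical wiring for an LLM API repository; internally the engine is
# created with backend="tensorrtllm" via backend_map.
server = setup_server("tests/llmapi_models")
app = setup_fastapi_app(
    tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", server=server, backend="llmapi"
)
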
44 changes: 39 additions & 5 deletions qa/L0_openai/test.sh
@@ -85,6 +85,12 @@ function prepare_tensorrtllm() {
python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True

# Prepare LLM API setup
LLMAPI_MODEL_REPO="tests/llmapi_models"
mkdir -p ${LLMAPI_MODEL_REPO}
cp -r /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}"
sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json
}

function pre_test() {
@@ -103,11 +109,39 @@ function run_test() {

# Capture error code without exiting to allow log collection
set +e
pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG}
if [ $? -ne 0 ]; then
cat ${TEST_LOG}
echo -e "\n***\n*** Test Failed\n***"
RET=1

if [ "${IMAGE_KIND}" == "TRTLLM" ]; then
echo "Running TensorRT-LLM tests..."

# First run with default model setup
echo "Running tests with default model setup..."
pytest -s -v --junitxml=test_openai_default.xml tests/ > test_openai_default.log 2>&1
DEFAULT_RESULT=$?

# Then run with LLM API setup
echo "Running tests with LLM API setup..."
LLMAPI_SETUP=1 pytest -s -v --junitxml=test_openai_llmapi.xml tests/ > test_openai_llmapi.log 2>&1
LLMAPI_RESULT=$?

# Combine results
if [ $DEFAULT_RESULT -ne 0 ]; then
cat test_openai_default.log
echo -e "\n***\n*** Test Failed with default model setup\n***"
RET=1
fi
if [ $LLMAPI_RESULT -ne 0 ]; then
cat test_openai_llmapi.log
echo -e "\n***\n*** Test Failed with LLM API setup\n***"
RET=1
fi
else
echo "Running vLLM tests..."
pytest -s -v --junitxml=test_openai.xml tests/ > ${TEST_LOG} 2>&1
if [ $? -ne 0 ]; then
cat ${TEST_LOG}
echo -e "\n***\n*** Test Failed\n***"
RET=1
fi
fi
set -e
