From fe8081f679bc7b3a7acec500f06e5f024114dcae Mon Sep 17 00:00:00 2001 From: krishung5 Date: Fri, 21 Feb 2025 03:03:53 -0800 Subject: [PATCH 1/6] Add openai testing for LLM API --- python/openai/tests/conftest.py | 16 ++++++-- python/openai/tests/test_openai_client.py | 28 +++++++++---- qa/L0_openai/test.sh | 49 ++++++++++++++++++++--- 3 files changed, 76 insertions(+), 17 deletions(-) diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py index 9ea9a5634e..c567a82d65 100644 --- a/python/openai/tests/conftest.py +++ b/python/openai/tests/conftest.py @@ -1,4 +1,4 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -31,8 +31,10 @@ from fastapi.testclient import TestClient from tests.utils import OpenAIServer, setup_fastapi_app, setup_server - ### TEST ENVIRONMENT SETUP ### +LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) + + def infer_test_environment(): # Infer the test environment for simplicity in local dev/testing. try: @@ -48,7 +50,10 @@ def infer_test_environment(): import tensorrt_llm as _ backend = "tensorrtllm" - model = "tensorrt_llm_bls" + if LLMAPI_SETUP: + model = "tensorrt_llm" + else: + model = "tensorrt_llm_bls" return backend, model except ImportError: print("No tensorrt_llm installation found.") @@ -57,7 +62,10 @@ def infer_test_environment(): def infer_test_model_repository(backend): - model_repository = str(Path(__file__).parent / f"{backend}_models") + if LLMAPI_SETUP: + model_repository = str(Path(__file__).parent / f"{backend}_llmapi_models") + else: + model_repository = str(Path(__file__).parent / f"{backend}_models") return model_repository diff --git a/python/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py index 6f1b456ab4..70011d9c1b 100644 --- a/python/openai/tests/test_openai_client.py +++ b/python/openai/tests/test_openai_client.py @@ -1,4 +1,4 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -39,9 +39,16 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str): models = list(client.models.list()) print(f"Models: {models}") if backend == "tensorrtllm": - # tensorrt_llm_bls + - # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 4 + import os + + LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) + if LLMAPI_SETUP: + # LLM API setup only has the tensorrt_llm model + assert len(models) == 1 + else: + # tensorrt_llm_bls + + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 4 elif backend == "vllm": assert len(models) == 1 else: @@ -105,9 +112,16 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s models = [model async for model in async_models] print(f"Models: {models}") if backend == "tensorrtllm": - # tensorrt_llm_bls + - # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 4 + import os + + LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) + if LLMAPI_SETUP: + # LLM API setup only has the tensorrt_llm model + assert len(models) == 1 + else: + # tensorrt_llm_bls + + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 4 elif backend == "vllm": assert len(models) == 1 else: diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index 0921bce98e..e56bca749c 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -85,6 +85,14 @@ function prepare_tensorrtllm() { python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1 python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32 python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True + + # Prepare LLM API setup + LLMAPI_MODEL_REPO="tests/tensorrtllm_llmapi_models" + mkdir -p ${LLMAPI_MODEL_REPO} + cp /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}" -r + + # Modify the json file model.json, from "model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0", to "model": "meta-llama/Meta-Llama-3.1-8B-Instruct" + sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json } function pre_test() { @@ -103,16 +111,45 @@ function run_test() { # Capture error code without exiting to allow log collection set +e - pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG} - if [ $? -ne 0 ]; then - cat ${TEST_LOG} - echo -e "\n***\n*** Test Failed\n***" - RET=1 + + if [ "${IMAGE_KIND}" == "TRTLLM" ]; then + echo "Running TensorRT-LLM tests..." + + # First run with default model setup + echo "Running tests with default model setup..." + pytest -s -v --junitxml=test_openai_default.xml tests/ 2>&1 > test_openai_default.log + DEFAULT_RESULT=$? + + # Then run with LLM API setup + echo "Running tests with LLM API setup..." + LLMAPI_SETUP=1 pytest -s -v --junitxml=test_openai_llmapi.xml tests/ 2>&1 > test_openai_llmapi.log + LLMAPI_RESULT=$? 
+ + # Combine results + if [ $DEFAULT_RESULT -ne 0 ]; then + cat test_openai_default.log + echo -e "\n***\n*** Test Failed with default model setup\n***" + RET=1 + fi + if [ $LLMAPI_RESULT -ne 0 ]; then + cat test_openai_llmapi.log + echo -e "\n***\n*** Test Failed with LLM API setup\n***" + RET=1 + fi + else + echo "Running vLLM tests..." + pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG} + if [ $? -ne 0 ]; then + cat ${TEST_LOG} + echo -e "\n***\n*** Test Failed\n***" + RET=1 + fi fi - set -e # Collect logs for error analysis when needed cp *.xml *.log ../../../ + + set -e popd } From be339c5754c84705bec760b082f738127a98ea2d Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 27 Feb 2025 01:03:04 -0800 Subject: [PATCH 2/6] Skip seed tests for LLM API --- python/openai/tests/test_chat_completions.py | 8 +++++++- python/openai/tests/test_completions.py | 10 ++++++++-- qa/L0_openai/test.sh | 5 +---- 3 files changed, 16 insertions(+), 7 deletions(-) diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 401601c526..8a83a1d025 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -1,4 +1,4 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import copy +import os import subprocess from pathlib import Path from typing import List @@ -368,6 +369,11 @@ def test_chat_completions_temperature_tensorrtllm( assert response1_text == response2_text assert response1_text != response3_text + # TODO: Remove xfail for LLM API when it supports seed + @pytest.mark.xfail( + condition=os.getenv("LLMAPI_SETUP") == "1", + reason="Didn't see any difference in responses with different seeds when using LLM API. Skipping for now.", + ) # Simple tests to verify random seed roughly behaves as expected def test_chat_completions_seed(self, client, model: str, messages: List[dict]): responses = [] diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index d89ff4701e..327042ff6b 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -1,4 +1,4 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -25,6 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import copy +import os import pytest @@ -238,6 +239,11 @@ def test_completions_temperature_tensorrtllm( assert response1_text == response2_text assert response1_text != response3_text + # TODO: Remove xfail for LLM API when it supports seed + @pytest.mark.xfail( + condition=os.getenv("LLMAPI_SETUP") == "1", + reason="Didn't see any difference in responses with different seeds when using LLM API. 
Skipping for now.", + ) # Simple tests to verify seed roughly behaves as expected def test_completions_seed(self, client, model: str, prompt: str): responses = [] @@ -258,7 +264,7 @@ def test_completions_seed(self, client, model: str, prompt: str): json=payload1, ) ) - # Third response should differ with different temperature in payload + # Third response should differ with different seed in payload responses.append( client.post( "/v1/completions", diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index e56bca749c..bbe204408d 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -90,8 +90,6 @@ function prepare_tensorrtllm() { LLMAPI_MODEL_REPO="tests/tensorrtllm_llmapi_models" mkdir -p ${LLMAPI_MODEL_REPO} cp /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}" -r - - # Modify the json file model.json, from "model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0", to "model": "meta-llama/Meta-Llama-3.1-8B-Instruct" sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json } @@ -145,11 +143,10 @@ function run_test() { RET=1 fi fi + set -e # Collect logs for error analysis when needed cp *.xml *.log ../../../ - - set -e popd } From 2f5f06e84a5486eaa3bbc88a5ae406b72479dbef Mon Sep 17 00:00:00 2001 From: krishung5 Date: Thu, 27 Feb 2025 11:00:28 -0800 Subject: [PATCH 3/6] Rewording --- python/openai/tests/test_chat_completions.py | 4 ++-- python/openai/tests/test_completions.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 8a83a1d025..09f8ed621e 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -369,10 +369,10 @@ def test_chat_completions_temperature_tensorrtllm( assert response1_text == response2_text assert response1_text != response3_text - # TODO: Remove xfail for LLM API when it supports seed + # TODO: Remove xfail for LLM API when it's verified. @pytest.mark.xfail( condition=os.getenv("LLMAPI_SETUP") == "1", - reason="Didn't see any difference in responses with different seeds when using LLM API. Skipping for now.", + reason="Seed parameter support to be verified for LLM API", ) # Simple tests to verify random seed roughly behaves as expected def test_chat_completions_seed(self, client, model: str, messages: List[dict]): diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index 327042ff6b..a767e723b6 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -239,10 +239,10 @@ def test_completions_temperature_tensorrtllm( assert response1_text == response2_text assert response1_text != response3_text - # TODO: Remove xfail for LLM API when it supports seed + # TODO: Remove xfail for LLM API when it's verified. @pytest.mark.xfail( condition=os.getenv("LLMAPI_SETUP") == "1", - reason="Didn't see any difference in responses with different seeds when using LLM API. 
Skipping for now.", + reason="Seed parameter support to be verified for LLM API", ) # Simple tests to verify seed roughly behaves as expected def test_completions_seed(self, client, model: str, prompt: str): From 70268030d6c02c92285421769848c3ed44477dcf Mon Sep 17 00:00:00 2001 From: krishung5 Date: Fri, 28 Feb 2025 15:15:07 -0800 Subject: [PATCH 4/6] Use backend fixture for llmapi --- python/openai/tests/conftest.py | 26 +++++++++------ python/openai/tests/test_chat_completions.py | 4 +-- python/openai/tests/test_completions.py | 6 ++-- python/openai/tests/test_openai_client.py | 34 ++++++++------------ python/openai/tests/utils.py | 13 ++++++-- qa/L0_openai/test.sh | 2 +- 6 files changed, 47 insertions(+), 38 deletions(-) diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py index c567a82d65..8460889de2 100644 --- a/python/openai/tests/conftest.py +++ b/python/openai/tests/conftest.py @@ -31,9 +31,6 @@ from fastapi.testclient import TestClient from tests.utils import OpenAIServer, setup_fastapi_app, setup_server -### TEST ENVIRONMENT SETUP ### -LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) - def infer_test_environment(): # Infer the test environment for simplicity in local dev/testing. @@ -49,10 +46,14 @@ def infer_test_environment(): try: import tensorrt_llm as _ - backend = "tensorrtllm" + # TODO: Refactor away from environment variables + LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) + if LLMAPI_SETUP: + backend = "llmapi" model = "tensorrt_llm" else: + backend = "tensorrtllm" model = "tensorrt_llm_bls" return backend, model except ImportError: @@ -62,10 +63,7 @@ def infer_test_environment(): def infer_test_model_repository(backend): - if LLMAPI_SETUP: - model_repository = str(Path(__file__).parent / f"{backend}_llmapi_models") - else: - model_repository = str(Path(__file__).parent / f"{backend}_models") + model_repository = str(Path(__file__).parent / f"{backend}_models") return model_repository @@ -92,13 +90,23 @@ def infer_test_model_repository(backend): # only once for all the tests below. @pytest.fixture(scope="module") def server(): + # TODO: tensorrllm and llmapi backends both use "tensorrtllm" as the backend flag for OpenAI server. + # In the future if the backend are consolidated, this check can be updated or removed. + # key: the TEST_BACKEND value + # value: the corresponding backend flag for OpenAI server + backend_map = { + "tensorrtllm": "tensorrtllm", + "llmapi": "tensorrtllm", + "vllm": "vllm", + } + args = [ "--model-repository", TEST_MODEL_REPOSITORY, "--tokenizer", TEST_TOKENIZER, "--backend", - TEST_BACKEND, + backend_map[TEST_BACKEND], ] # TODO: Incorporate kserve frontend binding smoke tests to catch any # breakage with default values or slight cli arg variations diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 09f8ed621e..6c0b39c1fb 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -311,7 +311,7 @@ def test_chat_completions_temperature_vllm( def test_chat_completions_temperature_tensorrtllm( self, client, backend: str, model: str, messages: List[dict] ): - if backend != "tensorrtllm": + if backend != "tensorrtllm" and backend != "llmapi": pytest.skip( reason="Only used to test TRT-LLM-specific temperature behavior" ) @@ -371,7 +371,7 @@ def test_chat_completions_temperature_tensorrtllm( # TODO: Remove xfail for LLM API when it's verified. 
@pytest.mark.xfail( - condition=os.getenv("LLMAPI_SETUP") == "1", + condition=lambda backend: backend == "llmapi", reason="Seed parameter support to be verified for LLM API", ) # Simple tests to verify random seed roughly behaves as expected diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index a767e723b6..5e76b9368b 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -192,8 +192,8 @@ def test_completions_temperature_vllm( def test_completions_temperature_tensorrtllm( self, client, backend: str, model: str, prompt: str ): - if backend != "tensorrtllm": - pytest.skip(reason="Only used to test vLLM-specific temperature behavior") + if backend != "tensorrtllm" and backend != "llmapi": + pytest.skip(reason="Only used to test TRTLLM-specific temperature behavior") responses = [] payload1 = { @@ -241,7 +241,7 @@ def test_completions_temperature_tensorrtllm( # TODO: Remove xfail for LLM API when it's verified. @pytest.mark.xfail( - condition=os.getenv("LLMAPI_SETUP") == "1", + condition=lambda backend: backend == "llmapi", reason="Seed parameter support to be verified for LLM API", ) # Simple tests to verify seed roughly behaves as expected diff --git a/python/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py index 70011d9c1b..6c083d91ee 100644 --- a/python/openai/tests/test_openai_client.py +++ b/python/openai/tests/test_openai_client.py @@ -39,16 +39,12 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str): models = list(client.models.list()) print(f"Models: {models}") if backend == "tensorrtllm": - import os - - LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) - if LLMAPI_SETUP: - # LLM API setup only has the tensorrt_llm model - assert len(models) == 1 - else: - # tensorrt_llm_bls + - # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 4 + # tensorrt_llm_bls + + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 4 + elif backend == "llmapi": + # Only has one tensorrt_llm model. + assert len(models) == 1 elif backend == "vllm": assert len(models) == 1 else: @@ -82,7 +78,7 @@ def test_openai_client_chat_completion( def test_openai_client_completion_echo( self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str ): - if backend == "tensorrtllm": + if backend == "tensorrtllm" or backend == "llmapi": pytest.skip( reason="TRT-LLM backend currently only supports setting this parameter at model load time", ) @@ -112,16 +108,12 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s models = [model async for model in async_models] print(f"Models: {models}") if backend == "tensorrtllm": - import os - - LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0) - if LLMAPI_SETUP: - # LLM API setup only has the tensorrt_llm model - assert len(models) == 1 - else: - # tensorrt_llm_bls + - # preprocess -> tensorrt_llm -> postprocess - assert len(models) == 4 + # tensorrt_llm_bls + + # preprocess -> tensorrt_llm -> postprocess + assert len(models) == 4 + elif backend == "llmapi": + # Only has one tensorrt_llm model. + assert len(models) == 1 elif backend == "vllm": assert len(models) == 1 else: diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py index fdffcc5ea9..a2b655d86d 100644 --- a/python/openai/tests/utils.py +++ b/python/openai/tests/utils.py @@ -1,4 +1,4 @@ -# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions @@ -53,8 +53,17 @@ def setup_server(model_repository: str): def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str): + # TODO: tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for OpenAI server. + # In the future if the backends are consolidated, this check can be updated or removed. + # key: the backend value + # value: the corresponding backend flag for OpenAI server + backend_map = { + "tensorrtllm": "tensorrtllm", + "llmapi": "tensorrtllm", + "vllm": "vllm", + } engine: TritonLLMEngine = TritonLLMEngine( - server=server, tokenizer=tokenizer, backend=backend + server=server, tokenizer=tokenizer, backend=backend_map[backend] ) frontend: FastApiFrontend = FastApiFrontend(engine=engine) return frontend.app diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh index bbe204408d..7a86a6db97 100755 --- a/qa/L0_openai/test.sh +++ b/qa/L0_openai/test.sh @@ -87,7 +87,7 @@ function prepare_tensorrtllm() { python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True # Prepare LLM API setup - LLMAPI_MODEL_REPO="tests/tensorrtllm_llmapi_models" + LLMAPI_MODEL_REPO="tests/llmapi_models" mkdir -p ${LLMAPI_MODEL_REPO} cp /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}" -r sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json From d2daa15091f88c87053304e49ba5165cde39dead Mon Sep 17 00:00:00 2001 From: krishung5 Date: Fri, 28 Feb 2025 15:22:50 -0800 Subject: [PATCH 5/6] Remove unused import --- python/openai/tests/test_chat_completions.py | 1 - python/openai/tests/test_completions.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 6c0b39c1fb..64fd2873d6 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import copy -import os import subprocess from pathlib import Path from typing import List diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index 5e76b9368b..1a58a3294f 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -25,7 +25,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import copy -import os import pytest From d631a648f039112074272ae36403893e816598b7 Mon Sep 17 00:00:00 2001 From: krishung5 Date: Tue, 18 Mar 2025 02:00:41 -0700 Subject: [PATCH 6/6] Update comment as verified that seed parameter is not supported yet --- python/openai/tests/test_chat_completions.py | 2 +- python/openai/tests/test_completions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py index 64fd2873d6..edcfd22fbe 100644 --- a/python/openai/tests/test_chat_completions.py +++ b/python/openai/tests/test_chat_completions.py @@ -371,7 +371,7 @@ def test_chat_completions_temperature_tensorrtllm( # TODO: Remove xfail for LLM API when it's verified. @pytest.mark.xfail( condition=lambda backend: backend == "llmapi", - reason="Seed parameter support to be verified for LLM API", + reason="Seed parameter is not supported in LLM API PyTorch workflow yet", ) # Simple tests to verify random seed roughly behaves as expected def test_chat_completions_seed(self, client, model: str, messages: List[dict]): diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py index 1a58a3294f..0b9fe0efa3 100644 --- a/python/openai/tests/test_completions.py +++ b/python/openai/tests/test_completions.py @@ -241,7 +241,7 @@ def test_completions_temperature_tensorrtllm( # TODO: Remove xfail for LLM API when it's verified. @pytest.mark.xfail( condition=lambda backend: backend == "llmapi", - reason="Seed parameter is not supported in LLM API PyTorch workflow yet", + reason="Seed parameter is not supported in LLM API PyTorch workflow yet", ) # Simple tests to verify seed roughly behaves as expected def test_completions_seed(self, client, model: str, prompt: str):
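
Note on the backend-conditional xfail introduced in patches 4-6: a plain condition= argument on the marker has no access to pytest fixtures, so another way to key the expected failure off the resolved backend value is to apply the marker from a fixture. The sketch below is illustrative only and not part of these patches; the fixture name xfail_seed_on_llmapi and the simplified request payload are assumptions, while the backend, client, and model names mirror fixtures already used in these tests.

import pytest


@pytest.fixture
def xfail_seed_on_llmapi(request, backend):
    # Attach the marker during test setup, where the resolved `backend`
    # fixture value is available, rather than in a static marker condition.
    if backend == "llmapi":
        request.applymarker(
            pytest.mark.xfail(
                reason="Seed parameter is not supported in LLM API PyTorch workflow yet"
            )
        )


def test_completions_seed_sketch(xfail_seed_on_llmapi, client, model):
    # Loosely mirrors test_completions_seed: identical requests with the same
    # seed and sampling settings are expected to return the same text.
    payload = {
        "model": model,
        "prompt": "Hello",
        "max_tokens": 16,
        "temperature": 1,
        "seed": 10,
    }
    text1 = client.post("/v1/completions", json=payload).json()["choices"][0]["text"]
    text2 = client.post("/v1/completions", json=payload).json()["choices"][0]["text"]
    assert text1 == text2

With this shape, the per-backend expectation lives next to the fixtures rather than in an environment-variable check inside each test, in line with the direction of patch 4 ("Use backend fixture for llmapi").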