26 changes: 21 additions & 5 deletions python/openai/tests/conftest.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -32,7 +32,6 @@
from tests.utils import OpenAIServer, setup_fastapi_app, setup_server


### TEST ENVIRONMENT SETUP ###
def infer_test_environment():
# Infer the test environment for simplicity in local dev/testing.
try:
@@ -47,8 +46,15 @@ def infer_test_environment():
try:
import tensorrt_llm as _

backend = "tensorrtllm"
model = "tensorrt_llm_bls"
# TODO: Refactor away from environment variables
LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", "0") != "0"

if LLMAPI_SETUP:
backend = "llmapi"
model = "tensorrt_llm"
else:
backend = "tensorrtllm"
model = "tensorrt_llm_bls"
return backend, model
except ImportError:
print("No tensorrt_llm installation found.")
@@ -84,13 +90,23 @@ def infer_test_model_repository(backend):
# only once for all the tests below.
@pytest.fixture(scope="module")
def server():
# TODO: The tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for the OpenAI server.
# In the future, if the backends are consolidated, this check can be updated or removed.
# key: the TEST_BACKEND value
# value: the corresponding backend flag for OpenAI server
backend_map = {
"tensorrtllm": "tensorrtllm",
"llmapi": "tensorrtllm",
"vllm": "vllm",
}

args = [
"--model-repository",
TEST_MODEL_REPOSITORY,
"--tokenizer",
TEST_TOKENIZER,
"--backend",
TEST_BACKEND,
backend_map[TEST_BACKEND],
]
# TODO: Incorporate kserve frontend binding smoke tests to catch any
# breakage with default values or slight cli arg variations
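
For context, the environment-driven selection added above can be summarized by the following minimal sketch (the helper name, the standalone form, and the strict "0"/"1" parsing are illustrative, not part of the change):

import os

# Backend flags the OpenAI frontend accepts; the LLM API test setup still
# drives the frontend with the "tensorrtllm" flag.
BACKEND_FLAG = {"tensorrtllm": "tensorrtllm", "llmapi": "tensorrtllm", "vllm": "vllm"}


def infer_trtllm_test_setup():
    # LLMAPI_SETUP=1 selects the single-model LLM API repository; anything
    # else keeps the default BLS model setup.
    use_llmapi = os.environ.get("LLMAPI_SETUP", "0") != "0"
    backend = "llmapi" if use_llmapi else "tensorrtllm"
    model = "tensorrt_llm" if use_llmapi else "tensorrt_llm_bls"
    return backend, model, BACKEND_FLAG[backend]
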
9 changes: 7 additions & 2 deletions python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -310,7 +310,7 @@ def test_chat_completions_temperature_vllm(
def test_chat_completions_temperature_tensorrtllm(
self, client, backend: str, model: str, messages: List[dict]
):
if backend != "tensorrtllm":
if backend != "tensorrtllm" and backend != "llmapi":
pytest.skip(
reason="Only used to test TRT-LLM-specific temperature behavior"
)
@@ -368,6 +368,11 @@ def test_chat_completions_temperature_tensorrtllm(
assert response1_text == response2_text
assert response1_text != response3_text

# TODO: Remove this xfail once seed support is verified for the LLM API workflow.
@pytest.mark.xfail(
condition=lambda backend: backend == "llmapi",
reason="Seed parameter is not supported in LLM API PyTorch workflow yet",
)
# Simple tests to verify random seed roughly behaves as expected
def test_chat_completions_seed(self, client, model: str, messages: List[dict]):
responses = []
13 changes: 9 additions & 4 deletions python/openai/tests/test_completions.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -191,8 +191,8 @@ def test_completions_temperature_vllm(
def test_completions_temperature_tensorrtllm(
self, client, backend: str, model: str, prompt: str
):
if backend != "tensorrtllm":
pytest.skip(reason="Only used to test vLLM-specific temperature behavior")
if backend != "tensorrtllm" and backend != "llmapi":
pytest.skip(reason="Only used to test TRTLLM-specific temperature behavior")

responses = []
payload1 = {
@@ -238,6 +238,11 @@ def test_completions_temperature_tensorrtllm(
assert response1_text == response2_text
assert response1_text != response3_text

# TODO: Remove this xfail once seed support is verified for the LLM API workflow.
@pytest.mark.xfail(
condition=lambda backend: backend == "llmapi",
reason="Seed parameter is not supported in LLM API PyTorch workflow yet",
)
# Simple tests to verify seed roughly behaves as expected
def test_completions_seed(self, client, model: str, prompt: str):
responses = []
@@ -258,7 +263,7 @@ def test_completions_seed(self, client, model: str, prompt: str):
json=payload1,
)
)
# Third response should differ with different temperature in payload
# Third response should differ with different seed in payload
responses.append(
client.post(
"/v1/completions",
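
For reference, the determinism checks in this file reduce to request payloads along these lines (a sketch only; the prompt, model name, and max_tokens values are placeholders):

# Request shapes exercised by the tests above (values are placeholders).
greedy_payload = {"model": "tensorrt_llm", "prompt": "Hello, my name is", "temperature": 0.0, "max_tokens": 16}
seeded_payload = {"model": "tensorrt_llm", "prompt": "Hello, my name is", "seed": 42, "max_tokens": 16}
# Posting greedy_payload (or the same seeded_payload) twice to /v1/completions
# should return identical "choices"[0]["text"]; changing only the seed should not.
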
10 changes: 8 additions & 2 deletions python/openai/tests/test_openai_client.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -42,6 +42,9 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
# tensorrt_llm_bls +
# preprocess -> tensorrt_llm -> postprocess
assert len(models) == 4
elif backend == "llmapi":
# Only has one tensorrt_llm model.
assert len(models) == 1
elif backend == "vllm":
assert len(models) == 1
else:
@@ -75,7 +78,7 @@ def test_openai_client_chat_completion(
def test_openai_client_completion_echo(
self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str
):
if backend == "tensorrtllm":
if backend == "tensorrtllm" or backend == "llmapi":
pytest.skip(
reason="TRT-LLM backend currently only supports setting this parameter at model load time",
)
@@ -108,6 +111,9 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: str):
# tensorrt_llm_bls +
# preprocess -> tensorrt_llm -> postprocess
assert len(models) == 4
elif backend == "llmapi":
# Only has one tensorrt_llm model.
assert len(models) == 1
elif backend == "vllm":
assert len(models) == 1
else:
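
As a usage sketch of the model-count expectations above (the base URL and API key are assumptions for a locally running OpenAI frontend):

import openai

# Assumed local endpoint for a running frontend; adjust host/port as needed.
client = openai.OpenAI(base_url="http://localhost:9000/v1", api_key="EMPTY")
models = list(client.models.list())
# The BLS TRT-LLM repository exposes four models (bls + preprocess ->
# tensorrt_llm -> postprocess); the LLM API and vLLM repositories expose one.
print([m.id for m in models])
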
13 changes: 11 additions & 2 deletions python/openai/tests/utils.py
@@ -1,4 +1,4 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -53,8 +53,17 @@ def setup_server(model_repository: str):


def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
# TODO: The tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for the OpenAI server.
# In the future if the backends are consolidated, this check can be updated or removed.
# key: the backend value
# value: the corresponding backend flag for OpenAI server
backend_map = {
"tensorrtllm": "tensorrtllm",
"llmapi": "tensorrtllm",
"vllm": "vllm",
}
engine: TritonLLMEngine = TritonLLMEngine(
server=server, tokenizer=tokenizer, backend=backend
server=server, tokenizer=tokenizer, backend=backend_map[backend]
)
frontend: FastApiFrontend = FastApiFrontend(engine=engine)
return frontend.app
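
As a usage sketch of the mapping above, an "llmapi" model repository is still served with the "tensorrtllm" frontend flag (the repository path and tokenizer are taken from the test script changes below; the wiring itself is illustrative):

from tests.utils import setup_fastapi_app, setup_server

# Hypothetical wiring for an LLM API repository; internally the engine is
# created with backend="tensorrtllm" via backend_map.
server = setup_server("tests/llmapi_models")
app = setup_fastapi_app(
    tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct", server=server, backend="llmapi"
)
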
44 changes: 39 additions & 5 deletions qa/L0_openai/test.sh
@@ -85,6 +85,12 @@ function prepare_tensorrtllm() {
python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True

# Prepare LLM API setup
LLMAPI_MODEL_REPO="tests/llmapi_models"
mkdir -p ${LLMAPI_MODEL_REPO}
cp -r /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}"
sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json
}

function pre_test() {
@@ -103,11 +109,39 @@ function run_test() {

# Capture error code without exiting to allow log collection
set +e
pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG}
if [ $? -ne 0 ]; then
cat ${TEST_LOG}
echo -e "\n***\n*** Test Failed\n***"
RET=1

if [ "${IMAGE_KIND}" == "TRTLLM" ]; then
echo "Running TensorRT-LLM tests..."

# First run with default model setup
echo "Running tests with default model setup..."
pytest -s -v --junitxml=test_openai_default.xml tests/ > test_openai_default.log 2>&1
DEFAULT_RESULT=$?

# Then run with LLM API setup
echo "Running tests with LLM API setup..."
LLMAPI_SETUP=1 pytest -s -v --junitxml=test_openai_llmapi.xml tests/ > test_openai_llmapi.log 2>&1
LLMAPI_RESULT=$?

# Combine results
if [ $DEFAULT_RESULT -ne 0 ]; then
cat test_openai_default.log
echo -e "\n***\n*** Test Failed with default model setup\n***"
RET=1
fi
if [ $LLMAPI_RESULT -ne 0 ]; then
cat test_openai_llmapi.log
echo -e "\n***\n*** Test Failed with LLM API setup\n***"
RET=1
fi
else
echo "Running vLLM tests..."
pytest -s -v --junitxml=test_openai.xml tests/ > ${TEST_LOG} 2>&1
if [ $? -ne 0 ]; then
cat ${TEST_LOG}
echo -e "\n***\n*** Test Failed\n***"
RET=1
fi
fi
set -e
