diff --git a/python/openai/tests/conftest.py b/python/openai/tests/conftest.py
index 9ea9a5634e..8460889de2 100644
--- a/python/openai/tests/conftest.py
+++ b/python/openai/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -32,7 +32,6 @@
 from tests.utils import OpenAIServer, setup_fastapi_app, setup_server
 
 
-### TEST ENVIRONMENT SETUP ###
 def infer_test_environment():
     # Infer the test environment for simplicity in local dev/testing.
     try:
@@ -47,8 +46,15 @@ def infer_test_environment():
     try:
         import tensorrt_llm as _
 
-        backend = "tensorrtllm"
-        model = "tensorrt_llm_bls"
+        # TODO: Refactor away from environment variables
+        LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
+
+        if LLMAPI_SETUP:
+            backend = "llmapi"
+            model = "tensorrt_llm"
+        else:
+            backend = "tensorrtllm"
+            model = "tensorrt_llm_bls"
         return backend, model
     except ImportError:
         print("No tensorrt_llm installation found.")
@@ -84,13 +90,23 @@ def infer_test_model_repository(backend):
 # only once for all the tests below.
 @pytest.fixture(scope="module")
 def server():
+    # TODO: tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for the OpenAI server.
+    # In the future, if the backends are consolidated, this check can be updated or removed.
+    # key: the TEST_BACKEND value
+    # value: the corresponding backend flag for the OpenAI server
+    backend_map = {
+        "tensorrtllm": "tensorrtllm",
+        "llmapi": "tensorrtllm",
+        "vllm": "vllm",
+    }
+
     args = [
         "--model-repository",
         TEST_MODEL_REPOSITORY,
         "--tokenizer",
         TEST_TOKENIZER,
         "--backend",
-        TEST_BACKEND,
+        backend_map[TEST_BACKEND],
     ]
     # TODO: Incorporate kserve frontend binding smoke tests to catch any
     # breakage with default values or slight cli arg variations
diff --git a/python/openai/tests/test_chat_completions.py b/python/openai/tests/test_chat_completions.py
index 401601c526..edcfd22fbe 100644
--- a/python/openai/tests/test_chat_completions.py
+++ b/python/openai/tests/test_chat_completions.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -310,7 +310,7 @@ def test_chat_completions_temperature_vllm(
     def test_chat_completions_temperature_tensorrtllm(
         self, client, backend: str, model: str, messages: List[dict]
     ):
-        if backend != "tensorrtllm":
+        if backend != "tensorrtllm" and backend != "llmapi":
             pytest.skip(
                 reason="Only used to test TRT-LLM-specific temperature behavior"
             )
@@ -368,6 +368,11 @@ def test_chat_completions_temperature_tensorrtllm(
         assert response1_text == response2_text
         assert response1_text != response3_text
 
+    # TODO: Remove xfail for LLM API when it's verified.
+    @pytest.mark.xfail(
+        condition=lambda backend: backend == "llmapi",
+        reason="Seed parameter is not supported in LLM API PyTorch workflow yet",
+    )
     # Simple tests to verify random seed roughly behaves as expected
     def test_chat_completions_seed(self, client, model: str, messages: List[dict]):
         responses = []
diff --git a/python/openai/tests/test_completions.py b/python/openai/tests/test_completions.py
index d89ff4701e..0b9fe0efa3 100644
--- a/python/openai/tests/test_completions.py
+++ b/python/openai/tests/test_completions.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -191,8 +191,8 @@ def test_completions_temperature_vllm(
     def test_completions_temperature_tensorrtllm(
         self, client, backend: str, model: str, prompt: str
     ):
-        if backend != "tensorrtllm":
-            pytest.skip(reason="Only used to test vLLM-specific temperature behavior")
+        if backend != "tensorrtllm" and backend != "llmapi":
+            pytest.skip(reason="Only used to test TRTLLM-specific temperature behavior")
 
         responses = []
         payload1 = {
@@ -238,6 +238,11 @@ def test_completions_temperature_tensorrtllm(
         assert response1_text == response2_text
         assert response1_text != response3_text
 
+    # TODO: Remove xfail for LLM API when it's verified.
+    @pytest.mark.xfail(
+        condition=lambda backend: backend == "llmapi",
+        reason="Seed parameter is not supported in LLM API PyTorch workflow yet",
+    )
     # Simple tests to verify seed roughly behaves as expected
     def test_completions_seed(self, client, model: str, prompt: str):
         responses = []
@@ -258,7 +263,7 @@ def test_completions_seed(self, client, model: str, prompt: str):
                 json=payload1,
             )
         )
-        # Third response should differ with different temperature in payload
+        # Third response should differ with different seed in payload
         responses.append(
             client.post(
                 "/v1/completions",
diff --git a/python/openai/tests/test_openai_client.py b/python/openai/tests/test_openai_client.py
index 6f1b456ab4..6c083d91ee 100644
--- a/python/openai/tests/test_openai_client.py
+++ b/python/openai/tests/test_openai_client.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -42,6 +42,9 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
             # tensorrt_llm_bls +
             # preprocess -> tensorrt_llm -> postprocess
             assert len(models) == 4
+        elif backend == "llmapi":
+            # Only has one tensorrt_llm model.
+            assert len(models) == 1
         elif backend == "vllm":
             assert len(models) == 1
         else:
@@ -75,7 +78,7 @@ def test_openai_client_completion_echo(
         self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str
     ):
-        if backend == "tensorrtllm":
+        if backend == "tensorrtllm" or backend == "llmapi":
             pytest.skip(
                 reason="TRT-LLM backend currently only supports setting this parameter at model load time",
             )
@@ -108,6 +111,9 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: s
             # tensorrt_llm_bls +
             # preprocess -> tensorrt_llm -> postprocess
             assert len(models) == 4
+        elif backend == "llmapi":
+            # Only has one tensorrt_llm model.
+            assert len(models) == 1
         elif backend == "vllm":
             assert len(models) == 1
         else:
diff --git a/python/openai/tests/utils.py b/python/openai/tests/utils.py
index fdffcc5ea9..a2b655d86d 100644
--- a/python/openai/tests/utils.py
+++ b/python/openai/tests/utils.py
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -53,8 +53,17 @@ def setup_server(model_repository: str):
 
 
 def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
+    # TODO: tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for the OpenAI server.
+    # In the future, if the backends are consolidated, this check can be updated or removed.
+    # key: the backend value
+    # value: the corresponding backend flag for the OpenAI server
+    backend_map = {
+        "tensorrtllm": "tensorrtllm",
+        "llmapi": "tensorrtllm",
+        "vllm": "vllm",
+    }
     engine: TritonLLMEngine = TritonLLMEngine(
-        server=server, tokenizer=tokenizer, backend=backend
+        server=server, tokenizer=tokenizer, backend=backend_map[backend]
     )
     frontend: FastApiFrontend = FastApiFrontend(engine=engine)
     return frontend.app
diff --git a/qa/L0_openai/test.sh b/qa/L0_openai/test.sh
index 0921bce98e..7a86a6db97 100755
--- a/qa/L0_openai/test.sh
+++ b/qa/L0_openai/test.sh
@@ -85,6 +85,12 @@ function prepare_tensorrtllm() {
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/preprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,preprocessing_instance_count:1
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/postprocessing/config.pbtxt tokenizer_dir:${HF_LLAMA_MODEL},triton_max_batch_size:64,postprocessing_instance_count:1
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:64,decoupled_mode:True,bls_instance_count:1,accumulate_tokens:False,logits_datatype:TYPE_FP32
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True
+
+    # Prepare LLM API setup
+    LLMAPI_MODEL_REPO="tests/llmapi_models"
+    mkdir -p ${LLMAPI_MODEL_REPO}
+    cp /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}" -r
+    sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json
 }
 
 function pre_test() {
@@ -103,11 +109,39 @@ function run_test() {
     # Capture error code without exiting to allow log collection
     set +e
-    pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG}
-    if [ $? -ne 0 ]; then
-        cat ${TEST_LOG}
-        echo -e "\n***\n*** Test Failed\n***"
-        RET=1
+
+    if [ "${IMAGE_KIND}" == "TRTLLM" ]; then
+        echo "Running TensorRT-LLM tests..."
+
+        # First run with default model setup
+        echo "Running tests with default model setup..."
+        pytest -s -v --junitxml=test_openai_default.xml tests/ 2>&1 > test_openai_default.log
+        DEFAULT_RESULT=$?
+
+        # Then run with LLM API setup
+        echo "Running tests with LLM API setup..."
+        LLMAPI_SETUP=1 pytest -s -v --junitxml=test_openai_llmapi.xml tests/ 2>&1 > test_openai_llmapi.log
+        LLMAPI_RESULT=$?
+
+        # Combine results
+        if [ $DEFAULT_RESULT -ne 0 ]; then
+            cat test_openai_default.log
+            echo -e "\n***\n*** Test Failed with default model setup\n***"
+            RET=1
+        fi
+        if [ $LLMAPI_RESULT -ne 0 ]; then
+            cat test_openai_llmapi.log
+            echo -e "\n***\n*** Test Failed with LLM API setup\n***"
+            RET=1
+        fi
+    else
+        echo "Running vLLM tests..."
+        pytest -s -v --junitxml=test_openai.xml tests/ 2>&1 > ${TEST_LOG}
+        if [ $? -ne 0 ]; then
+            cat ${TEST_LOG}
+            echo -e "\n***\n*** Test Failed\n***"
+            RET=1
+        fi
     fi
     set -e
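
Note on running the two TRT-LLM passes by hand: a minimal sketch, assuming a TRT-LLM container where prepare_tensorrtllm() has already created both model repositories, and assuming pytest is invoked from the python/openai directory (as the relative tests/ paths in test.sh suggest). The only switch between the two setups is the LLMAPI_SETUP environment variable read by tests/conftest.py.

    # Default setup: backend "tensorrtllm", model "tensorrt_llm_bls"
    pytest -s -v tests/

    # LLM API setup: backend "llmapi", model "tensorrt_llm"
    # (conftest.py checks os.environ.get("LLMAPI_SETUP", 0))
    LLMAPI_SETUP=1 pytest -s -v tests/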