Commit 5052e8c

Change default chat template (#3723)
Intermediate change for merging: #3722
1 parent: 14cb469 · commit: 5052e8c

File tree

4 files changed, +48 -335 lines changed


src/llm/servable_initializer.cpp

Lines changed: 2 additions & 2 deletions
@@ -51,7 +51,7 @@
 namespace ovms {
 
 static const std::string CHAT_TEMPLATE_WARNING_MESSAGE = "Warning: Chat template has not been loaded properly. Servable will not respond to /chat/completions endpoint.";
-static const std::string DEFAULT_CHAT_TEMPLATE = R"({% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }})";
+static const std::string DEFAULT_CHAT_TEMPLATE = R"({% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %})";
 
 void GenAiServableInitializer::loadChatTemplate(std::shared_ptr<GenAiServableProperties> properties, const std::string& chatTemplateDirectory) {
 #if (PYTHON_DISABLE == 0)
@@ -217,7 +217,7 @@ void GenAiServableInitializer::loadPyTemplateProcessor(std::shared_ptr<GenAiServ
 
 # Default chat template accepts only single message and outputs only it's 'content'
 # effectively turning it into a regular prompt.
-default_chat_template = "{% if messages|length != 1 %} {{ raise_exception('This servable accepts only single message requests') }}{% endif %}{{ messages[0]['content'] }}"
+default_chat_template = "{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}{% elif message['role'] == 'system' %}{{ '<|system|>\n' + message['content'] + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}"
 
 bos_token = ""
 eos_token = ""
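To see what the new default template produces, here is a minimal sketch that renders it with the standalone jinja2 package. This is illustration only, not how OVMS evaluates the template (OVMS uses its embedded Python processor or GenAI parsing), and the eos_token value "</s>" is an assumed placeholder; OVMS takes the real value from the tokenizer and falls back to an empty string, as the diff above shows.

```python
# Illustration only: render the new DEFAULT_CHAT_TEMPLATE with plain jinja2.
# The eos_token value "</s>" is an assumption for the example.
from jinja2 import Template

DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}"
    "{% endif %}{% endfor %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is OpenVINO?"},
]
print(Template(DEFAULT_CHAT_TEMPLATE).render(messages=messages, eos_token="</s>"))
# <|system|>
# You are a helpful assistant.</s>User: What is OpenVINO?
```

Unlike the previous default, which called raise_exception for anything but a single message and emitted only that message's content, the new template walks an arbitrary conversation and formats each turn by role.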

src/test/llm/assisted_decoding_test.cpp

Lines changed: 17 additions & 17 deletions
@@ -94,8 +94,12 @@ class AssistedDecodingPipelinesHttpTest : public ::testing::Test {
         }
     }
 
-    int generateExpectedText(std::string prompt, bool addSpecialTokens) {
+    int generateExpectedText(std::string prompt, bool addSpecialTokens, bool applyChatTemplate) {
         try {
+            if (applyChatTemplate) {
+                ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}});
+                prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, true);
+            }
             ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
             std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
             auto generationHandle = cbPipe->add_request(
@@ -162,7 +166,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonSpeculativeDecodin
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -185,8 +189,7 @@
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -208,15 +211,15 @@
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonSpeculativeDecoding) {
+    GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     // Static number of candidates
@@ -247,8 +250,7 @@
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 
     // Dynamic number of candidates
     requestBody = R"(
@@ -278,8 +280,7 @@
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, speculativeDecodingExclusiveParametersProvided) {
@@ -318,7 +319,7 @@ TEST_F(AssistedDecodingPipelinesHttpTest, unaryCompletionsJsonPromptLookupDecodi
     // Generate reference from the base model (unassisted generation)
    config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", true, false), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     std::string requestBody = R"(
@@ -341,15 +342,15 @@
     ASSERT_EQ(parsedResponse["choices"].Capacity(), 1);
     auto& choice = parsedResponse["choices"].GetArray()[0];
     ASSERT_TRUE(choice["text"].IsString());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
+    EXPECT_STREQ(choice["text"].GetString(), expectedMessages[0].c_str());
 }
 
 TEST_F(AssistedDecodingPipelinesHttpTest, unaryChatCompletionsJsonPromptLookupDecoding) {
+    GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
     // Generate reference from the base model (unassisted generation)
     config.max_new_tokens = 10;
     config.temperature = 0;
-    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+    ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
     ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
 
     auto requestBody = R"(
@@ -380,8 +381,7 @@
     ASSERT_TRUE(choice["message"]["content"].IsString());
     ASSERT_TRUE(choice["finish_reason"].IsString());
     ASSERT_FALSE(choice["logprobs"].IsObject());
-    // TODO: awaiting OV/GenAI fix, uncomment when fixed
-    // ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
+    ASSERT_EQ(choice["message"]["content"].GetString(), expectedMessages[0]);
 }
 
 // Consider parametrization of negative tests with request body and endpoint as parameters
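The reworked generateExpectedText helper is what lets the chat-completions tests compare against exact reference strings again: when applyChatTemplate is true, the raw prompt is first wrapped in a single-turn chat history and passed through the tokenizer's chat template, so the reference generation sees the same input the endpoint builds. Below is a rough, hedged Python equivalent; it assumes the openvino_genai Python bindings mirror the C++ calls used in the test (Tokenizer.apply_chat_template and encode), and model_dir is a hypothetical placeholder path.

```python
# Hedged sketch of the updated reference-prompt flow, assuming the
# openvino_genai Python bindings mirror the C++ API in the test above.
import openvino_genai as ov_genai

model_dir = "./model"  # hypothetical path to an exported model with a tokenizer
tokenizer = ov_genai.Tokenizer(model_dir)

prompt = "What is OpenVINO?"
apply_chat_template = True

if apply_chat_template:
    # Wrap the raw prompt in a single-turn chat history, as the test does,
    # and let the tokenizer's chat template produce the final prompt string.
    history = [{"role": "user", "content": prompt}]
    prompt = tokenizer.apply_chat_template(history, add_generation_prompt=True)

# Tokenize without special tokens, matching addSpecialTokens=false in the test.
input_ids = tokenizer.encode(prompt, add_special_tokens=False).input_ids
print(input_ids.shape)
```

This also explains the parameter pattern in the assertions: completions tests keep (true, false), while chat-completions tests switch to (false, true) so the reference goes through the chat template exactly once.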

src/test/llm/llmnode_test.cpp

Lines changed: 13 additions & 41 deletions
@@ -108,8 +108,12 @@ class LLMFlowHttpTest : public ::testing::Test {
         }
     }
 
-    int generateExpectedText(std::string prompt, bool addSpecialTokens = true) {
+    int generateExpectedText(std::string prompt, bool addSpecialTokens = true, bool applyChatTemplate = false) {
         try {
+            if (applyChatTemplate) {
+                ov::genai::ChatHistory chatHistory({{{"role", "user"}, {"content", prompt}}});
+                prompt = cbPipe->get_tokenizer().apply_chat_template(chatHistory, true);
+            }
             ov::Tensor promptIds = cbPipe->get_tokenizer().encode(prompt, ov::genai::add_special_tokens(addSpecialTokens)).input_ids;
             std::cout << "Generated prompt ids: " << getPromptTokensString(promptIds) << std::endl;
             auto generationHandle = cbPipe->add_request(
@@ -737,14 +741,15 @@ TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonNFail) {
 }
 
 TEST_P(LLMFlowHttpTestParameterized, unaryChatCompletionsJsonN) {
+    GTEST_SKIP(); // TODO: Temporary skip to synchronize CI workers
     auto params = GetParam();
     config.max_new_tokens = 5;
     config.rng_seed = 1;
     config.num_beams = 16;
     config.num_return_sequences = 8;
     config.echo = false;
     if (params.generateExpectedOutput) {
-        ASSERT_EQ(generateExpectedText("What is OpenVINO?", false), 0);
+        ASSERT_EQ(generateExpectedText("What is OpenVINO?", false, true), 0);
         ASSERT_EQ(config.num_return_sequences, expectedMessages.size());
     }
     std::string requestBody = R"(
@@ -2754,34 +2759,7 @@ TEST_P(LLMHttpParametersValidationTest, missingContentInMessage) {
     )";
 
     ovms::Status status = handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser);
-#if (PYTHON_DISABLE == 0)
-    bool genAiTemplateParsing = false; // With Python enabled, we use native Jinja2 template parsing
-#else
-    bool genAiTemplateParsing = true; // With Python disabled, we use GenAI template parsing
-#endif
-
-    if (params.modelName.find("vlm") != std::string::npos) {
-        ASSERT_EQ(status.getCode(), ovms::StatusCode::OK); // GenAI accepts such messages, so we expect a successful response
-        return;
-    }
-
-    if (genAiTemplateParsing) {
-        /*
-        This test checks if API handler validation allows messages without content.
-        The reason why we expect generic error here is that with GenAI template rendering missing content is unexpected.
-        On the API handler level this is a positive path as this test confirms that request reaches template processing phase.
-        */
-        ASSERT_EQ(status.getCode(), ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
-        ASSERT_NE(status.string().find("Response generation failed"), std::string::npos);
-    } else {
-        /*
-        This test checks if API handler validation allows messages without content.
-        The reason why we expect error here is that for the tested LLM model, lack of content means that pipeline input is empty.
-        On the API handler level this is a positive path as this test confirms that request reaches template processing phase.
-        */
-        ASSERT_EQ(status.getCode(), ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
-        ASSERT_NE(status.string().find("Final prompt after applying chat template is empty"), std::string::npos);
-    }
+    ASSERT_EQ(status.getCode(), ovms::StatusCode::OK);
 }
 
 TEST_P(LLMHttpParametersValidationTest, roleNotAString) {
@@ -3267,19 +3245,13 @@ TEST_P(LLMHttpParametersValidationTest, MessagesWithOnlyRole) {
        {
            "model": ")" + params.modelName +
                     R"(",
-           "messages": [{"role": "abc"}]
+           "messages": [{"role": "user"}]
        }
    )";
 
-    if (params.modelName.find("vlm") != std::string::npos) {
-        ASSERT_EQ(
-            handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
-            ovms::StatusCode::OK); // GenAI supports such messages
-    } else {
-        ASSERT_EQ(
-            handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
-            ovms::StatusCode::MEDIAPIPE_EXECUTION_ERROR);
-    }
+    ASSERT_EQ(
+        handler->dispatchToProcessor(endpointChatCompletions, requestBody, &response, comp, responseComponents, writer, multiPartParser),
+        ovms::StatusCode::OK); // GenAI supports such messages
 }
 
 TEST_P(LLMHttpParametersValidationTest, SpeculativeDecodingNoSDSpecificParametersProvided) {
@@ -3345,7 +3317,7 @@ TEST_P(LLMHttpParametersValidationTest, MessagesWithMoreMessageFields) {
            "model": ")" + params.modelName +
                     R"(",
            "max_tokens": 1,
-           "messages": [{"role": "123", "content": "def", "unexpected": "123"}]
+           "messages": [{"role": "user", "content": "def", "unexpected": "123"}]
        }
    )";
 
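The last two validation tests switch the message role from placeholder values ("abc", "123") to "user". A plausible reading, sketched below with plain jinja2 (an assumption, since OVMS may render or validate differently), is that the new default template only emits output for the user, system, and assistant roles, so a request whose only message carries an unknown role would now render to an empty prompt rather than passing the content straight through as the old template did.

```python
# Sketch (assumption: plain jinja2 approximates the default template's rendering):
# an unknown role produces an empty prompt, while "user" produces "User: <content>".
from jinja2 import Template

DEFAULT_CHAT_TEMPLATE = (
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] }}"
    "{% elif message['role'] == 'system' %}{{ '<|system|>\\n' + message['content'] + eos_token }}"
    "{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}"
    "{% endif %}{% endfor %}"
)
template = Template(DEFAULT_CHAT_TEMPLATE)

print(repr(template.render(messages=[{"role": "123", "content": "def"}], eos_token="")))   # ''
print(repr(template.render(messages=[{"role": "user", "content": "def"}], eos_token="")))  # 'User: def'
```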
