
Commit 35a663b

Introduce VLM pipeline support (#3095)
Implement VisualLanguageModelServable and its dependencies to support multimodal pipelines in HttpLLMCalculator.
1 parent 74d4527 commit 35a663b

17 files changed: +430 -19 lines changed
demos/common/export_models/export_model.py

Lines changed: 2 additions & 0 deletions
@@ -38,6 +38,7 @@ def add_common_arguments(parser):
 subparsers = parser.add_subparsers(help='subcommand help', required=True, dest='task')
 parser_text = subparsers.add_parser('text_generation', help='export model for chat and completion endpoints')
 add_common_arguments(parser_text)
+parser_text.add_argument('--pipeline_type', default='CONTINUOUS_BATCHING', help='Type of the pipeline to be used. Can be either CONTINUOUS_BATCHING or VISUAL_LANGUAGE_MODEL.', dest='pipeline_type')
 parser_text.add_argument('--kv_cache_precision', default=None, choices=["u8"], help='u8 or empty (model default). Reduced kv cache precision to u8 lowers the cache size consumption.', dest='kv_cache_precision')
 parser_text.add_argument('--enable_prefix_caching', action='store_true', help='This algorithm is used to cache the prompt tokens.', dest='enable_prefix_caching')
 parser_text.add_argument('--disable_dynamic_split_fuse', action='store_false', help='The maximum number of tokens that can be batched together.', dest='dynamic_split_fuse')
@@ -143,6 +144,7 @@ def add_common_arguments(parser):
         }
         node_options: {
           [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
+            pipeline_type: {{pipeline_type}},
             models_path: "{{model_path}}",
             plugin_config: '{ {% if kv_cache_precision %}"KV_CACHE_PRECISION": "{{kv_cache_precision}}"{% endif %}}',
             enable_prefix_caching: {% if not enable_prefix_caching %}false{% else %} true{% endif%},
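
Note: with the new --pipeline_type argument, an export run like the one in prepare_llm_models.sh below (--pipeline_type VISUAL_LANGUAGE_MODEL --kv_cache_precision u8) should render the template above into node_options roughly as follows; this is a sketch based only on the template in this diff, with models_path left as a placeholder and the remaining fields elided:

node_options: {
  [type.googleapis.com/mediapipe.LLMCalculatorOptions]: {
    pipeline_type: VISUAL_LANGUAGE_MODEL,
    models_path: "<model_path>",
    plugin_config: '{ "KV_CACHE_PRECISION": "u8"}',
    ...
  }
}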

prepare_llm_models.sh

Lines changed: 6 additions & 5 deletions
@@ -20,10 +20,11 @@ if [ -z "$1" ]; then
     exit 1
 fi
 
+CB_MODEL="facebook/opt-125m"
 EMBEDDING_MODEL="thenlper/gte-small"
 RERANK_MODEL="BAAI/bge-reranker-base"
 VLM_MODEL="OpenGVLab/InternVL2-1B"
-if [ -d "$1/facebook/opt-125m" ] && [ -d "$1/$EMBEDDING_MODEL" ] && [ -d "$1/$RERANK_MODEL" ]; then
+if [ -d "$1/$CB_MODEL" ] && [ -d "$1/$EMBEDDING_MODEL" ] && [ -d "$1/$RERANK_MODEL" ] && [ -d "$1/$VLM_MODEL" ]; then
     echo "Models directory $1 exists. Skipping downloading models."
     exit 0
 fi
@@ -44,16 +45,16 @@ else
 fi
 mkdir -p $1
 
-if [ -d "$1/facebook/opt-125m" ]; then
-    echo "Models directory $1/facebook/opt-125m exists. Skipping downloading models."
+if [ -d "$1/$CB_MODEL" ]; then
+    echo "Models directory $1/$CB_MODEL exists. Skipping downloading models."
 else
-    python3 demos/common/export_models/export_model.py text_generation --source_model facebook/opt-125m --weight-format int8 --model_repository_path $1
+    python3 demos/common/export_models/export_model.py text_generation --source_model "$CB_MODEL" --weight-format int8 --model_repository_path $1
 fi
 
 if [ -d "$1/$VLM_MODEL" ]; then
     echo "Models directory $1/$VLM_MODEL exists. Skipping downloading models."
 else
-    python3 demos/common/export_models/export_model.py text_generation --source_model "$VLM_MODEL" --weight-format int4 --kv_cache_precision u8 --model_repository_path $1
+    python3 demos/common/export_models/export_model.py text_generation --pipeline_type VISUAL_LANGUAGE_MODEL --source_model "$VLM_MODEL" --weight-format int4 --kv_cache_precision u8 --model_repository_path $1
 fi
 
 if [ -d "$1/$EMBEDDING_MODEL" ]; then

src/BUILD

Lines changed: 1 addition & 0 deletions
@@ -1896,6 +1896,7 @@ cc_test(
         "test/llmnode_test.cpp",
         "test/llmtemplate_test.cpp",
         "test/text_streamer_test.cpp",
+        "test/llm/visual_language_model/complete_flow_test.cpp",
     ],
     "//:disable_python" : [],
 }),

src/llm/BUILD

Lines changed: 4 additions & 2 deletions
@@ -93,8 +93,10 @@ cc_library(
 cc_library(
     name = "genai_servables",
     hdrs = ["servable.hpp", "servable_initializer.hpp",
-            "continuous_batching/servable.hpp", "continuous_batching/llm_executor.hpp", "continuous_batching/servable_initializer.hpp"],
-    srcs = ["servable.cpp", "servable_initializer.cpp", "continuous_batching/servable.cpp", "continuous_batching/servable_initializer.cpp"],
+            "continuous_batching/servable.hpp", "continuous_batching/llm_executor.hpp", "continuous_batching/servable_initializer.hpp",
+            "visual_language_model/servable.hpp"],
+    srcs = ["servable.cpp", "servable_initializer.cpp", "continuous_batching/servable.cpp", "continuous_batching/servable_initializer.cpp",
+            "visual_language_model/servable.cpp"],
     deps = [
         "//third_party:openvino",
         "@mediapipe//mediapipe/framework:calculator_framework",

src/llm/apis/openai_completions.cpp

Lines changed: 14 additions & 0 deletions
@@ -139,10 +139,14 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages() {
         auto& obj = it->value.GetArray()[i];
         if (!obj.IsObject())
             return absl::InvalidArgumentError("Message is not a JSON object");
+        // Add a new message to the chat history
+        request.chatHistory.push_back({});
         for (auto member = obj.MemberBegin(); member != obj.MemberEnd(); member++) {
             if (!member->name.IsString())
                 return absl::InvalidArgumentError("Invalid message structure");
             if (member->value.IsString()) {
+                // Add a new field to the last message in the history
+                request.chatHistory.back().insert({member->name.GetString(), member->value.GetString()});
                 continue;
             } else {
                 if (member->name.GetString() == std::string("content") && member->value.IsArray()) {
@@ -191,7 +195,13 @@ absl::Status OpenAIChatCompletionsHandler::parseMessages() {
                         return absl::InvalidArgumentError("Unsupported content type");
                     }
                 }
+                // Pull the text out of the nested structure into the "content" field; for image data
+                // the whole "content" value is erased, since images are stored separately in request.images
                 member->value = contentText;
+                // Add a new field to the last message in the history if the content is text
+                if (member->value.IsString()) {
+                    request.chatHistory.back().insert({member->name.GetString(), member->value.GetString()});
+                }
             } else {
                 return absl::InvalidArgumentError("Invalid message structure - content should be string or array");
             }
@@ -214,6 +224,10 @@ const std::vector<ov::Tensor> OpenAIChatCompletionsHandler::getImages() const {
     return request.images;
 }
 
+const ov::genai::ChatHistory& OpenAIChatCompletionsHandler::getChatHistory() const {
+    return request.chatHistory;
+}
+
 absl::Status OpenAIChatCompletionsHandler::parseChatCompletionsPart(uint32_t maxTokensLimit) {
     // messages: [{role: content}, {role: content}, ...]; required
     auto status = parseMessages();
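
A standalone illustration of the chat-history bookkeeping added above (not OVMS code): parseMessages() pushes one history entry per message and copies string fields into it, while image parts are kept out of the history and stored in request.images. The sketch assumes ov::genai::ChatHistory behaves like a vector of string-to-string maps, which is what the push_back({}) / back().insert({...}) calls in this diff rely on.

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for ov::genai::ChatHistory (assumption: a sequence of string->string message maps).
using ChatHistoryLike = std::vector<std::unordered_map<std::string, std::string>>;

int main() {
    // Roughly what parseMessages() would accumulate for:
    //   "messages": [{"role": "user", "content": "What is on this image?"}]
    // (an image_url part inside "content" would go to request.images instead of the history)
    ChatHistoryLike history;
    history.push_back({});  // new message -> new history entry
    history.back().insert({"role", "user"});
    history.back().insert({"content", "What is on this image?"});

    for (const auto& message : history)
        for (const auto& [field, value] : message)
            std::cout << field << ": " << value << "\n";
    return 0;
}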

src/llm/apis/openai_completions.hpp

Lines changed: 2 additions & 0 deletions
@@ -63,6 +63,7 @@ struct CompletionUsageStatistics {
 
 // Class that maps OpenAI request content and provides methods to create GenerationConfig from it.
 struct OpenAIChatCompletionsRequest {
+    ov::genai::ChatHistory chatHistory;
     std::string processedJson;
     std::vector<ov::Tensor> images;
     std::optional<std::string> prompt{std::nullopt};
@@ -189,6 +190,7 @@
     StreamOptions getStreamOptions() const;
     const std::string& getProcessedJson() const;
     const std::vector<ov::Tensor> getImages() const;
+    const ov::genai::ChatHistory& getChatHistory() const;
 
     bool isStream() const;
     std::string getModel() const;

src/llm/continuous_batching/servable.cpp

Lines changed: 11 additions & 3 deletions
@@ -48,6 +48,13 @@ void ContinuousBatchingServable::notifyExecutorThread() {
     properties->llmExecutorWrapper->notifyNewRequestArrived();
 }
 
+absl::Status ContinuousBatchingServable::addRequestToPipeline(std::shared_ptr<ContinuousBatchingServableExecutionContext>& executionContext) {
+    executionContext->generationHandle = properties->pipeline->add_request(currentRequestId++,  // to be removed from API?
+        executionContext->inputIds,
+        executionContext->apiHandler->createGenerationConfig());
+    return absl::OkStatus();
+}
+
 // Node resources interface start
 std::shared_ptr<GenAiServableExecutionContext> ContinuousBatchingServable::createExecutionContext() {
     return std::make_shared<ContinuousBatchingServableExecutionContext>();
@@ -63,9 +70,10 @@ absl::Status ContinuousBatchingServable::scheduleExecution(std::shared_ptr<GenAi
         return absl::CancelledError();
     }
 
-    cbExecutionContext->generationHandle = properties->pipeline->add_request(currentRequestId++,  // to be removed from API?
-        cbExecutionContext->inputIds,
-        cbExecutionContext->apiHandler->createGenerationConfig());
+    auto status = addRequestToPipeline(cbExecutionContext);
+    if (!status.ok()) {
+        return status;
+    }
 
     cbExecutionContext->payload.client->registerDisconnectionCallback([genHandle = cbExecutionContext->generationHandle]() {
         genHandle->stop();

src/llm/continuous_batching/servable.hpp

Lines changed: 5 additions & 2 deletions
@@ -37,16 +37,19 @@ struct ContinuousBatchingServableProperties : public GenAiServableProperties {
 };
 
 class ContinuousBatchingServable : public GenAiServable {
-    std::shared_ptr<ContinuousBatchingServableProperties> properties;
-
 protected:
+    std::shared_ptr<ContinuousBatchingServableProperties> properties;
     void notifyExecutorThread();
 
 public:
     ContinuousBatchingServable() {
         properties = std::make_shared<ContinuousBatchingServableProperties>();
     }
 
+    // The addRequestToPipeline implementation can be specific to different servables built on the Continuous Batching engine.
+    // This method is used in scheduleExecution and MUST fill generationHandle in the executionContext.
+    virtual absl::Status addRequestToPipeline(std::shared_ptr<ContinuousBatchingServableExecutionContext>& executionContext);
+
     // Interface methods
     std::shared_ptr<GenAiServableExecutionContext> createExecutionContext() override;
     std::shared_ptr<GenAiServableProperties> getProperties() override;
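
The virtual hook above is the extension point the VLM servable builds on: scheduleExecution() stays in the base class and only delegates request admission to addRequestToPipeline(). The actual override lives in src/llm/visual_language_model/servable.cpp, which is not part of the excerpt shown here; the standalone sketch below (simplified, hypothetical types) only illustrates the pattern.

#include <iostream>
#include <memory>

// Simplified stand-ins; the real OVMS types carry pipeline handles, input ids, images, etc.
struct ExecContext { int generationHandle = -1; };

class BaseServable {
public:
    virtual ~BaseServable() = default;
    // Contract (mirroring servable.hpp): must fill generationHandle in the context.
    virtual bool addRequestToPipeline(std::shared_ptr<ExecContext>& ctx) {
        ctx->generationHandle = 1;  // text-only admission path
        return true;
    }
    // Shared scheduling logic: derived servables customize only how a request enters the pipeline.
    bool scheduleExecution(std::shared_ptr<ExecContext>& ctx) {
        if (!addRequestToPipeline(ctx))
            return false;
        std::cout << "scheduled, handle=" << ctx->generationHandle << "\n";
        return true;
    }
};

class VisualServableSketch : public BaseServable {
public:
    bool addRequestToPipeline(std::shared_ptr<ExecContext>& ctx) override {
        // Hypothetically: an admission path that also passes image tensors to the pipeline.
        ctx->generationHandle = 2;
        return true;
    }
};

int main() {
    auto ctx = std::make_shared<ExecContext>();
    VisualServableSketch vlm;
    vlm.scheduleExecution(ctx);  // prints: scheduled, handle=2
    return 0;
}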

src/llm/continuous_batching/servable_initializer.cpp

Lines changed: 0 additions & 4 deletions
@@ -68,8 +68,6 @@ Status ContinuousBatchingServableInitializer::initializeExperimental(std::shared
     if (!status.ok()) {
         return status;
     }
-
-    servable = std::make_shared<ContinuousBatchingServable>();
     auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
     properties->modelsPath = getBasePath();
 
@@ -132,8 +130,6 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
     if (!status.ok()) {
         return status;
     }
-
-    servable = std::make_shared<ContinuousBatchingServable>();
     auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
 
     properties->modelsPath = getBasePath();
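
Note: removing the make_shared calls here means the initializer no longer constructs the servable; it only populates one that the caller has already created. The caller now decides which concrete type to instantiate (ContinuousBatchingServable or VisualLanguageModelServable), as the initializeGenAiServable changes in src/llm/servable_initializer.cpp below show.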

src/llm/servable_initializer.cpp

Lines changed: 17 additions & 1 deletion
@@ -32,9 +32,10 @@
 #include "../logging.hpp"
 #include "../mediapipe_internal/mediapipe_utils.hpp"
 #include "../status.hpp"
+#include "continuous_batching/servable.hpp"
 #include "continuous_batching/servable_initializer.hpp"
-#include "servable.hpp"
 #include "servable_initializer.hpp"
+#include "visual_language_model/servable.hpp"
 
 namespace ovms {
 
@@ -146,19 +147,34 @@ Status initializeGenAiServable(std::shared_ptr<GenAiServable>& servable, const :
     Status status;
     if (nodeOptions.has_models_path()) { // Stable initialization
         if (nodeOptions.pipeline_type() == mediapipe::LLMCalculatorOptions::CONTINUOUS_BATCHING) {
+            SPDLOG_LOGGER_INFO(modelmanager_logger, "Initializing Continuous Batching servable");
             ContinuousBatchingServableInitializer cbServableInitializer;
+            servable = std::make_shared<ContinuousBatchingServable>();
             status = cbServableInitializer.initialize(servable, nodeOptions, graphPath);
             if (status != StatusCode::OK) {
                 SPDLOG_LOGGER_ERROR(modelmanager_logger, "Error during LLM node resources initialization: {}", status.string());
                 return status;
             }
+        } else if (nodeOptions.pipeline_type() == mediapipe::LLMCalculatorOptions::VISUAL_LANGUAGE_MODEL) {
+            // VLM uses CB engine, so initialization part is shared (both servables share the same properties),
+            // therefore we can use CB servable initializer to initialize VLM servable
+            SPDLOG_LOGGER_INFO(modelmanager_logger, "Initializing Visual Language Model servable");
+            ContinuousBatchingServableInitializer cbServableInitializer;
+            servable = std::make_shared<VisualLanguageModelServable>();
+            status = cbServableInitializer.initialize(servable, nodeOptions, graphPath);
+            if (status != StatusCode::OK) {
+                SPDLOG_LOGGER_ERROR(modelmanager_logger, "Error during LLM node resources initialization: {}", status.string());
+                return status;
+            }
+
         } else {
             SPDLOG_LOGGER_ERROR(modelmanager_logger, "LLM node options do not contain any recognized pipeline configuration.");
             return StatusCode::INTERNAL_ERROR;
         }
     } else {
         if (nodeOptions.has_continuous_batching_pipeline_config()) { // Experimental initialization
             ContinuousBatchingServableInitializer cbServableInitializer;
+            servable = std::make_shared<ContinuousBatchingServable>();
             status = cbServableInitializer.initializeExperimental(servable, nodeOptions, graphPath);
         } else {
             SPDLOG_LOGGER_ERROR(modelmanager_logger, "LLM node options do not contain any recognized pipeline configuration.");