diff --git a/Makefile b/Makefile
index 45ae083ff0..9d986a5433 100644
--- a/Makefile
+++ b/Makefile
@@ -74,9 +74,9 @@ FUZZER_BUILD ?= 0
 # NOTE: when changing any value below, you'll need to adjust WORKSPACE file by hand:
 #  - uncomment source build section, comment binary section
 #  - adjust binary version path - version variable is not passed to WORKSPACE file!
-OV_SOURCE_BRANCH ?= c5137ff870dc3386bfd4692176e8d6725a17b99a # master / 2024-12-16
+OV_SOURCE_BRANCH ?= 552ba6682b40739fa0a9b8ee5a28675d9874fe34 # master / 2025-01-06
 OV_CONTRIB_BRANCH ?= c39462ca8d7c550266dc70cdbfbe4fc8c5be0677 # master / 2024-10-31
-OV_TOKENIZERS_BRANCH ?= bcfd3eda25ae3ec423502a4074e35c774506c732 # master / 2024-12-16
+OV_TOKENIZERS_BRANCH ?= cd9fb9b7fe4085fe74edbff54a1b70fa9dd11423 # master / 2024-12-27
 
 OV_SOURCE_ORG ?= openvinotoolkit
 OV_CONTRIB_ORG ?= openvinotoolkit
@@ -170,7 +170,7 @@ ifeq ($(findstring ubuntu,$(BASE_OS)),ubuntu)
 else ifeq ($(BASE_OS_TAG),22.04)
 OS=ubuntu22
 INSTALL_DRIVER_VERSION ?= "24.39.31294"
-DLDT_PACKAGE_URL ?= https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17638-c5137ff870d/l_openvino_toolkit_ubuntu22_2025.0.0.dev20241217_x86_64.tgz
+DLDT_PACKAGE_URL ?= https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.0.0-17757-552ba6682b4/l_openvino_toolkit_ubuntu22_2025.0.0.dev20250106_x86_64.tgz
 endif
 endif
 ifeq ($(BASE_OS),redhat)
diff --git a/src/llm/apis/openai_completions.cpp b/src/llm/apis/openai_completions.cpp
index 5eaad4ccf3..8aab15d156 100644
--- a/src/llm/apis/openai_completions.cpp
+++ b/src/llm/apis/openai_completions.cpp
@@ -516,8 +516,8 @@ void OpenAIChatCompletionsHandler::incrementProcessedTokens(int numTokens) {
     usage.completionTokens += numTokens;
 }
 
-ov::genai::GenerationConfig OpenAIChatCompletionsHandler::createGenerationConfig() const {
-    return request.createGenerationConfig();
+ov::genai::GenerationConfig OpenAIChatCompletionsHandler::createGenerationConfig(const ov::genai::AdapterConfig& adapters) const {
+    return request.createGenerationConfig(adapters);
 }
 
 absl::Status OpenAIChatCompletionsHandler::parseRequest(uint32_t maxTokensLimit, uint32_t bestOfLimit) {
diff --git a/src/llm/apis/openai_completions.hpp b/src/llm/apis/openai_completions.hpp
index bd4130d951..9256093534 100644
--- a/src/llm/apis/openai_completions.hpp
+++ b/src/llm/apis/openai_completions.hpp
@@ -86,8 +86,9 @@ struct OpenAIChatCompletionsRequest {
     OpenAIChatCompletionsRequest() = default;
     ~OpenAIChatCompletionsRequest() = default;
 
-    ov::genai::GenerationConfig createGenerationConfig() const {
+    ov::genai::GenerationConfig createGenerationConfig(const ov::genai::AdapterConfig& adapters) const {
         ov::genai::GenerationConfig config;
+        config.adapters = adapters;
 
         // Generic
         if (maxTokens.has_value())
@@ -140,7 +141,6 @@ struct OpenAIChatCompletionsRequest {
         if (logprobschat || logprobs > 0)
             config.logprobs = 1;
 
-
         return config;
     }
 };
@@ -181,7 +181,7 @@ class OpenAIChatCompletionsHandler {
 
     void incrementProcessedTokens(int numTokens = 1);
 
-    ov::genai::GenerationConfig createGenerationConfig() const;
+    ov::genai::GenerationConfig createGenerationConfig(const ov::genai::AdapterConfig& adapters) const;
 
     absl::Status parseRequest(uint32_t maxTokensLimit, uint32_t bestOfLimit);
     absl::Status parseMessages();
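The change above threads the node-level AdapterConfig into every request's generation config. A minimal standalone sketch of that wiring, assuming only the openvino.genai types already used in this patch (makeRequestConfig and the max_new_tokens value are illustrative, not part of the change):

#include <openvino/genai/generation_config.hpp>
#include <openvino/genai/lora_adapter.hpp>

// Sketch of the per-request wiring added above: the node-level adapters are
// copied into the generation config before request-specific fields are set.
ov::genai::GenerationConfig makeRequestConfig(const ov::genai::AdapterConfig& adapters) {
    ov::genai::GenerationConfig config;
    config.adapters = adapters;   // same assignment as in createGenerationConfig
    config.max_new_tokens = 128;  // illustrative; the real code maps OpenAI request fields
    return config;
}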
diff --git a/src/llm/http_llm_calculator.cc b/src/llm/http_llm_calculator.cc
index 565326fa5f..05993315ef 100644
--- a/src/llm/http_llm_calculator.cc
+++ b/src/llm/http_llm_calculator.cc
@@ -166,11 +166,14 @@ class HttpLLMCalculator : public CalculatorBase {
                 ov::Tensor finalPromptIds = nodeResources->cbPipe->get_tokenizer().encode(finalPrompt, ov::genai::add_special_tokens(encodeAddSpecialTokens)).input_ids;
                 this->apiHandler->setPromptTokensUsage(finalPromptIds.get_size());
                 SPDLOG_LOGGER_TRACE(llm_calculator_logger, "{}", getPromptTokensString(finalPromptIds));
+
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Adapters count: {}", nodeResources->adapters.get_adapters().size());
+                SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Adapters mode: {}", static_cast<int>(nodeResources->adapters.get_mode()));
                 this->generationHandle = nodeResources->cbPipe->add_request(
                     currentRequestId++, /*to be removed from API?*/
                     finalPromptIds,
-                    this->apiHandler->createGenerationConfig());
+                    this->apiHandler->createGenerationConfig(nodeResources->adapters));
 
                 // TODO: Revert when drogon adds disconnection callbacks: https://github.com/drogonframework/drogon/pull/2204
                 // this->client->registerDisconnectionCallback([genHandle = this->generationHandle]() {
diff --git a/src/llm/llm_calculator.proto b/src/llm/llm_calculator.proto
index 43c7c031fb..8c218e38d8 100644
--- a/src/llm/llm_calculator.proto
+++ b/src/llm/llm_calculator.proto
@@ -25,6 +25,11 @@ message LLMCalculatorOptions {
         // no rule to obtain this
         optional LLMCalculatorOptions ext = 113473750;
     }
+
+    message AdapterConfig {
+        required string lora_path = 1;
+        optional float alpha = 2 [default = 1];
+    }
     required string models_path = 1;
 
     optional uint64 max_num_batched_tokens = 2 [default = 256];
@@ -45,4 +50,6 @@ message LLMCalculatorOptions {
     optional uint32 max_tokens_limit = 9 [default = 4096];
 
     optional bool enable_prefix_caching = 10 [default = false];
+
+    repeated AdapterConfig adapter_configs = 11;
 }
\ No newline at end of file
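With the new repeated adapter_configs field, a graph.pbtxt node could hypothetically configure LoRA adapters as below (paths and alpha values are illustrative; a relative lora_path is resolved against the graph directory, and alpha defaults to 1):

node_options: {
    [type.googleapis.com / mediapipe.LLMCalculatorOptions]: {
        models_path: "./model"
        adapter_configs: {
            lora_path: "adapters/my_adapter.safetensors"
            alpha: 0.8
        }
        adapter_configs: {
            lora_path: "/mnt/adapters/other_adapter.safetensors"  # alpha defaults to 1
        }
    }
}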
diff --git a/src/llm/llmnoderesources.cpp b/src/llm/llmnoderesources.cpp
index 3f13588a7e..55126c0449 100644
--- a/src/llm/llmnoderesources.cpp
+++ b/src/llm/llmnoderesources.cpp
@@ -175,6 +175,40 @@ Status LLMNodeResources::initializeLLMNodeResources(LLMNodeResources& nodeResour
     nodeResources.maxTokensLimit = nodeOptions.max_tokens_limit();
     nodeResources.bestOfLimit = nodeOptions.best_of_limit();
 
+    for (const auto& adapterConfig : nodeOptions.adapter_configs()) {
+        float alpha = adapterConfig.alpha();
+        if (adapterConfig.lora_path().empty()) {
+            SPDLOG_LOGGER_ERROR(modelmanager_logger, "Adapter lora_path is empty");
+            return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
+        }
+        auto fsLoraPath = std::filesystem::path(adapterConfig.lora_path());
+        std::string lora_path;
+        if (fsLoraPath.is_relative()) {
+            lora_path = (std::filesystem::path(graphPath) / fsLoraPath).string();
+        } else {
+            lora_path = fsLoraPath.string();
+        }
+        SPDLOG_LOGGER_DEBUG(modelmanager_logger, "Adapter path: {}, alpha: {}", lora_path, alpha);
+        if (!std::filesystem::exists(lora_path)) {
+            SPDLOG_LOGGER_ERROR(modelmanager_logger, "Adapter path does not exist: {}", lora_path);
+            return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
+        }
+        if (!std::filesystem::is_regular_file(lora_path)) {
+            SPDLOG_LOGGER_ERROR(modelmanager_logger, "Adapter path is not a regular file: {}", lora_path);
+            return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
+        }
+        try {
+            nodeResources.adapters.add(ov::genai::Adapter(std::filesystem::path(lora_path)), alpha);
+            SPDLOG_LOGGER_INFO(modelmanager_logger, "Adapter loaded from path {} with alpha {}", lora_path, alpha);
+        } catch (const std::exception& e) {
+            SPDLOG_LOGGER_ERROR(modelmanager_logger, "Error loading adapter from path {}: {}", lora_path, e.what());
+            return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
+        } catch (...) {
+            SPDLOG_LOGGER_ERROR(modelmanager_logger, "Unknown error loading adapter from path {}", lora_path);
+            return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
+        }
+    }
+
     nodeResources.initiateGeneration();
 
     return StatusCode::OK;
@@ -186,6 +220,8 @@ void LLMNodeResources::initializeContinuousBatchingPipeline(
     const std::string& device,
     const plugin_config_t& pluginConfig,
     const plugin_config_t& tokenizerPluginConfig) {
+    // Adapters are applied per request through GenerationConfig
+    // (see createGenerationConfig), not at pipeline construction time.
     this->cbPipe = std::make_unique<ov::genai::ContinuousBatchingPipeline>(basePath, schedulerConfig, device, pluginConfig, tokenizerPluginConfig);
 }
 
diff --git a/src/llm/llmnoderesources.hpp b/src/llm/llmnoderesources.hpp
index debe5942e1..111ca96930 100644
--- a/src/llm/llmnoderesources.hpp
+++ b/src/llm/llmnoderesources.hpp
@@ -24,6 +24,7 @@
 #include <...>
 #include <...>
+#include <openvino/genai/lora_adapter.hpp>
 #include <...>
 
 #pragma GCC diagnostic push
@@ -110,6 +111,7 @@ struct LLMNodeResources {
     plugin_config_t pluginConfig;
     ov::genai::SchedulerConfig schedulerConfig;
     TextProcessor textProcessor;
+    ov::genai::AdapterConfig adapters;
     int maxTokensLimit;
     int bestOfLimit;
diff --git a/third_party/llm_engine/llm_engine.bzl b/third_party/llm_engine/llm_engine.bzl
index 74807cccaf..b0c583e0ad 100644
--- a/third_party/llm_engine/llm_engine.bzl
+++ b/third_party/llm_engine/llm_engine.bzl
@@ -24,7 +24,7 @@ def llm_engine():
     new_git_repository(
         name = "llm_engine",
         remote = "https://github.com/openvinotoolkit/openvino.genai",
-        commit = "973b26b2b1fed25b878ea6108b4d7c5ae825dc12", # master / Dec 17
+        commit = "cb6b68e25027de0d39116de998391e436681ec1e", # master / Jan 5
         build_file = "@_llm_engine//:BUILD",
         init_submodules = True,
         recursive_init_submodules = True,
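The adapter path resolution added in llmnoderesources.cpp (a relative lora_path is joined with the graph directory, an absolute one is used verbatim) can be exercised in isolation. A small sketch; resolveAdapterPath is a hypothetical helper, not part of this patch:

#include <filesystem>
#include <iostream>

// Hypothetical helper mirroring the resolution logic in
// LLMNodeResources::initializeLLMNodeResources: relative paths are anchored
// at the graph directory, absolute paths are kept unchanged.
std::filesystem::path resolveAdapterPath(const std::filesystem::path& graphPath,
                                         const std::filesystem::path& loraPath) {
    return loraPath.is_relative() ? graphPath / loraPath : loraPath;
}

int main() {
    // Relative input is joined with the graph directory.
    std::cout << resolveAdapterPath("/workspace/graph", "adapters/my_adapter.safetensors") << '\n';
    // Absolute input is returned unchanged.
    std::cout << resolveAdapterPath("/workspace/graph", "/mnt/adapters/other_adapter.safetensors") << '\n';
    return 0;
}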