Commit 9834f6b

Enable C++ only text generation (#3260)
1 parent b327445 commit 9834f6b

33 files changed: +389 −197 lines

.bazelrc

Lines changed: 7 additions & 2 deletions
@@ -242,8 +242,6 @@ build:windows --override_repository="boringssl=C:\\opt\\boringSSL-SwiftPM"
 
 # Windows config default flags
 build:windows --define=CLOUD_DISABLE=1
-build:windows --define=PYTHON_DISABLE=0
-build:windows --define=MEDIAPIPE_DISABLE=0
 # Change this path alongside WORKSPACE:"windows_openvino" path for model_api cmake compilation.
 build:windows --action_env OpenVINO_DIR="c:/opt/openvino/runtime/cmake"
 build:windows --action_env OpenCV_DIR="c:/opt/opencv"
@@ -252,6 +250,13 @@ build:windows --repo_env=HERMETIC_PYTHON_VERSION=3.12
 build:windows --define=USE_DROGON=1
 build:windows --define=GENAI_USE_BINARY=1
 
+build:win_mp_on_py_on --config=windows
+build:win_mp_on_py_on --define=MEDIAPIPE_DISABLE=0
+build:win_mp_on_py_on --define=PYTHON_DISABLE=0
+build:win_mp_on_py_off --config=windows
+build:win_mp_on_py_off --define=MEDIAPIPE_DISABLE=0
+build:win_mp_on_py_off --define=PYTHON_DISABLE=1
+
 # Tests settings ############################################################################################################################
 # sometimes failed logs exceed this threshold
 test --experimental_ui_max_stdouterr_bytes=304857600
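
The two new configs make the Python/MediaPipe combination explicit instead of baking it into `build:windows`. A minimal sketch of how they might be invoked, assuming `//src:ovms_test` (the target used by the CI script below) as the build target:

```bat
:: C++-only text generation: MediaPipe on, Python off -- a sketch, not from the diff
bazel build --config=win_mp_on_py_off //src:ovms_test

:: Full build: MediaPipe and Python both enabled
bazel build --config=win_mp_on_py_on //src:ovms_test
```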

ci/loadWin.groovy

Lines changed: 5 additions & 3 deletions
@@ -113,14 +113,15 @@ def clean() {
 }
 
 def build(){
-    def status = bat(returnStatus: true, script: 'windows_build.bat ' + get_short_bazel_path() + " //src:ovms_test")
+    def pythonOption = env.OVMS_PYTHON_ENABLED == "1" ? "--with_python" : "--no_python"
+    def status = bat(returnStatus: true, script: 'windows_build.bat ' + get_short_bazel_path() + ' ' + pythonOption + ' --with_tests')
     status = bat(returnStatus: true, script: 'grep "Build completed successfully" win_build.log"')
     if (status != 0) {
         error "Error: Windows build failed ${status}. Check win_build.log for details."
     } else {
         echo "Build successful."
     }
-    def status_pkg = bat(returnStatus: true, script: 'windows_create_package.bat ' + get_short_bazel_path())
+    def status_pkg = bat(returnStatus: true, script: 'windows_create_package.bat ' + get_short_bazel_path() + ' ' + pythonOption)
     if (status_pkg != 0) {
         error "Error: Windows package failed ${status_pkg}."
     } else {
@@ -129,7 +130,8 @@ def build(){
 }
 
 def unit_test(){
-    status = bat(returnStatus: true, script: 'windows_test.bat ' + get_short_bazel_path())
+    def pythonOption = env.OVMS_PYTHON_ENABLED == "1" ? "--with_python" : "--no_python"
+    status = bat(returnStatus: true, script: 'windows_test.bat ' + get_short_bazel_path() + ' ' + pythonOption)
     if (status != 0) {
         error "Error: Windows build test failed ${status}. Check win_build_test.log for details."
     } else {
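
For illustration, a hedged expansion of what these calls resolve to when `OVMS_PYTHON_ENABLED` is `0`; the bazel path is a placeholder for whatever `get_short_bazel_path()` returns, and only the flag names come from the diff:

```bat
:: Hypothetical CI invocation with OVMS_PYTHON_ENABLED=0
windows_build.bat C:\opt\bazel.exe --no_python --with_tests
windows_create_package.bat C:\opt\bazel.exe --no_python
windows_test.bat C:\opt\bazel.exe --no_python
```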

docs/deploying_server_baremetal.md

Lines changed: 11 additions & 11 deletions
@@ -6,12 +6,12 @@ To deploy Model Server on baremetal, use pre-compiled binaries for Ubuntu22, Ubu
 ::::{tab-set}
 :::{tab-item} Ubuntu 22.04
 :sync: ubuntu-22-04
-Download precompiled package (without python support):
+Download precompiled package (without python):
 ```{code} sh
 wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu22.tar.gz
 tar -xzvf ovms_ubuntu22.tar.gz
 ```
-or precompiled package (with python and LLM support):
+or precompiled package (with python):
 ```{code} sh
 wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu22_python_on.tar.gz
 tar -xzvf ovms_ubuntu22_python_on.tar.gz
@@ -25,7 +25,7 @@ Set path to the libraries and add binary to the `PATH`
 export LD_LIBRARY_PATH=${PWD}/ovms/lib
 export PATH=$PATH:${PWD}/ovms/bin
 ```
-In case of the version with python and LLM support run also:
+In case of the version with python run also:
 ```{code} sh
 export PYTHONPATH=${PWD}/ovms/lib/python
 sudo apt -y install libpython3.10
@@ -34,12 +34,12 @@ pip3 install "Jinja2==3.1.6" "MarkupSafe==3.0.2"
 :::
 :::{tab-item} Ubuntu 24.04
 :sync: ubuntu-24-04
-Download precompiled package (without python support):
+Download precompiled package (without python):
 ```{code} sh
 wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24.tar.gz
 tar -xzvf ovms_ubuntu24.tar.gz
 ```
-or precompiled package (with python and LLM support):
+or precompiled package (with python):
 ```{code} sh
 wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_ubuntu24_python_on.tar.gz
 tar -xzvf ovms_ubuntu24_python_on.tar.gz
@@ -53,7 +53,7 @@ Set path to the libraries and add binary to the `PATH`
 export LD_LIBRARY_PATH=${PWD}/ovms/lib
 export PATH=$PATH:${PWD}/ovms/bin
 ```
-In case of the version with python and LLM support run also:
+In case of the version with python run also:
 ```{code} sh
 export PYTHONPATH=${PWD}/ovms/lib/python
 sudo apt -y install libpython3.12
@@ -62,12 +62,12 @@ pip3 install "Jinja2==3.1.6" "MarkupSafe==3.0.2"
 :::
 :::{tab-item} RHEL 9.5
 :sync: rhel-9.5
-Download precompiled package (without python support):
+Download precompiled package (without python):
 ```{code} sh
 wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat.tar.gz
 tar -xzvf ovms_redhat.tar.gz
 ```
-or precompiled package (with python and LLM support):
+or precompiled package (with python):
 ```{code} sh
 wget https://github.com/openvinotoolkit/model_server/releases/download/v2025.1/ovms_redhat_python_on.tar.gz
 tar -xzvf ovms_redhat_python_on.tar.gz
@@ -81,7 +81,7 @@ Set path to the libraries and add binary to the `PATH`
 export LD_LIBRARY_PATH=${PWD}/ovms/lib
 export PATH=$PATH:${PWD}/ovms/bin
 ```
-In case of the version with python and LLM support run also:
+In case of the version with python run also:
 ```{code} sh
 export PYTHONPATH=${PWD}/ovms/lib/python
 sudo yum install -y python39-libs
@@ -111,9 +111,9 @@ Run `setupvars` script to set required environment variables.
 .\ovms\setupvars.ps1
 ```
 
-> **Note**: Running this script changes Python settings for the shell that runs it. Environment variables are set only for the current shell so make sure you rerun the script before using model server in a new shell.
+> **Note**: If the package contains Python, running this script changes Python settings for the shell that runs it. Environment variables are set only for the current shell, so make sure you rerun the script before using the model server in a new shell.
 
-> **Note**: When serving LLM models, OVMS uses Python's Jinja package to apply chat template. Please ensure you have Windows "Beta Unicode UTF-8 for worldwide language support" enabled. [Instruction](llm_utf8_troubleshoot.png)
+> **Note**: If the package contains Python, OVMS uses Python's Jinja package to apply the chat template when serving LLMs. In that case, please ensure you have Windows "Beta Unicode UTF-8 for worldwide language support" enabled. [Instruction](llm_utf8_troubleshoot.png)
 
 You can also build model server from source by following the [developer guide](windows_developer_guide.md).

setupvars.bat

Lines changed: 6 additions & 2 deletions
@@ -15,6 +15,10 @@
 ::
 @echo off
 set "OVMS_DIR=%~dp0"
-set "PYTHONHOME=%OVMS_DIR%\python"
-set "PATH=%OVMS_DIR%;%PYTHONHOME%;%PATH%"
+if exist "%OVMS_DIR%\python" (
+    set "PYTHONHOME=%OVMS_DIR%\python"
+    set "PATH=%OVMS_DIR%;%PYTHONHOME%;%PATH%"
+) else (
+    set "PATH=%OVMS_DIR%;%PATH%"
+)
 echo "OpenVINO Model Server Environment Initialized"

setupvars.ps1

Lines changed: 6 additions & 2 deletions
@@ -15,6 +15,10 @@
 #
 
 $env:OVMS_DIR=$PSScriptRoot
-$env:PYTHONHOME="$env:OVMS_DIR\python"
-$env:PATH="$env:OVMS_DIR;$env:PYTHONHOME;$env:PATH"
+if (Test-Path "$env:OVMS_DIR\python") {
+    $env:PYTHONHOME="$env:OVMS_DIR\python"
+    $env:PATH="$env:OVMS_DIR;$env:PYTHONHOME;$env:PATH"
+} else {
+    $env:PATH="$env:OVMS_DIR;$env:PATH"
+}
 echo "OpenVINO Model Server Environment Initialized"

src/BUILD

Lines changed: 8 additions & 12 deletions
@@ -603,18 +603,15 @@ cc_library(
     deps = select({
         "//:not_disable_python": [
             "//src/python:libovmspythonmodule",
-            # Jinja template processing is done in Python
-            "//src/llm:llmcalculator",
-            "//src/llm:genai_servables",
-            "//src/llm:text_processor",
         ],
         "//:disable_python": []
     }) + select({
         "//conditions:default": [],
         "//:not_disable_mediapipe" : [
             "//src/llm:openai_completions_api_handler",
             "//src/embeddings:embeddingscalculator",
-            "//src/rerank:rerankcalculator",],
+            "//src/rerank:rerankcalculator",
+            "//src/llm:llmcalculator",],
     }) + select({
         "//:enable_drogon": [
             "libdrogon_http_server",
@@ -2804,6 +2801,11 @@ cc_test(
         "test/mediapipe_framework_test.cpp",
         "test/http_openai_handler_test.cpp",
         "test/multipart_calculator_test.cpp",
+        "test/llm/llmnode_test.cpp",
+        "test/llm/max_model_length_test.cpp",
+        "test/llm/text_streamer_test.cpp",
+        "test/llm/visual_language_model/complete_flow_test.cpp",
+        "test/llm/visual_language_model/initialization_test.cpp",
     ],
     "//:disable_mediapipe" : [
         "test/disabled_mediapipe_test.cpp",
@@ -2813,14 +2815,8 @@ cc_test(
         # OvmsPyTensor is currently not used in OVMS core and is just a base for the binding.
         # "test/python/ovms_py_tensor_test.cpp",
         "test/pythonnode_test.cpp",
-        # LLM logic uses Python for processing Jinja templates
-        "test/llm/llmnode_test.cpp",
-        "test/llm/assisted_decoding_test.cpp",
-        "test/llm/max_model_length_test.cpp",
+        # LLM logic uses Python for processing Jinja templates when built with Python enabled
         "test/llm/llmtemplate_test.cpp",
-        "test/llm/text_streamer_test.cpp",
-        "test/llm/visual_language_model/complete_flow_test.cpp",
-        "test/llm/visual_language_model/initialization_test.cpp",
     ],
     "//:disable_python" : [],
 }) + select({
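
With these selects, the LLM tests listed under `//:not_disable_mediapipe` now build and run even when Python is disabled. A sketch of exercising that locally, combining the config added to `.bazelrc` above with this test target — illustrative, not taken from the diff:

```bat
:: Run the test suite in the C++-only configuration
bazel test --config=win_mp_on_py_off //src:ovms_test
```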

src/llm/BUILD

Lines changed: 33 additions & 15 deletions
@@ -16,9 +16,9 @@
 
 load("@mediapipe//mediapipe/framework/port:build_config.bzl", "mediapipe_cc_proto_library", "mediapipe_proto_library")
 load("//:common_settings.bzl",
-     "COMMON_STATIC_LIBS_COPTS", "COMMON_STATIC_LIBS_LINKOPTS", "COMMON_FUZZER_COPTS", "COMMON_FUZZER_LINKOPTS", "COMMON_LOCAL_DEFINES", "PYBIND_DEPS")
+     "COMMON_STATIC_LIBS_COPTS", "COMMON_STATIC_LIBS_LINKOPTS", "COMMON_FUZZER_COPTS", "COMMON_FUZZER_LINKOPTS", "COMMON_LOCAL_DEFINES", "PYBIND_DEPS", "COPTS_PYTHON")
 
-COPTS_ADJUSTED = COMMON_STATIC_LIBS_COPTS + select({
+COPTS_ADJUSTED = COMMON_STATIC_LIBS_COPTS + COPTS_PYTHON + select({
     "//conditions:default": [],
     "//:fuzzer_build" : COMMON_FUZZER_COPTS,
 })
@@ -93,13 +93,30 @@ cc_library(
 
 cc_library(
     name = "genai_servables",
-    hdrs = ["servable.hpp", "servable_initializer.hpp",
-        "language_model/continuous_batching/servable.hpp", "language_model/continuous_batching/llm_executor.hpp", "language_model/continuous_batching/servable_initializer.hpp",
-        "visual_language_model/continuous_batching/servable.hpp", "language_model/legacy/servable.hpp", "language_model/legacy/servable_initializer.hpp", "language_model/legacy/legacy_executor.hpp",
-        "visual_language_model/legacy/servable.hpp", "visual_language_model/legacy/servable_initializer.hpp", "visual_language_model/legacy/legacy_executor.hpp"],
-    srcs = ["servable.cpp", "servable_initializer.cpp", "language_model/continuous_batching/servable.cpp", "language_model/continuous_batching/servable_initializer.cpp",
-        "visual_language_model/continuous_batching/servable.cpp", "language_model/legacy/servable.cpp", "language_model/legacy/servable_initializer.cpp", "language_model/legacy/legacy_executor.cpp",
-        "visual_language_model/legacy/servable.cpp", "visual_language_model/legacy/servable_initializer.cpp", "visual_language_model/legacy/legacy_executor.cpp"],
+    hdrs = ["servable.hpp",
+        "servable_initializer.hpp",
+        "language_model/continuous_batching/servable.hpp",
+        "language_model/continuous_batching/llm_executor.hpp",
+        "language_model/continuous_batching/servable_initializer.hpp",
+        "visual_language_model/continuous_batching/servable.hpp",
+        "language_model/legacy/servable.hpp",
+        "language_model/legacy/servable_initializer.hpp",
+        "language_model/legacy/legacy_executor.hpp",
+        "visual_language_model/legacy/servable.hpp",
+        "visual_language_model/legacy/servable_initializer.hpp",
+        "visual_language_model/legacy/legacy_executor.hpp",
+        "text_utils.hpp"],
+    srcs = ["servable.cpp",
+        "servable_initializer.cpp",
+        "language_model/continuous_batching/servable.cpp",
+        "language_model/continuous_batching/servable_initializer.cpp",
+        "visual_language_model/continuous_batching/servable.cpp",
+        "language_model/legacy/servable.cpp",
+        "language_model/legacy/servable_initializer.cpp",
+        "language_model/legacy/legacy_executor.cpp",
+        "visual_language_model/legacy/servable.cpp",
+        "visual_language_model/legacy/servable_initializer.cpp",
+        "visual_language_model/legacy/legacy_executor.cpp"],
     deps = [
         "//third_party:openvino",
         "@mediapipe//mediapipe/framework:calculator_framework",
@@ -111,14 +128,15 @@ cc_library(
         "//src:libovmsprofiler",
         "//src:libovmsfilesystem",
         "llmcalculator_cc_proto",
-        "//src/python:utils",
-        ":text_processor",
         ":openai_completions_api_handler",
         "//src:httppayload",
         "//src:libhttpclientconnection",
-    ] + PYBIND_DEPS + select({
+    ] + select({
        "//conditions:default": ["//third_party:genai", ":llm_engine"],
        "//:not_genai_bin" : [":llm_engine"],
+    }) + select({
+        "//:disable_python": [],
+        "//:not_disable_python" : [":py_jinja_template_processor"],
     }),
     visibility = ["//visibility:public"],
     local_defines = COMMON_LOCAL_DEFINES,
@@ -128,9 +146,9 @@ cc_library(
 )
 
 cc_library(
-    name = "text_processor",
-    hdrs = ["text_processor.hpp"],
-    srcs = ["text_processor.cpp"],
+    name = "py_jinja_template_processor",
+    hdrs = ["py_jinja_template_processor.hpp"],
+    srcs = ["py_jinja_template_processor.cpp"],
     deps = ["@mediapipe//mediapipe/framework:calculator_framework",
         "//third_party:openvino",
         "//src:libovmslogging",

src/llm/http_llm_calculator.cc

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@
 #pragma warning(pop)
 
 #include "../http_payload.hpp"
+#include "../logging.hpp"
 #include "../profiler.hpp"
 #include "apis/openai_completions.hpp"
 #include "servable.hpp"

src/llm/language_model/continuous_batching/servable.cpp

Lines changed: 4 additions & 1 deletion
@@ -33,7 +33,10 @@
 #include "../../../http_payload.hpp"
 #include "../../../mediapipe_internal/mediapipe_utils.hpp"
 #include "../../apis/openai_completions.hpp"
-#include "../../text_processor.hpp"
+#include "../../text_utils.hpp"
+#if (PYTHON_DISABLE == 0)
+#include "../../py_jinja_template_processor.hpp"
+#endif
 #include "llm_executor.hpp"
 #include "servable.hpp"

src/llm/language_model/continuous_batching/servable_initializer.cpp

Lines changed: 6 additions & 6 deletions
@@ -57,15 +57,12 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
         return status;
     }
     auto properties = std::static_pointer_cast<ContinuousBatchingServableProperties>(servable->getProperties());
-
     properties->modelsPath = parsedModelsPath;
-
     properties->schedulerConfig.max_num_batched_tokens = nodeOptions.max_num_batched_tokens();
     properties->schedulerConfig.cache_size = nodeOptions.cache_size();
     properties->schedulerConfig.dynamic_split_fuse = nodeOptions.dynamic_split_fuse();
     properties->schedulerConfig.max_num_seqs = nodeOptions.max_num_seqs();
     properties->schedulerConfig.enable_prefix_caching = nodeOptions.enable_prefix_caching();
-
     properties->device = nodeOptions.device();
 
     if (!nodeOptions.draft_models_path().empty()) {
@@ -91,7 +88,6 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
         }
 
     } else if (nodeOptions.has_draft_max_num_batched_tokens() || nodeOptions.has_draft_cache_size() || nodeOptions.has_draft_dynamic_split_fuse() || nodeOptions.has_draft_max_num_seqs() || nodeOptions.has_draft_block_size() || nodeOptions.has_draft_device()) {
-        // Consider moving draft parameters to separate structure in node options, so it's validated on the proto level
         SPDLOG_ERROR("Draft model path is not provided, but draft scheduler options are set.");
         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
     }
@@ -116,14 +112,18 @@ Status ContinuousBatchingServableInitializer::initialize(std::shared_ptr<GenAiSe
         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
     }
 
-    loadTextProcessor(properties, parsedModelsPath);
+#if (PYTHON_DISABLE == 0)
+    loadPyTemplateProcessor(properties, parsedModelsPath);
+#else
+    loadDefaultTemplateProcessorIfNeeded(properties);
+#endif
     if (nodeOptions.has_max_tokens_limit()) {
         properties->maxTokensLimit = nodeOptions.max_tokens_limit();
     }
     properties->bestOfLimit = nodeOptions.best_of_limit();
     properties->maxModelLength = parseMaxModelLength(parsedModelsPath);
-
     properties->llmExecutorWrapper = std::make_shared<LLMExecutorWrapper>(properties->pipeline);
+
     return StatusCode::OK;
 }
src/llm/language_model/legacy/servable.cpp

Lines changed: 4 additions & 1 deletion
@@ -33,7 +33,10 @@
 #include "../../../http_payload.hpp"
 #include "../../../mediapipe_internal/mediapipe_utils.hpp"
 #include "../../apis/openai_completions.hpp"
-#include "../../text_processor.hpp"
+#include "../../text_utils.hpp"
+#if (PYTHON_DISABLE == 0)
+#include "../../py_jinja_template_processor.hpp"
+#endif
 #include "servable.hpp"
 
 namespace ovms {

src/llm/language_model/legacy/servable_initializer.cpp

Lines changed: 5 additions & 2 deletions
@@ -95,8 +95,11 @@ Status LegacyServableInitializer::initialize(std::shared_ptr<GenAiServable>& ser
         SPDLOG_ERROR("Error during llm node initialization for models_path: {}", parsedModelsPath);
         return StatusCode::LLM_NODE_RESOURCE_STATE_INITIALIZATION_FAILED;
     }
-
-    loadTextProcessor(properties, parsedModelsPath);
+#if (PYTHON_DISABLE == 0)
+    loadPyTemplateProcessor(properties, parsedModelsPath);
+#else
+    loadDefaultTemplateProcessorIfNeeded(properties);
+#endif
     properties->legacyExecutor = std::make_shared<LegacyExecutorWrapper>(properties->pipeline);
     if (nodeOptions.has_max_tokens_limit()) {
         properties->maxTokensLimit = nodeOptions.max_tokens_limit();
