diff --git a/demos/code_local_assistant/README.md b/demos/code_local_assistant/README.md
index 15cbc0a183..740f467fe1 100644
--- a/demos/code_local_assistant/README.md
+++ b/demos/code_local_assistant/README.md
@@ -22,7 +22,7 @@ mkdir models
 Export `codellama/CodeLlama-7b-Instruct-hf`:
 ```console
-python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
+python export_model.py text_generation --source_model codellama/CodeLlama-7b-Instruct-hf --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --task text_generation --target_device NPU --overwrite_models
 ```

 > **Note:** Use `--target_device GPU` for Intel GPU or omit this parameter to run on Intel CPU
@@ -34,7 +34,7 @@ Code completion works in non-streaming, unary mode. Do not use instruct model, t
 Export `Qwen/Qwen2.5-Coder-1.5B`:
 ```console
-python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --target_device NPU --overwrite_models
+python export_model.py text_generation --source_model Qwen/Qwen2.5-Coder-1.5B --weight-format int4 --config_file_path models/config_all.json --model_repository_path models --task text_generation --target_device NPU --overwrite_models
 ```

 Examine that the workspace is set up properly in `models/config_all.json`:
diff --git a/demos/continuous_batching/rag/README.md b/demos/continuous_batching/rag/README.md
index 3fac03d741..61144f0c9d 100644
--- a/demos/continuous_batching/rag/README.md
+++ b/demos/continuous_batching/rag/README.md
@@ -10,7 +10,7 @@
 ```bash
 mkdir models
-docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov
+docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation
 docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings
 docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/model_server:latest --pull --model_repository_path /models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank
@@ -25,9 +25,10 @@ docker run --user $(id -u):$(id -g) --rm -v $(pwd)/models:/models:rw openvino/mo
 ```bat
 mkdir models
-ovms --pull --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov
-ovms --pull --model_repository_path models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings
-ovms --pull --model_repository_path models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank
+
+ovms.exe --pull --model_repository_path models --source_model OpenVINO/Qwen3-8B-int4-ov --task text_generation
+ovms.exe --pull --model_repository_path models --source_model OpenVINO/bge-base-en-v1.5-fp16-ov --task embeddings
+ovms.exe --pull --model_repository_path models --source_model OpenVINO/bge-reranker-base-fp16-ov --task rerank
 ovms --add_to_config models --model_name OpenVINO/Qwen3-8B-int4-ov --model_path OpenVINO/Qwen3-8B-int4-ov
 ovms --add_to_config models --model_name OpenVINO/bge-base-en-v1.5-fp16-ov --model_path OpenVINO/bge-base-en-v1.5-fp16-ov
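With the three models pulled and registered, the server can then be started against that repository. A minimal sketch, assuming `--add_to_config models` wrote its entries into `models/config.json` and that port 8000 is free:

```bat
ovms --rest_port 8000 --config_path models/config.json
```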
diff --git a/demos/continuous_batching/vlm/README.md b/demos/continuous_batching/vlm/README.md
index b642de9b00..f09d5f020e 100644
--- a/demos/continuous_batching/vlm/README.md
+++ b/demos/continuous_batching/vlm/README.md
@@ -30,7 +30,7 @@ Select deployment option depending on how you prepared models in the previous st
 Running this command starts the container with CPU only target device:
 ```bash
 mkdir -p models
-docker run -d -u $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path /models --model_name OpenGVLab/InternVL2-2B --pipeline_type VLM
+docker run -d -u $(id -u):$(id -g) --rm -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path /models --model_name OpenGVLab/InternVL2-2B --task text_generation --pipeline_type VLM
 ```

 **GPU**
@@ -39,7 +39,7 @@ to `docker run` command, use the image with GPU support.
 It can be applied using the commands below:
 ```bash
 mkdir -p models
-docker run -d -u $(id -u):$(id -g) --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path models --model_name OpenGVLab/InternVL2-2B --target_device GPU --pipeline_type VLM
+docker run -d -u $(id -u):$(id -g) --rm -p 8000:8000 --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render* | head -n 1) -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path models --model_name OpenGVLab/InternVL2-2B --task text_generation --target_device GPU --pipeline_type VLM
 ```
 :::
@@ -49,11 +49,11 @@ If you run on GPU make sure to have appropriate drivers installed, so the device
 ```bat
 mkdir models
-ovms --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path models --model_name OpenGVLab/InternVL2-2B --pipeline_type VLM --target_device CPU
+ovms --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path models --model_name OpenGVLab/InternVL2-2B --task text_generation --pipeline_type VLM --target_device CPU
 ```
 or
 ```bat
-ovms --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path models --model_name OpenGVLab/InternVL2-2B --pipeline_type VLM --target_device GPU
+ovms --rest_port 8000 --source_model OpenVINO/InternVL2-2B-int4-ov --model_repository_path models --model_name OpenGVLab/InternVL2-2B --task text_generation --pipeline_type VLM --target_device GPU
 ```
 :::
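Once the container or binary above is serving the model, the VLM pipeline can be exercised through the OpenAI-style chat API. A minimal sketch: the `/v3/chat/completions` path follows the model server's OpenAI-compatible API, port 8000 and the model name come from the commands above, and `<BASE64_IMAGE>` is a placeholder to replace with your own encoded image:

```bash
curl http://localhost:8000/v3/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "OpenGVLab/InternVL2-2B",
    "messages": [
      {
        "role": "user",
        "content": [
          {"type": "text", "text": "Describe this image."},
          {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<BASE64_IMAGE>"}}
        ]
      }
    ],
    "max_tokens": 100
  }'
```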
diff --git a/demos/llm_npu/README.md b/demos/llm_npu/README.md
index 252f3666b0..c3592c1277 100644
--- a/demos/llm_npu/README.md
+++ b/demos/llm_npu/README.md
@@ -38,7 +38,7 @@ Run `export_model.py` script to download and quantize the model:

 **LLM**
 ```console
-python export_model.py text_generation --source_model meta-llama/Llama-3.1-8B-Instruct --target_device NPU --config_file_path models/config.json --ov_cache_dir ./models/.ov_cache --model_repository_path models --overwrite_models
+python export_model.py text_generation --source_model meta-llama/Llama-3.1-8B-Instruct --target_device NPU --config_file_path models/config.json --ov_cache_dir ./models/.ov_cache --model_repository_path models --task text_generation --overwrite_models
 ```

 **Note:** The parameter `--ov_cache_dir` stores the model compilation cache to speed up initialization on subsequent startups. Drop this parameter if you don't want to store the compilation cache.
diff --git a/docs/llm/quickstart.md b/docs/llm/quickstart.md
index 2a4e9f3737..f203365cad 100644
--- a/docs/llm/quickstart.md
+++ b/docs/llm/quickstart.md
@@ -19,7 +19,7 @@ You can use another model from [OpenVINO organization on HuggingFace](https://hu
 ```bash
 mkdir models
-docker run --user $(id -u):$(id -g) -d --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render*) --rm -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --rest_port 8000 --target_device GPU --cache_size 2
+docker run --user $(id -u):$(id -g) -d --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render*) --rm -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --task text_generation --rest_port 8000 --target_device GPU --cache_size 2
 ```
 :::
@@ -27,7 +27,7 @@ docker run --user $(id -u):$(id -g) -d --device /dev/dri --group-add=$(stat -c "
 **Required:** OpenVINO Model Server package - see [deployment instructions](../deploying_server_baremetal.md) for details.

 ```bat
-ovms.exe --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --rest_port 8000 --target_device GPU --cache_size 2
+ovms.exe --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --rest_port 8000 --task text_generation --target_device GPU --cache_size 2
 ```
 :::
 ::::
diff --git a/docs/parameters.md b/docs/parameters.md
index e663e58c32..42ed4098c9 100644
--- a/docs/parameters.md
+++ b/docs/parameters.md
@@ -78,12 +78,12 @@ Shared configuration options for the pull, and pull & start mode. In the presenc
 | Option | Value format | Description |
 |-----------------------------|--------------|---------------------------------------------------------------------------------------------------------------|
-| `--pull` | `NA` | Runs the server in pull mode to download the model from the Hugging Face repository. |
-| `--source_model` | `string` | Name of the model in the Hugging Face repository. If not set, `model_name` is used. |
+| `--pull` | `NA` | Runs the server in pull mode to download the model from the Hugging Face repository. |
+| `--source_model` | `string` | Name of the model in the Hugging Face repository. Required; if not set, `model_name` is used. |
 | `--model_repository_path` | `string` | Directory where all required model files will be saved. |
 | `--model_name` | `string` | Name of the model as exposed externally by the server. |
-| `--target_device` | `string` | Device name to be used to execute inference operations. Accepted values are: `"CPU"/"GPU"/"MULTI"/"HETERO"` |
-| `--task` | `string` | Task type the model will support (`text_generation`, `embedding`, `rerank`, `image_generation`). Default: `text_generation` |
+| `--target_device` | `string` | Device name to be used to execute inference operations. Accepted values are: `"CPU"/"GPU"/"MULTI"/"HETERO"` |
+| `--task` | `string` | Task type the model will support (`text_generation`, `embeddings`, `rerank`, `image_generation`). Required. |
 | `--overwrite_models` | `NA` | If set, an existing model with the same name will be overwritten. If not set, the server will use existing model files if available. |

 ## Pull Mode Options for optimum-cli mode
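With the default removed, `--task` has to be passed explicitly whenever a model is pulled. A representative invocation, reusing the quickstart model name above (paths and names are illustrative):

```bash
ovms --pull --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --task text_generation
```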
diff --git a/src/cli_parser.cpp b/src/cli_parser.cpp
index f7d76984a2..1fe91772a3 100644
--- a/src/cli_parser.cpp
+++ b/src/cli_parser.cpp
@@ -186,7 +186,7 @@ void CLIParser::parse(int argc, char** argv) {
                 "MODEL_REPOSITORY_PATH")
             ("task",
                 "Choose type of model export: text_generation - chat and completion endpoints, embeddings - embeddings endpoint, rerank - rerank endpoint, image_generation - image generation/edit/inpainting endpoints.",
-                cxxopts::value<std::string>()->default_value("text_generation"),
+                cxxopts::value<std::string>(),
                 "TASK")
             ("weight-format",
                 "Model precision used in optimum-cli export with conversion",
@@ -289,11 +289,8 @@ void CLIParser::parse(int argc, char** argv) {
             }
         }
     } else {
-        // Default task is text_generation
-        task = TEXT_GENERATION_GRAPH;
-        GraphCLIParser cliParser;
-        unmatchedOptions = cliParser.parse(result->unmatched());
-        this->graphOptionsParser = std::move(cliParser);
+        std::cerr << "error parsing options - --task parameter wasn't passed";
+        exit(OVMS_EX_USAGE);
     }

     if (unmatchedOptions.size()) {
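The practical effect of this parser change is that a pull-mode invocation without `--task` now fails fast instead of silently defaulting to text generation. Roughly, the expected behavior (model name reused from the RAG demo above) is:

```console
$ ovms --pull --source_model OpenVINO/Qwen3-8B-int4-ov --model_repository_path models
error parsing options - --task parameter wasn't passed
```

The process exits with the usage error code (`OVMS_EX_USAGE`), which is what the new `hfNoTaskParameter` test below asserts.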
diff --git a/src/test/ovmsconfig_test.cpp b/src/test/ovmsconfig_test.cpp
index 7efb1064cc..f7bd241893 100644
--- a/src/test/ovmsconfig_test.cpp
+++ b/src/test/ovmsconfig_test.cpp
@@ -354,6 +354,19 @@ TEST_F(OvmsConfigDeathTest, hfWrongTask) {
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "error parsing options - --task parameter unsupported value: bad_task");
 }

+TEST_F(OvmsConfigDeathTest, hfNoTaskParameter) {
+    char* n_argv[] = {
+        "ovms",
+        "--pull",
+        "--source_model",
+        "some/model",
+        "--model_repository_path",
+        "/some/path",
+    };
+    int arg_count = 6;
+    EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "error parsing options - --task parameter wasn't passed");
+}
+
 TEST_F(OvmsConfigDeathTest, hfBadTextGraphParameter) {
     char* n_argv[] = {
         "ovms",
@@ -362,10 +375,12 @@ TEST_F(OvmsConfigDeathTest, hfBadTextGraphParameter) {
         "some/model",
         "--model_repository_path",
         "/some/path",
+        "--task",
+        "text_generation",
         "--max_allowed_chunks",
         "1400",
     };
-    int arg_count = 8;
+    int arg_count = 10;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "task: text_generation - error parsing options - unmatched arguments : --max_allowed_chunks, 1400,");
 }

@@ -548,10 +563,12 @@ TEST_F(OvmsConfigDeathTest, hfBadTextGraphParameterName) {
         "some/model",
         "--model_repository_path",
         "/some/path",
+        "--task",
+        "text_generation",
         "--min_num_batched_tokens",
         "145",
     };
-    int arg_count = 8;
+    int arg_count = 10;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "task: text_generation - error parsing options - unmatched arguments : --min_num_batched_tokens, 145,");
 }

@@ -629,10 +646,12 @@ TEST_F(OvmsConfigDeathTest, hfBadTextGenGraphNoPull) {
         "some/model",
         "--model_repository_path",
         "/some/path",
+        "--task",
+        "text_generation",
         "--normalizes",
         "true",
     };
-    int arg_count = 7;
+    int arg_count = 9;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "task: text_generation - error parsing options - unmatched arguments : --normalizes, true,");
 }

@@ -857,8 +876,10 @@ TEST_F(OvmsConfigDeathTest, simultaneousPullAndListModels) {
         "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov",
         "--model_repository_path",
         "/models",
+        "--task",
+        "text_generation",
         "--list_models"};
-    int arg_count = 7;
+    int arg_count = 9;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "--list_models cannot be used with --pull or --task") << createCmd(arg_count, n_argv) << buffer.str();
 }

@@ -873,6 +894,8 @@ TEST(OvmsGraphConfigTest, positiveAllChanged) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--pipeline_type",
         (char*)"VLM",
         (char*)"--max_num_seqs",
@@ -893,7 +916,7 @@ TEST(OvmsGraphConfigTest, positiveAllChanged) {
         (char*)"parserName",
     };

-    int arg_count = 24;
+    int arg_count = 26;
     ConstructorEnabledConfig config;
     config.parse(arg_count, n_argv);

@@ -926,6 +949,8 @@ TEST(OvmsGraphConfigTest, positiveSomeChanged) {
         (char*)"--overwrite_models",
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--pipeline_type",
         (char*)"VLM",
         (char*)"--max_num_seqs",
@@ -934,7 +959,7 @@ TEST(OvmsGraphConfigTest, positiveSomeChanged) {
         (char*)"NPU",
     };

-    int arg_count = 13;
+    int arg_count = 15;
     ConstructorEnabledConfig config;
     config.parse(arg_count, n_argv);

@@ -1100,9 +1125,11 @@ TEST(OvmsGraphConfigTest, positiveDefault) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
     };

-    int arg_count = 6;
+    int arg_count = 8;
     ConstructorEnabledConfig config;
     config.parse(arg_count, n_argv);
     auto& hfSettings = config.getServerSettings().hfSettings;
@@ -1133,11 +1160,13 @@ TEST(OvmsGraphConfigTest, positiveDefaultStart) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--port",
         (char*)"8080",
     };

-    int arg_count = 7;
+    int arg_count = 9;
     ConstructorEnabledConfig config;
     config.parse(arg_count, n_argv);
     auto& hfSettings = config.getServerSettings().hfSettings;
@@ -1170,11 +1199,13 @@ TEST(OvmsGraphConfigTest, positiveTargetDeviceHetero) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--target_device",
         (char*)"HETERO",
     };

-    int arg_count = 8;
+    int arg_count = 10;
     ConstructorEnabledConfig config;
     config.parse(arg_count, n_argv);
     auto& hfSettings = config.getServerSettings().hfSettings;
@@ -1192,11 +1223,13 @@ TEST(OvmsGraphConfigTest, negativePipelineType) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--pipeline_type",
         (char*)"INVALID",
     };

-    int arg_count = 8;
+    int arg_count = 10;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "pipeline_type: INVALID is not allowed. Supported types: LM, LM_CB, VLM, VLM_CB, AUTO");
 }

@@ -1210,11 +1243,13 @@ TEST(OvmsGraphConfigTest, negativeTargetDevice) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--target_device",
         (char*)"INVALID",
     };

-    int arg_count = 8;
+    int arg_count = 10;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "target_device: INVALID is not allowed. Supported devices: CPU, GPU, NPU, HETERO");
 }
@@ -1228,11 +1263,13 @@ TEST(OvmsGraphConfigTest, negativeEnablePrefixCaching) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--enable_prefix_caching",
         (char*)"INVALID",
     };

-    int arg_count = 8;
+    int arg_count = 10;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "enable_prefix_caching: INVALID is not allowed. Supported values: true, false");
 }

@@ -1246,14 +1283,34 @@ TEST(OvmsGraphConfigTest, negativeDynamicSplitFuse) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         (char*)"--dynamic_split_fuse",
         (char*)"INVALID",
     };

-    int arg_count = 8;
+    int arg_count = 10;
     EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "dynamic_split_fuse: INVALID is not allowed. Supported values: true, false");
 }

+TEST(OvmsGraphConfigTest, negativeSourceModel) {
+    std::string modelName = "NonOpenVINO/Phi-3-mini-FastDraft-50M-int8-ov";
+    std::string downloadPath = "test/repository";
+    char* n_argv[] = {
+        (char*)"ovms",
+        (char*)"--pull",
+        (char*)"--source_model",
+        (char*)modelName.c_str(),
+        (char*)"--model_repository_path",
+        (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
+    };
+
+    int arg_count = 8;
+    EXPECT_EXIT(ovms::Config::instance().parse(arg_count, n_argv), ::testing::ExitedWithCode(OVMS_EX_USAGE), "For now only OpenVINO models are supported in pulling mode");
+}
+
 TEST(OvmsGraphConfigTest, positiveAllChangedRerank) {
     std::string modelName = "OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov";
     std::string downloadPath = "test/repository";
diff --git a/src/test/pull_hf_model_test.cpp b/src/test/pull_hf_model_test.cpp
index 54e2042ff8..fdb4dbe29c 100644
--- a/src/test/pull_hf_model_test.cpp
+++ b/src/test/pull_hf_model_test.cpp
@@ -442,9 +442,11 @@ TEST_F(HfDownloadModelModule, TestInvalidProxyTimeout) {
         (char*)modelName.c_str(),
         (char*)"--model_repository_path",
         (char*)downloadPath.c_str(),
+        (char*)"--task",
+        (char*)"text_generation",
         nullptr};
-    int arg_count = 6;
+    int arg_count = 8;
     ConstructorEnabledConfig config;
     {
         EnvGuard eGuard;