
Commit ca84e6a

Download and start hf (#3293)
1 parent 58d6739 commit ca84e6a

21 files changed: +552 −305 lines changed

Dockerfile.redhat

Lines changed: 2 additions & 0 deletions
```diff
@@ -389,6 +389,8 @@ RUN if [ -f /usr/bin/dnf ] ; then export DNF_TOOL=dnf ; echo -e "max_parallel_do
 ENV LD_LIBRARY_PATH=/ovms/lib
 
 COPY --from=pkg /ovms_release /ovms
+# hadolint ignore=SC2114
+RUN mv /ovms/bin/git* /usr/bin
 COPY --from=pkg /usr/local/lib/python3.*/site-packages/jinja2 /ovms/python_deps/jinja2
 COPY --from=pkg /usr/local/lib/python3.*/site-packages/jinja2-3.1.6.dist-info /ovms/python_deps/jinja2-3.1.6.dist-info
 COPY --from=pkg /usr/local/lib64/python3.*/site-packages/MarkupSafe-3.0.2.dist-info /ovms/python_deps/MarkupSafe-3.0.2.dist-info
```

Dockerfile.ubuntu

Lines changed: 2 additions & 0 deletions
```diff
@@ -407,6 +407,8 @@ RUN apt-get update ; \
 useradd --home-dir /home/ovms --create-home --uid 5000 --gid 5000 --groups 39,44 --shell /bin/bash --skel /dev/null ovms
 
 COPY --from=pkg /ovms_release /ovms
+# hadolint ignore=SC2114
+RUN mv /ovms/bin/git* /usr/bin
 COPY --from=build /usr/local/lib/python3.*/dist-packages/MarkupSafe-3.0.2.dist-info /ovms/python_deps/MarkupSafe-3.0.2.dist-info
 COPY --from=build /usr/local/lib/python3.*/dist-packages/jinja2 /ovms/python_deps/jinja2
 COPY --from=build /usr/local/lib/python3.*/dist-packages/jinja2-3.1.6.dist-info /ovms/python_deps/jinja2-3.1.6.dist-info
```

create_package.sh

Lines changed: 1 addition & 0 deletions
```diff
@@ -86,6 +86,7 @@ patchelf --debug --set-rpath '$ORIGIN' /ovms_release/lib/lib*plugin.so
 if [ -f /ovms_release/lib/libopenvino_nvidia_gpu_plugin.so ] && [ "$BASE_OS" != "redhat" ]; then patchelf --replace-needed libcutensor.so.1 /usr/lib/x86_64-linux-gnu/libcutensor/11/libcutensor.so.1 /ovms_release/lib/libopenvino_nvidia_gpu_plugin.so ; fi
 
 cp -P /usr/bin/git-lfs .
+cp -P /usr/bin/git .
 
 cd /ovms
 cp -v /ovms/release_files/LICENSE /ovms_release/
```

docs/llm/quickstart.md

Lines changed: 15 additions & 34 deletions
````diff
@@ -6,53 +6,34 @@ It is [microsoft/Phi-3.5-mini-instruct](https://huggingface.co/microsoft/Phi-3.5
 ## Requirements
 - Linux or Windows 11
 - Docker Engine or `ovms` binary package [installed](../deploying_server_baremetal.md)
-- Intel iGPU or ARC GPU
+- Intel iGPU or ARC GPU
 
 ## Deployment Steps
-
-### 1. Install Python dependencies:
-```console
-pip3 install huggingface_hub jinja2
-```
-
-### 2. Download and Prepare the Model:
-Using `export_model.py` script, download the OpenVINO model and prepare models repository including all configuration required for deployment with OpenVINO Model Server. For details, see [Exporting GEN AI Models](../../demos/common/export_models/README.md).
-
-```console
-curl https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/common/export_models/export_model.py -o export_model.py
-mkdir models
-python export_model.py text_generation --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --target_device GPU --cache 2
-```
-LLM engine parameters will be defined inside the `graph.pbtxt` file.
-
-> **Note:** The users in China need to set environment variable `HF_ENDPOINT="https://hf-mirror.com"` before running the export script to connect to the HF Hub.
-
-> **Note:** If you want to export models outside of the `OpenVINO` organization in HuggingFace, you need to install additional Python dependencies:
-> ```console
-> pip3 install -r https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/releases/2025/1/demos/common/export_models/requirements.txt
-> ```
 
-### 3. Deploy the Model
+### 1. Deploy the Model
 ::::{tab-set}
 
 :::{tab-item} With Docker
 **Required:** Docker Engine installed
 
 ```bash
-docker run -d --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render*) --rm -p 8000:8000 -v $(pwd)/models:/models:ro openvino/model_server:latest-gpu --rest_port 8000 --model_name Phi-3.5-mini-instruct --model_path /models/OpenVINO/Phi-3.5-mini-instruct-int4-ov
+docker run -d --device /dev/dri --group-add=$(stat -c "%g" /dev/dri/render*) --rm -p 8000:8000 -v $(pwd)/models:/models:rw openvino/model_server:latest-gpu --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --rest_port 8000 --target_device GPU --cache_size 2
 ```
 :::
 
 :::{tab-item} On Baremetal Host
 **Required:** OpenVINO Model Server package - see [deployment instructions](../deploying_server_baremetal.md) for details.
 
 ```bat
-ovms --rest_port 8000 --model_name Phi-3.5-mini-instruct --model_path models/OpenVINO/Phi-3.5-mini-instruct-int4-ov
+ovms.exe --source_model OpenVINO/Phi-3.5-mini-instruct-int4-ov --model_repository_path models --rest_port 8000 --target_device GPU --cache_size 2
 ```
 :::
 ::::
 
-### 4. Check Model Readiness
+The first run of the command will download https://huggingface.co/OpenVINO/Phi-3.5-mini-instruct-int4-ov to the models/OpenVINO/Phi-3.5-mini-instruct-int4-ov directory and start serving it with ovms.
+Consecutive runs of the command will detect that the model already exists and start serving it.
+
+### 2. Check Model Readiness
 
 Wait for the model to load. You can check the status with a simple command:
 
@@ -63,7 +44,7 @@ curl http://localhost:8000/v1/config
 :::{dropdown} Expected Response
 ```json
 {
-  "Phi-3.5-mini-instruct": {
+  "OpenVINO/Phi-3.5-mini-instruct-int4-ov": {
     "model_version_status": [
       {
         "version": "1",
@@ -79,7 +60,7 @@ curl http://localhost:8000/v1/config
 ```
 :::
 
-### 5. Run Generation
+### 3. Run Generation
 
 ::::{tab-set}
 
@@ -88,7 +69,7 @@ curl http://localhost:8000/v1/config
 curl -s http://localhost:8000/v3/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
-    "model": "Phi-3.5-mini-instruct",
+    "model": "OpenVINO/Phi-3.5-mini-instruct-int4-ov",
     "max_tokens": 30,
     "temperature": 0,
     "stream": false,
@@ -107,12 +88,12 @@ Windows Powershell
 (Invoke-WebRequest -Uri "http://localhost:8000/v3/chat/completions" `
   -Method POST `
   -Headers @{ "Content-Type" = "application/json" } `
-  -Body '{"model": "Phi-3.5-mini-instruct", "max_tokens": 30, "temperature": 0, "stream": false, "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What are the 3 main tourist attractions in Paris?"}]}').Content
+  -Body '{"model": "OpenVINO/Phi-3.5-mini-instruct-int4-ov", "max_tokens": 30, "temperature": 0, "stream": false, "messages": [{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "What are the 3 main tourist attractions in Paris?"}]}').Content
 ```
 
 Windows Command Prompt
 ```bat
-curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d "{\"model\": \"Phi-3.5-mini-instruct\", \"max_tokens\": 30, \"temperature\": 0, \"stream\": false, \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What are the 3 main tourist attractions in Paris?\"}]}"
+curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/json" -d "{\"model\": \"OpenVINO/Phi-3.5-mini-instruct-int4-ov\", \"max_tokens\": 30, \"temperature\": 0, \"stream\": false, \"messages\": [{\"role\": \"system\", \"content\": \"You are a helpful assistant.\"}, {\"role\": \"user\", \"content\": \"What are the 3 main tourist attractions in Paris?\"}]}"
 ```
 :::
 
@@ -133,7 +114,7 @@ curl -s http://localhost:8000/v3/chat/completions -H "Content-Type: application/
     }
   ],
   "created": 1744716414,
-  "model": "Phi-3.5-mini-instruct",
+  "model": "OpenVINO/Phi-3.5-mini-instruct-int4-ov",
   "object": "chat.completion",
   "usage": {
     "prompt_tokens": 24,
@@ -161,7 +142,7 @@ client = OpenAI(
 )
 
 stream = client.chat.completions.create(
-    model="Phi-3.5-mini-instruct",
+    model="OpenVINO/Phi-3.5-mini-instruct-int4-ov",
    messages=[{"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What are the 3 main tourist attractions in Paris?"}
    ],
````
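The reworked quickstart relies on the pull-and-start mode this commit introduces: `--source_model` plus `--model_repository_path` make the server download the model on the first run and reuse the local copy afterwards. Below is a minimal C++ sketch of that decision flow; the paths, messages, and placeholder download step are illustrative assumptions, not the actual OVMS implementation.

```cpp
// Illustrative sketch of the pull-and-start flow described above.
// The download step is a stub; the real server fetches from Hugging Face.
#include <filesystem>
#include <iostream>
#include <string>

namespace fs = std::filesystem;

int main() {
    const std::string sourceModel = "OpenVINO/Phi-3.5-mini-instruct-int4-ov";
    const fs::path repositoryPath = "models";                 // --model_repository_path
    const fs::path localCopy = repositoryPath / sourceModel;  // models/OpenVINO/...

    if (!fs::exists(localCopy)) {
        // First run: the model is not present yet, so pull it.
        std::cout << "Downloading " << sourceModel << " to " << localCopy << "\n";
        fs::create_directories(localCopy);  // stand-in for the real download
    } else {
        // Consecutive runs: the existing local copy is served as-is.
        std::cout << "Reusing existing model at " << localCopy << "\n";
    }
    std::cout << "Serving " << sourceModel << "\n";
    return 0;
}
```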

src/BUILD

Lines changed: 1 addition & 0 deletions
```diff
@@ -341,6 +341,7 @@ cc_library(
         "@com_github_jarro2783_cxxopts//:cxxopts",
         "libovms_server_settings",
         "libovms_version",
+        "libovmsfilesystem",
         "//src/graph_export:graph_export_types",
         "//src/graph_export:graph_cli_parser",
         "//src/graph_export:rerank_graph_cli_parser",
```

src/capi_frontend/server_settings.hpp

Lines changed: 4 additions & 3 deletions
```diff
@@ -17,6 +17,7 @@
 #include <cstdint>
 #include <optional>
 #include <string>
+#include <variant>
 #include <vector>
 
 #include "../graph_export/graph_export_types.hpp"
@@ -61,14 +62,14 @@ struct RerankGraphSettingsImpl {
 };
 
 struct HFSettingsImpl {
+    std::string targetDevice = "CPU";
     std::string sourceModel = "";
     std::string downloadPath = "";
     bool pullHfModelMode = false;
+    bool pullHfAndStartModelMode = false;
     bool overwriteModels = false;
     ExportType task = text_generation;
-    TextGenGraphSettingsImpl graphSettings;
-    RerankGraphSettingsImpl rerankGraphSettings;
-    EmbeddingsGraphSettingsImpl embeddingsGraphSettings;
+    std::variant<TextGenGraphSettingsImpl, RerankGraphSettingsImpl, EmbeddingsGraphSettingsImpl> graphSettings;
 };
 
 struct ServerSettingsImpl {
```
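Collapsing the three task-specific members into a single `std::variant` means exactly one settings type is active per `HFSettingsImpl`, and consumers must dispatch on it explicitly. A minimal sketch of that dispatch with `std::visit` follows; the empty struct bodies are placeholders standing in for the real structs above, not the actual OVMS code.

```cpp
// Hedged sketch: placeholder structs stand in for the real settings types.
#include <iostream>
#include <type_traits>
#include <variant>

struct TextGenGraphSettingsImpl {};
struct RerankGraphSettingsImpl {};
struct EmbeddingsGraphSettingsImpl {};

using GraphSettings = std::variant<TextGenGraphSettingsImpl,
                                   RerankGraphSettingsImpl,
                                   EmbeddingsGraphSettingsImpl>;

// std::visit invokes the lambda with whichever alternative is active.
void describe(const GraphSettings& settings) {
    std::visit([](const auto& s) {
        using T = std::decay_t<decltype(s)>;
        if constexpr (std::is_same_v<T, TextGenGraphSettingsImpl>)
            std::cout << "text generation graph settings\n";
        else if constexpr (std::is_same_v<T, RerankGraphSettingsImpl>)
            std::cout << "rerank graph settings\n";
        else
            std::cout << "embeddings graph settings\n";
    }, settings);
}

int main() {
    GraphSettings settings = TextGenGraphSettingsImpl{};  // matches task = text_generation
    describe(settings);  // prints: text generation graph settings
    settings = RerankGraphSettingsImpl{};
    describe(settings);  // prints: rerank graph settings
}
```

One practical effect of this layout: reading the wrong task's settings becomes a type error (or a `std::bad_variant_access`) instead of silently consulting a stale sibling struct.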
