|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 |
| - "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets" |
| 78 | + "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets " |
79 | 79 | ]
|
80 | 80 | },
|
81 | 81 | {
|
|
88 | 88 | "[back to top ⬆️](#Table-of-contents:)\n",
|
89 | 89 | "\n",
|
90 | 90 | "As example, we will use already converted LLMs from [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
|
91 |
| - "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As example we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft.\n", |
| 91 | + "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 to be used for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as draft.\n", |
92 | 92 | "\n",
|
93 | 93 | "In case, if you want run own models, you should convert them using [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by OpenVINO integration. More details about model preparation can be found in [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
|
94 | 94 | ]
|
95 | 95 | },
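| + {
| + "cell_type": "markdown",
| + "id": "a7c3e9d2",
| + "metadata": {},
| + "source": [
| + "A minimal conversion sketch using the `optimum.intel` Python API (the model id and output directory below are illustrative examples, not the models used in this notebook):\n",
| + "\n",
| + "```python\n",
| + "from optimum.intel import OVModelForCausalLM\n",
| + "\n",
| + "# Export a Hugging Face model to OpenVINO IR on the fly and save it locally\n",
| + "ov_model = OVModelForCausalLM.from_pretrained(\"microsoft/Phi-3-mini-4k-instruct\", export=True)\n",
| + "ov_model.save_pretrained(\"Phi-3-mini-4k-instruct-ov\")\n",
| + "```\n",
| + "\n",
| + "The same export can also be done from the command line with `optimum-cli export openvino`."
| + ]
| + },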
|
| 96 | + { |
| 97 | + "cell_type": "markdown", |
| 98 | + "id": "61ecfe17", |
| 99 | + "metadata": {}, |
| 100 | + "source": [ |
| 101 | + "### Select model\n", |
| 102 | + "[back to top ⬆️](#Table-of-contents:)" |
| 103 | + ] |
| 104 | + }, |
96 | 105 | {
|
97 | 106 | "cell_type": "code",
|
98 | 107 | "execution_count": 1,
|
| 108 | + "id": "fe934261", |
| 109 | + "metadata": {}, |
| 110 | + "outputs": [ |
| 111 | + { |
| 112 | + "data": { |
| 113 | + "application/vnd.jupyter.widget-view+json": { |
| 114 | + "model_id": "b65944bfc43c4acebea6ec5ebd78f981", |
| 115 | + "version_major": 2, |
| 116 | + "version_minor": 0 |
| 117 | + }, |
| 118 | + "text/plain": [ |
| 119 | + "Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')" |
| 120 | + ] |
| 121 | + }, |
| 122 | + "execution_count": 1, |
| 123 | + "metadata": {}, |
| 124 | + "output_type": "execute_result" |
| 125 | + } |
| 126 | + ], |
| 127 | + "source": [ |
| 128 | + "import ipywidgets as widgets\n", |
| 129 | + "\n", |
| 130 | + "model = widgets.Dropdown(\n", |
| 131 | + " options=['Phi-3', 'Phi-4'],\n", |
| 132 | + " value='Phi-3', # default value\n", |
| 133 | + " description='Select Model:',\n", |
| 134 | + ")\n", |
| 135 | + "\n", |
| 136 | + "model" |
| 137 | + ] |
| 138 | + }, |
| 139 | + { |
| 140 | + "cell_type": "markdown", |
| 141 | + "id": "ccca9ef6", |
| 142 | + "metadata": {}, |
| 143 | + "source": [ |
| 144 | + "### Download target and draft models" |
| 145 | + ] |
| 146 | + }, |
| 147 | + { |
| 148 | + "cell_type": "code", |
| 149 | + "execution_count": 2, |
99 | 150 | "id": "74bb9f96",
|
100 | 151 | "metadata": {},
|
101 | 152 | "outputs": [],
|
102 | 153 | "source": [
|
103 | 154 | "from pathlib import Path\n",
|
104 | 155 | "import huggingface_hub as hf_hub\n",
|
105 | 156 | "\n",
|
106 |
| - "draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n", |
107 |
| - "target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n", |
| 157 | + "if model.value == \"Phi-4\":\n", |
| 158 | + " target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n", |
| 159 | + " draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n", |
| 160 | + "elif model.value == \"Phi-3\":\n", |
| 161 | + " target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n", |
| 162 | + " draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n", |
| 163 | + "else:\n", |
| 164 | + " print(f\"Model {model} is not supported in this demo.\")\n", |
108 | 165 | "\n",
|
109 | 166 | "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
|
110 | 167 | "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
|
|
131 | 188 | },
|
132 | 189 | {
|
133 | 190 | "cell_type": "code",
|
134 |
| - "execution_count": 2, |
| 191 | + "execution_count": 3, |
135 | 192 | "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
|
136 | 193 | "metadata": {
|
137 | 194 | "tags": []
|
|
140 | 197 | {
|
141 | 198 | "data": {
|
142 | 199 | "application/vnd.jupyter.widget-view+json": {
|
143 |
| - "model_id": "37ad0b345de94225892c9d47519a9164", |
| 200 | + "model_id": "093ab5068fc542a0826fbd4f8a8d97b8", |
144 | 201 | "version_major": 2,
|
145 | 202 | "version_minor": 0
|
146 | 203 | },
|
147 | 204 | "text/plain": [
|
148 | 205 | "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
|
149 | 206 | ]
|
150 | 207 | },
|
151 |
| - "execution_count": 2, |
152 | 208 | "metadata": {},
|
153 |
| - "output_type": "execute_result" |
| 209 | + "output_type": "display_data" |
154 | 210 | }
|
155 | 211 | ],
|
156 | 212 | "source": [
|
|
165 | 221 | "from notebook_utils import device_widget\n",
|
166 | 222 | "\n",
|
167 | 223 | "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
|
168 |
| - "\n", |
169 |
| - "device\n", |
| 224 | + "display(device)\n", |
170 | 225 | "\n",
|
171 | 226 | "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
|
172 | 227 | "from notebook_utils import collect_telemetry\n",
|
|
200 | 255 | "import openvino_genai as ov_genai\n",
|
201 | 256 | "import time\n",
|
202 | 257 | "\n",
|
| 258 | + "print(\"Initializing Auto-Regressive pipeline...\")\n", |
203 | 259 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
204 | 260 | "\n",
|
205 | 261 | "config = ov_genai.GenerationConfig()\n",
|
206 |
| - "config.max_new_tokens = 330\n", |
| 262 | + "config.max_new_tokens = 200\n", |
| 263 | + "config.apply_chat_template = False\n", |
| 264 | + "\n", |
207 | 265 | "prompt = '''<s>\n",
|
208 | 266 | "\n",
|
209 | 267 | "def prime_fib(n: int):\n",
|
|
272 | 330 | "metadata": {},
|
273 | 331 | "outputs": [],
|
274 | 332 | "source": [
|
275 |
| - "scheduler_config = ov_genai.SchedulerConfig()\n", |
276 |
| - "# cache params\n", |
277 |
| - "scheduler_config.cache_size = 0\n", |
278 |
| - "scheduler_config.num_kv_blocks = 2048 // 8\n", |
279 |
| - "scheduler_config.max_num_batched_tokens = 2048\n", |
280 | 333 | "\n",
|
281 | 334 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
| 335 | + "config.num_assistant_tokens = 5\n", |
282 | 336 | "\n",
|
283 |
| - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", |
| 337 | + "print(\"Initializing Speculative-Decoding pipeline...\")\n", |
| 338 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n", |
284 | 339 | "\n",
|
285 |
| - "config = ov_genai.GenerationConfig()\n", |
286 |
| - "config.max_new_tokens = 330\n", |
287 |
| - "config.num_assistant_tokens = 5\n", |
288 | 340 | "start_time = time.perf_counter()\n",
|
289 | 341 | "result = pipe.generate(prompt, config, streamer=streamer)\n",
|
290 | 342 | "end_time = time.perf_counter()"
|
|
318 | 370 | "metadata": {},
|
319 | 371 | "outputs": [],
|
320 | 372 | "source": [
|
321 |
| - "config = ov_genai.GenerationConfig()\n", |
322 |
| - "config.max_new_tokens = 100\n", |
| 373 | + "config.num_assistant_tokens = 0\n", |
323 | 374 | "config.assistant_confidence_threshold = 0.05\n",
|
| 375 | + "\n", |
324 | 376 | "start_time = time.perf_counter()\n",
|
325 |
| - "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n", |
| 377 | + "result = pipe.generate(prompt, config, streamer)\n", |
326 | 378 | "end_time = time.perf_counter()"
|
327 | 379 | ]
|
328 | 380 | },
|
|
356 | 408 | },
|
357 | 409 | {
|
358 | 410 | "cell_type": "code",
|
359 |
| - "execution_count": 3, |
| 411 | + "execution_count": 4, |
360 | 412 | "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
|
361 | 413 | "metadata": {},
|
362 | 414 | "outputs": [
|
363 | 415 | {
|
364 | 416 | "data": {
|
365 | 417 | "application/vnd.jupyter.widget-view+json": {
|
366 |
| - "model_id": "b0f65ad3139a477282c002eafe409d94", |
| 418 | + "model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879", |
367 | 419 | "version_major": 2,
|
368 | 420 | "version_minor": 0
|
369 | 421 | },
|
370 | 422 | "text/plain": [
|
371 | 423 | "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
|
372 | 424 | ]
|
373 | 425 | },
|
374 |
| - "execution_count": 3, |
| 426 | + "execution_count": 4, |
375 | 427 | "metadata": {},
|
376 | 428 | "output_type": "execute_result"
|
377 | 429 | }
|
|
402 | 454 | },
|
403 | 455 | {
|
404 | 456 | "cell_type": "code",
|
405 |
| - "execution_count": 4, |
| 457 | + "execution_count": 9, |
406 | 458 | "id": "13f03634",
|
407 | 459 | "metadata": {},
|
408 | 460 | "outputs": [
|
|
444 | 496 | },
|
445 | 497 | {
|
446 | 498 | "cell_type": "code",
|
447 |
| - "execution_count": 6, |
| 499 | + "execution_count": null, |
448 | 500 | "id": "1f4ea9e5",
|
449 | 501 | "metadata": {},
|
450 | 502 | "outputs": [
|
451 | 503 | {
|
452 | 504 | "name": "stdout",
|
453 | 505 | "output_type": "stream",
|
454 | 506 | "text": [
|
| 507 | + "Initializing Auto-Regressive pipline...\n", |
| 508 | + "Running Warmup...\n", |
455 | 509 | "Running Auto-Regressive generation...\n"
|
456 | 510 | ]
|
457 | 511 | },
|
458 | 512 | {
|
459 | 513 | "name": "stderr",
|
460 | 514 | "output_type": "stream",
|
461 | 515 | "text": [
|
462 |
| - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]" |
| 516 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]" |
463 | 517 | ]
|
464 | 518 | },
|
465 | 519 | {
|
|
479 | 533 | {
|
480 | 534 | "data": {
|
481 | 535 | "text/plain": [
|
482 |
| - "9" |
| 536 | + "27" |
483 | 537 | ]
|
484 | 538 | },
|
485 |
| - "execution_count": 6, |
| 539 | + "execution_count": 10, |
486 | 540 | "metadata": {},
|
487 | 541 | "output_type": "execute_result"
|
488 | 542 | }
|
|
492 | 546 | "import time\n",
|
493 | 547 | "from tqdm import tqdm\n",
|
494 | 548 | "\n",
|
495 |
| - "print(\"Running Auto-Regressive generation...\")\n", |
| 549 | + "print(\"Initializing Auto-Regressive pipline...\")\n", |
496 | 550 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
497 | 551 | "\n",
|
498 | 552 | "config = ov_genai.GenerationConfig()\n",
|
| 553 | + "config.apply_chat_template = False\n", |
499 | 554 | "config.max_new_tokens = 330\n",
|
500 |
| - "\n", |
| 555 | + "if data_type.value == \"Code\":\n", |
| 556 | + " config.max_new_tokens = 128\n", |
| 557 | + " \n", |
| 558 | + "# warmup\n", |
| 559 | + "print(\"Running Warmup...\")\n", |
| 560 | + "for i in range(10):\n", |
| 561 | + " pipe.generate(\"this is a warmup prompt\", config)\n", |
| 562 | + " \n", |
| 563 | + "print(\"Running Auto-Regressive generation...\")\n", |
501 | 564 | "times_auto_regressive = []\n",
|
502 | 565 | "for prompt in tqdm(prompts):\n",
|
503 | 566 | " start_time = time.perf_counter()\n",
|
|
523 | 586 | },
|
524 | 587 | {
|
525 | 588 | "cell_type": "code",
|
526 |
| - "execution_count": 7, |
| 589 | + "execution_count": null, |
527 | 590 | "id": "d73e9f37",
|
528 | 591 | "metadata": {},
|
529 | 592 | "outputs": [
|
530 | 593 | {
|
531 | 594 | "name": "stdout",
|
532 | 595 | "output_type": "stream",
|
533 | 596 | "text": [
|
| 597 | + "Initializing Speculative-Decoding pipline...\n", |
| 598 | + "Running Warmup...\n", |
534 | 599 | "Running Speculative Decoding generation...\n"
|
535 | 600 | ]
|
536 | 601 | },
|
537 | 602 | {
|
538 | 603 | "name": "stderr",
|
539 | 604 | "output_type": "stream",
|
540 | 605 | "text": [
|
541 |
| - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]" |
| 606 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]" |
542 | 607 | ]
|
543 | 608 | },
|
544 | 609 | {
|
|
557 | 622 | }
|
558 | 623 | ],
|
559 | 624 | "source": [
|
560 |
| - "scheduler_config = ov_genai.SchedulerConfig()\n", |
561 |
| - "# cache params\n", |
562 |
| - "scheduler_config.cache_size = 0\n", |
563 |
| - "scheduler_config.num_kv_blocks = 2048 // 8\n", |
564 |
| - "scheduler_config.max_num_batched_tokens = 2048\n", |
565 | 625 | "\n",
|
| 626 | + "config.num_assistant_tokens = 5\n", |
566 | 627 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
567 | 628 | "\n",
|
568 |
| - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", |
569 |
| - "\n", |
570 |
| - "config = ov_genai.GenerationConfig()\n", |
571 |
| - "config.max_new_tokens = 330\n", |
572 |
| - "config.num_assistant_tokens = 5\n", |
| 629 | + "print(\"Initializing Speculative-Decoding pipline...\")\n", |
| 630 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n", |
573 | 631 | "\n",
|
| 632 | + "# warmup\n", |
| 633 | + "print(\"Running Warmup...\")\n", |
| 634 | + "for i in range(10):\n", |
| 635 | + " pipe.generate(\"this is a warmup prompt\", config)\n", |
574 | 636 | "\n",
|
575 | 637 | "times_speculative_decoding = []\n",
|
576 | 638 | "print(\"Running Speculative Decoding generation...\")\n",
|
|
593 | 655 | },
|
594 | 656 | {
|
595 | 657 | "cell_type": "code",
|
596 |
| - "execution_count": 8, |
| 658 | + "execution_count": 12, |
597 | 659 | "id": "ad898772",
|
598 | 660 | "metadata": {},
|
599 | 661 | "outputs": [
|
600 | 662 | {
|
601 | 663 | "name": "stdout",
|
602 | 664 | "output_type": "stream",
|
603 | 665 | "text": [
|
604 |
| - "average speedup: 2.23\n" |
| 666 | + "average speedup: 1.09\n" |
605 | 667 | ]
|
606 | 668 | }
|
607 | 669 | ],
|
|
627 | 689 | "name": "python",
|
628 | 690 | "nbconvert_exporter": "python",
|
629 | 691 | "pygments_lexer": "ipython3",
|
630 |
| - "version": "3.12.7" |
| 692 | + "version": "3.12.9" |
631 | 693 | },
|
632 | 694 | "openvino_notebooks": {
|
633 | 695 | "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
|
|