|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 | | - "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets "
| 78 | + "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets"
79 | 79 | ]
|
80 | 80 | },
|
81 | 81 | {
|
|
88 | 88 | "[back to top ⬆️](#Table-of-contents:)\n",
|
89 | 89 | "\n",
|
90 | 90 | "As example, we will use already converted LLMs from [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
|
91 | | - "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 to be used for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as draft.\n",
| 91 | + "OpenVINO-optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As an example, we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as the target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as the draft.\n",
92 | 92 | "\n",
|
93 | 93 | "In case, if you want run own models, you should convert them using [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by OpenVINO integration. More details about model preparation can be found in [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
|
94 | 94 | ]
|
95 | 95 | },
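
The conversion step referenced in the cell above is not performed in this notebook. With Optimum Intel it is roughly the following sketch ("your-org/your-model" and the output directory are placeholders; the linked guide additionally covers converting the tokenizer to OpenVINO format, which openvino-genai pipelines require):

```python
# Sketch, assuming Optimum Intel is installed: export a Hugging Face
# model to OpenVINO IR so it can later be loaded by openvino_genai.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "your-org/your-model"  # placeholder model id
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True)
ov_model.save_pretrained("your-model-ov")
AutoTokenizer.from_pretrained(model_id).save_pretrained("your-model-ov")
```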
|
96 | | - {
97 | | - "cell_type": "markdown",
98 | | - "id": "61ecfe17",
99 | | - "metadata": {},
100 | | - "source": [
101 | | - "### Select model\n",
102 | | - "[back to top ⬆️](#Table-of-contents:)"
103 | | - ]
104 | | - },
105 | 96 | {
|
106 | 97 | "cell_type": "code",
|
107 | 98 | "execution_count": 1,
|
108 | | - "id": "fe934261",
109 | | - "metadata": {},
110 | | - "outputs": [
111 | | - {
112 | | - "data": {
113 | | - "application/vnd.jupyter.widget-view+json": {
114 | | - "model_id": "b65944bfc43c4acebea6ec5ebd78f981",
115 | | - "version_major": 2,
116 | | - "version_minor": 0
117 | | - },
118 | | - "text/plain": [
119 | | - "Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')"
120 | | - ]
121 | | - },
122 | | - "execution_count": 1,
123 | | - "metadata": {},
124 | | - "output_type": "execute_result"
125 | | - }
126 | | - ],
127 | | - "source": [
128 | | - "import ipywidgets as widgets\n",
129 | | - "\n",
130 | | - "model = widgets.Dropdown(\n",
131 | | - " options=['Phi-3', 'Phi-4'],\n",
132 | | - " value='Phi-3', # default value\n",
133 | | - " description='Select Model:',\n",
134 | | - ")\n",
135 | | - "\n",
136 | | - "model"
137 | | - ]
138 | | - },
139 | | - {
140 | | - "cell_type": "markdown",
141 | | - "id": "ccca9ef6",
142 | | - "metadata": {},
143 | | - "source": [
144 | | - "### Download target and draft models"
145 | | - ]
146 | | - },
147 | | - {
148 | | - "cell_type": "code",
149 | | - "execution_count": 2,
150 | 99 | "id": "74bb9f96",
|
151 | 100 | "metadata": {},
|
152 | 101 | "outputs": [],
|
153 | 102 | "source": [
|
154 | 103 | "from pathlib import Path\n",
|
155 | 104 | "import huggingface_hub as hf_hub\n",
|
156 | 105 | "\n",
|
157 | | - "if model.value == \"Phi-4\":\n",
158 | | - " target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n",
159 | | - " draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n",
160 | | - "elif model.value == \"Phi-3\":\n",
161 | | - " target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
162 | | - " draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
163 | | - "else:\n",
164 | | - " print(f\"Model {model} is not supported in this demo.\")\n",
| 106 | + "draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
| 107 | + "target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
165 | 108 | "\n",
|
166 | 109 | "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
|
167 | 110 | "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
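
The download call itself sits in the collapsed portion of this hunk. With huggingface_hub it is presumably along these lines (a sketch, not the verbatim notebook code):

```python
# Fetch both OpenVINO IR repos into local folders whose names match
# draft_model_path / target_model_path built in the cell above.
import huggingface_hub as hf_hub

for model_id, model_path in [
    (draft_model_id, draft_model_path),
    (target_model_id, target_model_path),
]:
    if not model_path.exists():
        hf_hub.snapshot_download(model_id, local_dir=model_path)
```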
|
|
188 | 131 | },
|
189 | 132 | {
|
190 | 133 | "cell_type": "code",
|
191 | | - "execution_count": 3,
| 134 | + "execution_count": 2,
192 | 135 | "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
|
193 | 136 | "metadata": {
|
194 | 137 | "tags": []
|
|
197 | 140 | {
|
198 | 141 | "data": {
|
199 | 142 | "application/vnd.jupyter.widget-view+json": {
|
200 | | - "model_id": "093ab5068fc542a0826fbd4f8a8d97b8",
| 143 | + "model_id": "37ad0b345de94225892c9d47519a9164",
201 | 144 | "version_major": 2,
|
202 | 145 | "version_minor": 0
|
203 | 146 | },
|
204 | 147 | "text/plain": [
|
205 | 148 | "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
|
206 | 149 | ]
|
207 | 150 | },
|
| 151 | + "execution_count": 2,
208 | 152 | "metadata": {},
|
209 | | - "output_type": "display_data"
| 153 | + "output_type": "execute_result"
210 | 154 | }
|
211 | 155 | ],
|
212 | 156 | "source": [
|
|
221 | 165 | "from notebook_utils import device_widget\n",
|
222 | 166 | "\n",
|
223 | 167 | "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
|
224 | | - "display(device)\n",
| 168 | + "\n",
| 169 | + "device\n",
225 | 170 | "\n",
|
226 | 171 | "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
|
227 | 172 | "from notebook_utils import collect_telemetry\n",
|
|
255 | 200 | "import openvino_genai as ov_genai\n",
|
256 | 201 | "import time\n",
|
257 | 202 | "\n",
|
258 | | - "print(\"Initializing Auto-Regressive pipeline...\")\n",
259 | 203 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
260 | 204 | "\n",
|
261 | 205 | "config = ov_genai.GenerationConfig()\n",
|
262 | | - "config.max_new_tokens = 200\n",
263 | | - "config.apply_chat_template = False\n",
264 | | - "\n",
| 206 | + "config.max_new_tokens = 330\n",
265 | 207 | "prompt = '''<s>\n",
|
266 | 208 | "\n",
|
267 | 209 | "def prime_fib(n: int):\n",
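
Unescaped from the notebook JSON, the updated baseline cell reads as follows (the prime_fib prompt continues in the collapsed hunk and is truncated here; `streamer` is defined earlier in the notebook, outside this diff):

```python
import time

import openvino_genai as ov_genai

# Plain auto-regressive baseline: target model only, no draft.
pipe = ov_genai.LLMPipeline(target_model_path, device.value)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 330

prompt = """<s>

def prime_fib(n: int):"""  # truncated; the full prompt is in the notebook

start_time = time.perf_counter()
result = pipe.generate(prompt, config, streamer=streamer)
end_time = time.perf_counter()
```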
|
|
330 | 272 | "metadata": {},
|
331 | 273 | "outputs": [],
|
332 | 274 | "source": [
|
| 275 | + "scheduler_config = ov_genai.SchedulerConfig()\n",
| 276 | + "# cache params\n",
| 277 | + "scheduler_config.cache_size = 0\n",
| 278 | + "scheduler_config.num_kv_blocks = 2048 // 8\n",
| 279 | + "scheduler_config.max_num_batched_tokens = 2048\n",
333 | 280 | "\n",
|
334 | 281 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
335 | | - "config.num_assistant_tokens = 5\n",
336 | 282 | "\n",
|
337 | | - "print(\"Initializing Speculative-Decoding pipeline...\")\n",
338 | | - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
| 283 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
339 | 284 | "\n",
|
| 285 | + "config = ov_genai.GenerationConfig()\n",
| 286 | + "config.max_new_tokens = 330\n",
| 287 | + "config.num_assistant_tokens = 5\n",
340 | 288 | "start_time = time.perf_counter()\n",
|
341 | 289 | "result = pipe.generate(prompt, config, streamer=streamer)\n",
|
342 | 290 | "end_time = time.perf_counter()"
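
For readability, the speculative cell after this change flattens to the snippet below. The SchedulerConfig values are copied verbatim from the diff; my reading of the GenAI scheduler options is that cache_size = 0 defers to the explicit num_kv_blocks (2048 // 8 = 256 blocks) rather than sizing the KV cache in GB:

```python
scheduler_config = ov_genai.SchedulerConfig()
scheduler_config.cache_size = 0              # don't size the KV cache in GB...
scheduler_config.num_kv_blocks = 2048 // 8   # ...use an explicit block count
scheduler_config.max_num_batched_tokens = 2048

draft_model = ov_genai.draft_model(draft_model_path, device.value)
pipe = ov_genai.LLMPipeline(
    target_model_path,
    device.value,
    draft_model=draft_model,
    scheduler_config=scheduler_config,
)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 330
config.num_assistant_tokens = 5  # fixed draft window: 5 candidate tokens per step

start_time = time.perf_counter()
result = pipe.generate(prompt, config, streamer=streamer)
end_time = time.perf_counter()
```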
|
|
370 | 318 | "metadata": {},
|
371 | 319 | "outputs": [],
|
372 | 320 | "source": [
|
373 | | - "config.num_assistant_tokens = 0\n",
| 321 | + "config = ov_genai.GenerationConfig()\n",
| 322 | + "config.max_new_tokens = 100\n",
374 | 323 | "config.assistant_confidence_threshold = 0.05\n",
|
375 | | - "\n",
376 | 324 | "start_time = time.perf_counter()\n",
|
377 | | - "result = pipe.generate(prompt, config, streamer)\n",
| 325 | + "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n",
378 | 326 | "end_time = time.perf_counter()"
|
379 | 327 | ]
|
380 | 328 | },
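
The dynamic variant above swaps the fixed draft window for a confidence rule: with assistant_confidence_threshold set (instead of num_assistant_tokens), the draft keeps proposing tokens for as long as its own token confidence stays above the threshold, so the window adapts per step. Flattened:

```python
config = ov_genai.GenerationConfig()
config.max_new_tokens = 100
# Adaptive draft window; used in place of num_assistant_tokens.
config.assistant_confidence_threshold = 0.05

start_time = time.perf_counter()
result = pipe.generate(["Sun is yellow because"], config, streamer)
end_time = time.perf_counter()
```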
|
|
408 | 356 | },
|
409 | 357 | {
|
410 | 358 | "cell_type": "code",
|
411 | | - "execution_count": 4,
| 359 | + "execution_count": 3,
412 | 360 | "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
|
413 | 361 | "metadata": {},
|
414 | 362 | "outputs": [
|
415 | 363 | {
|
416 | 364 | "data": {
|
417 | 365 | "application/vnd.jupyter.widget-view+json": {
|
418 | | - "model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879",
| 366 | + "model_id": "b0f65ad3139a477282c002eafe409d94",
419 | 367 | "version_major": 2,
|
420 | 368 | "version_minor": 0
|
421 | 369 | },
|
422 | 370 | "text/plain": [
|
423 | 371 | "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
|
424 | 372 | ]
|
425 | 373 | },
|
426 | | - "execution_count": 4,
| 374 | + "execution_count": 3,
427 | 375 | "metadata": {},
|
428 | 376 | "output_type": "execute_result"
|
429 | 377 | }
|
|
454 | 402 | },
|
455 | 403 | {
|
456 | 404 | "cell_type": "code",
|
457 | | - "execution_count": 9,
| 405 | + "execution_count": 4,
458 | 406 | "id": "13f03634",
|
459 | 407 | "metadata": {},
|
460 | 408 | "outputs": [
|
|
496 | 444 | },
|
497 | 445 | {
|
498 | 446 | "cell_type": "code",
|
499 | | - "execution_count": null,
| 447 | + "execution_count": 6,
500 | 448 | "id": "1f4ea9e5",
|
501 | 449 | "metadata": {},
|
502 | 450 | "outputs": [
|
503 | 451 | {
|
504 | 452 | "name": "stdout",
|
505 | 453 | "output_type": "stream",
|
506 | 454 | "text": [
|
507 | | - "Initializing Auto-Regressive pipline...\n",
508 | | - "Running Warmup...\n",
509 | 455 | "Running Auto-Regressive generation...\n"
|
510 | 456 | ]
|
511 | 457 | },
|
512 | 458 | {
|
513 | 459 | "name": "stderr",
|
514 | 460 | "output_type": "stream",
|
515 | 461 | "text": [
|
516 | | - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]"
| 462 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]"
517 | 463 | ]
|
518 | 464 | },
|
519 | 465 | {
|
|
533 | 479 | {
|
534 | 480 | "data": {
|
535 | 481 | "text/plain": [
|
536 | | - "27"
| 482 | + "9"
537 | 483 | ]
|
538 | 484 | },
|
539 | | - "execution_count": 10,
| 485 | + "execution_count": 6,
540 | 486 | "metadata": {},
|
541 | 487 | "output_type": "execute_result"
|
542 | 488 | }
|
|
546 | 492 | "import time\n",
|
547 | 493 | "from tqdm import tqdm\n",
|
548 | 494 | "\n",
|
549 | | - "print(\"Initializing Auto-Regressive pipline...\")\n",
| 495 | + "print(\"Running Auto-Regressive generation...\")\n",
550 | 496 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
551 | 497 | "\n",
|
552 | 498 | "config = ov_genai.GenerationConfig()\n",
|
553 | | - "config.apply_chat_template = False\n",
554 | 499 | "config.max_new_tokens = 330\n",
|
555 | | - "if data_type.value == \"Code\":\n",
556 | | - " config.max_new_tokens = 128\n",
557 | | - " \n",
558 | | - "# warmup\n",
559 | | - "print(\"Running Warmup...\")\n",
560 | | - "for i in range(10):\n",
561 | | - " pipe.generate(\"this is a warmup prompt\", config)\n",
562 | | - " \n",
563 | | - "print(\"Running Auto-Regressive generation...\")\n",
| 500 | + "\n",
564 | 501 | "times_auto_regressive = []\n",
|
565 | 502 | "for prompt in tqdm(prompts):\n",
|
566 | 503 | " start_time = time.perf_counter()\n",
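
The tail of this timing loop is collapsed in the diff; the body presumably completes as in this sketch, one wall-clock measurement per prompt:

```python
times_auto_regressive = []
for prompt in tqdm(prompts):
    start_time = time.perf_counter()
    pipe.generate(prompt, config)  # output text is not needed for timing
    end_time = time.perf_counter()
    times_auto_regressive.append(end_time - start_time)
```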
|
|
586 | 523 | },
|
587 | 524 | {
|
588 | 525 | "cell_type": "code",
|
589 | | - "execution_count": null,
| 526 | + "execution_count": 7,
590 | 527 | "id": "d73e9f37",
|
591 | 528 | "metadata": {},
|
592 | 529 | "outputs": [
|
593 | 530 | {
|
594 | 531 | "name": "stdout",
|
595 | 532 | "output_type": "stream",
|
596 | 533 | "text": [
|
597 | | - "Initializing Speculative-Decoding pipline...\n",
598 | | - "Running Warmup...\n",
599 | 534 | "Running Speculative Decoding generation...\n"
|
600 | 535 | ]
|
601 | 536 | },
|
602 | 537 | {
|
603 | 538 | "name": "stderr",
|
604 | 539 | "output_type": "stream",
|
605 | 540 | "text": [
|
606 | | - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]"
| 541 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]"
607 | 542 | ]
|
608 | 543 | },
|
609 | 544 | {
|
|
622 | 557 | }
|
623 | 558 | ],
|
624 | 559 | "source": [
|
| 560 | + "scheduler_config = ov_genai.SchedulerConfig()\n",
| 561 | + "# cache params\n",
| 562 | + "scheduler_config.cache_size = 0\n",
| 563 | + "scheduler_config.num_kv_blocks = 2048 // 8\n",
| 564 | + "scheduler_config.max_num_batched_tokens = 2048\n",
625 | 565 | "\n",
|
626 | | - "config.num_assistant_tokens = 5\n",
627 | 566 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
628 | 567 | "\n",
|
629 | | - "print(\"Initializing Speculative-Decoding pipline...\")\n",
630 | | - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
| 568 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
| 569 | + "\n",
| 570 | + "config = ov_genai.GenerationConfig()\n",
| 571 | + "config.max_new_tokens = 330\n",
| 572 | + "config.num_assistant_tokens = 5\n",
631 | 573 | "\n",
|
632 | | - "# warmup\n",
633 | | - "print(\"Running Warmup...\")\n",
634 | | - "for i in range(10):\n",
635 | | - " pipe.generate(\"this is a warmup prompt\", config)\n",
636 | 574 | "\n",
|
637 | 575 | "times_speculative_decoding = []\n",
|
638 | 576 | "print(\"Running Speculative Decoding generation...\")\n",
|
|
655 | 593 | },
|
656 | 594 | {
|
657 | 595 | "cell_type": "code",
|
658 | | - "execution_count": 12,
| 596 | + "execution_count": 8,
659 | 597 | "id": "ad898772",
|
660 | 598 | "metadata": {},
|
661 | 599 | "outputs": [
|
662 | 600 | {
|
663 | 601 | "name": "stdout",
|
664 | 602 | "output_type": "stream",
|
665 | 603 | "text": [
|
666 | | - "average speedup: 1.09\n"
| 604 | + "average speedup: 2.23\n"
667 | 605 | ]
|
668 | 606 | }
|
669 | 607 | ],
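
The cell that prints this number is unchanged and therefore outside the diff. One plausible reconstruction, assuming a mean of per-prompt ratios over the two timing lists collected above (the notebook's own formula may differ):

```python
speedups = [
    t_ar / t_sd
    for t_ar, t_sd in zip(times_auto_regressive, times_speculative_decoding)
]
print(f"average speedup: {sum(speedups) / len(speedups):.2f}")
```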
|
|
689 | 627 | "name": "python",
|
690 | 628 | "nbconvert_exporter": "python",
|
691 | 629 | "pygments_lexer": "ipython3",
|
692 | | - "version": "3.12.9"
| 630 | + "version": "3.12.7"
693 | 631 | },
|
694 | 632 | "openvino_notebooks": {
|
695 | 633 | "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
|
|