Commit e2c65dc

shira-g and danielkorat authored
Update speculative-sampling notebook to allow Phi-4 model to be selected (#2936)
Co-authored-by: Daniel Korat <daniel.korat@intel.com>
1 parent b81bba9 · commit e2c65dc

File tree

2 files changed: +110 −49 lines changed


notebooks/speculative-sampling/speculative-sampling.ipynb

Lines changed: 110 additions & 48 deletions
@@ -75,7 +75,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets"
+"%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets"
 ]
 },
 {
@@ -88,23 +88,80 @@
 "[back to top ⬆️](#Table-of-contents:)\n",
 "\n",
 "As an example, we will use already converted LLMs from the [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
-"You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As example we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft.\n",
+"OpenVINO-optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as the target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as the draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as the target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as the draft.\n",
 "\n",
 "If you want to run your own models, convert them using the [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by the OpenVINO integration. More details about model preparation can be found in the [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "61ecfe17",
+"metadata": {},
+"source": [
+"### Select model\n",
+"[back to top ⬆️](#Table-of-contents:)"
+]
+},
 {
 "cell_type": "code",
 "execution_count": 1,
+"id": "fe934261",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "b65944bfc43c4acebea6ec5ebd78f981",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')"
+]
+},
+"execution_count": 1,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"import ipywidgets as widgets\n",
+"\n",
+"model = widgets.Dropdown(\n",
+"    options=['Phi-3', 'Phi-4'],\n",
+"    value='Phi-3',  # default value\n",
+"    description='Select Model:',\n",
+")\n",
+"\n",
+"model"
+]
+},
+{
+"cell_type": "markdown",
+"id": "ccca9ef6",
+"metadata": {},
+"source": [
+"### Download target and draft models"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
 "id": "74bb9f96",
 "metadata": {},
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
 "import huggingface_hub as hf_hub\n",
 "\n",
-"draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
-"target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
+"if model.value == \"Phi-4\":\n",
+"    target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n",
+"    draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n",
+"elif model.value == \"Phi-3\":\n",
+"    target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
+"    draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
+"else:\n",
+"    print(f\"Model {model.value} is not supported in this demo.\")\n",
 "\n",
 "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
 "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
@@ -131,7 +188,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 3,
 "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
 "metadata": {
 "tags": []
@@ -140,17 +197,16 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "37ad0b345de94225892c9d47519a9164",
+"model_id": "093ab5068fc542a0826fbd4f8a8d97b8",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
 "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
 ]
 },
-"execution_count": 2,
 "metadata": {},
-"output_type": "execute_result"
+"output_type": "display_data"
 }
 ],
 "source": [
@@ -165,8 +221,7 @@
 "from notebook_utils import device_widget\n",
 "\n",
 "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
-"\n",
-"device\n",
+"display(device)\n",
 "\n",
 "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
 "from notebook_utils import collect_telemetry\n",
@@ -200,10 +255,13 @@
 "import openvino_genai as ov_genai\n",
 "import time\n",
 "\n",
+"print(\"Initializing Auto-Regressive pipeline...\")\n",
 "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
 "\n",
 "config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 330\n",
+"config.max_new_tokens = 200\n",
+"config.apply_chat_template = False\n",
+"\n",
 "prompt = '''<s>\n",
 "\n",
 "def prime_fib(n: int):\n",
@@ -272,19 +330,13 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"scheduler_config = ov_genai.SchedulerConfig()\n",
-"# cache params\n",
-"scheduler_config.cache_size = 0\n",
-"scheduler_config.num_kv_blocks = 2048 // 8\n",
-"scheduler_config.max_num_batched_tokens = 2048\n",
 "\n",
 "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
+"config.num_assistant_tokens = 5\n",
 "\n",
-"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
+"print(\"Initializing Speculative-Decoding pipeline...\")\n",
+"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
 "\n",
-"config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 330\n",
-"config.num_assistant_tokens = 5\n",
 "start_time = time.perf_counter()\n",
 "result = pipe.generate(prompt, config, streamer=streamer)\n",
 "end_time = time.perf_counter()"
@@ -318,11 +370,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 100\n",
+"config.num_assistant_tokens = 0\n",
 "config.assistant_confidence_threshold = 0.05\n",
+"\n",
 "start_time = time.perf_counter()\n",
-"result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n",
+"result = pipe.generate(prompt, config, streamer)\n",
 "end_time = time.perf_counter()"
 ]
 },
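Taken together, this hunk and the previous one switch the same GenerationConfig between the two speculation modes that openvino_genai exposes. A side-by-side sketch, reusing only parameters that already appear in this diff:

import openvino_genai as ov_genai

# Static speculation: the draft model proposes a fixed number of candidate
# tokens per step, which the target model then verifies in a single pass.
static_cfg = ov_genai.GenerationConfig()
static_cfg.max_new_tokens = 200
static_cfg.num_assistant_tokens = 5

# Dynamic speculation: the draft keeps proposing tokens while its confidence
# stays above the threshold; num_assistant_tokens is reset to 0 so the
# threshold, not a fixed count, bounds each draft window.
dynamic_cfg = ov_genai.GenerationConfig()
dynamic_cfg.max_new_tokens = 200
dynamic_cfg.num_assistant_tokens = 0
dynamic_cfg.assistant_confidence_threshold = 0.05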
@@ -356,22 +408,22 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 4,
 "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "b0f65ad3139a477282c002eafe409d94",
+"model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
 "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
 ]
 },
-"execution_count": 3,
+"execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
402454
},
403455
{
404456
"cell_type": "code",
405-
"execution_count": 4,
457+
"execution_count": 9,
406458
"id": "13f03634",
407459
"metadata": {},
408460
"outputs": [
@@ -444,22 +496,24 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": null,
 "id": "1f4ea9e5",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
+"Initializing Auto-Regressive pipeline...\n",
+"Running Warmup...\n",
 "Running Auto-Regressive generation...\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]"
+"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]"
 ]
 },
 {
@@ -479,10 +533,10 @@
 {
 "data": {
 "text/plain": [
-"9"
+"27"
 ]
 },
-"execution_count": 6,
+"execution_count": 10,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -492,12 +546,21 @@
 "import time\n",
 "from tqdm import tqdm\n",
 "\n",
-"print(\"Running Auto-Regressive generation...\")\n",
+"print(\"Initializing Auto-Regressive pipeline...\")\n",
 "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
 "\n",
 "config = ov_genai.GenerationConfig()\n",
+"config.apply_chat_template = False\n",
 "config.max_new_tokens = 330\n",
-"\n",
+"if data_type.value == \"Code\":\n",
+"    config.max_new_tokens = 128\n",
+"\n",
+"# warmup\n",
+"print(\"Running Warmup...\")\n",
+"for i in range(10):\n",
+"    pipe.generate(\"this is a warmup prompt\", config)\n",
+"\n",
+"print(\"Running Auto-Regressive generation...\")\n",
 "times_auto_regressive = []\n",
 "for prompt in tqdm(prompts):\n",
 "    start_time = time.perf_counter()\n",
@@ -523,22 +586,24 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "id": "d73e9f37",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
+"Initializing Speculative-Decoding pipeline...\n",
+"Running Warmup...\n",
 "Running Speculative Decoding generation...\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]"
+"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]"
 ]
 },
 {
@@ -557,20 +622,17 @@
 }
 ],
 "source": [
-"scheduler_config = ov_genai.SchedulerConfig()\n",
-"# cache params\n",
-"scheduler_config.cache_size = 0\n",
-"scheduler_config.num_kv_blocks = 2048 // 8\n",
-"scheduler_config.max_num_batched_tokens = 2048\n",
 "\n",
+"config.num_assistant_tokens = 5\n",
 "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
 "\n",
-"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
-"\n",
-"config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 330\n",
-"config.num_assistant_tokens = 5\n",
+"print(\"Initializing Speculative-Decoding pipeline...\")\n",
+"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
 "\n",
+"# warmup\n",
+"print(\"Running Warmup...\")\n",
+"for i in range(10):\n",
+"    pipe.generate(\"this is a warmup prompt\", config)\n",
 "\n",
 "times_speculative_decoding = []\n",
 "print(\"Running Speculative Decoding generation...\")\n",
@@ -593,15 +655,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 12,
 "id": "ad898772",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"average speedup: 2.23\n"
+"average speedup: 1.09\n"
 ]
 }
 ],
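The "average speedup" value in this hunk is computed by a cell outside the diff. A plausible reconstruction from the two per-prompt timing lists collected by the benchmark loops above:

# times_auto_regressive and times_speculative_decoding are the wall-clock
# lists filled while iterating over the same 50 prompts.
speedups = [t_ar / t_sd for t_ar, t_sd in zip(times_auto_regressive, times_speculative_decoding)]
print(f"average speedup: {sum(speedups) / len(speedups):.2f}")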
@@ -627,7 +689,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.12.7"
+"version": "3.12.9"
 },
 "openvino_notebooks": {
 "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",

utils/notebook_utils.py

Lines changed: 0 additions & 1 deletion
@@ -47,7 +47,6 @@ def device_widget(default="AUTO", exclude=None, added=None, description="Device:
     )
     return device
 
-
 def quantization_widget(default=True):
     import ipywidgets as widgets
 