
Commit 6b9d67f

add option to select Phi-4 in speculative-sampling notebook (#2941)
1 parent a1912c0 commit 6b9d67f

File tree

1 file changed: +110 -50 lines changed

notebooks/speculative-sampling/speculative-sampling.ipynb

Lines changed: 110 additions & 50 deletions
@@ -75,7 +75,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets"
+"%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets"
 ]
 },
 {
@@ -88,23 +88,80 @@
 "[back to top ⬆️](#Table-of-contents:)\n",
 "\n",
 "As example, we will use already converted LLMs from [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
-"You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As example we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft.\n",
+"OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as the target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as the draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as the target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as the draft.\n",
 "\n",
 "In case, if you want run own models, you should convert them using [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by OpenVINO integration. More details about model preparation can be found in [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "61ecfe17",
+"metadata": {},
+"source": [
+"### Select model\n",
+"[back to top ⬆️](#Table-of-contents:)"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
+"id": "fe934261",
+"metadata": {},
+"outputs": [
+{
+"data": {
+"application/vnd.jupyter.widget-view+json": {
+"model_id": "b65944bfc43c4acebea6ec5ebd78f981",
+"version_major": 2,
+"version_minor": 0
+},
+"text/plain": [
+"Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')"
+]
+},
+"execution_count": 1,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
+"source": [
+"import ipywidgets as widgets\n",
+"\n",
+"model = widgets.Dropdown(\n",
+"    options=[\"Phi-3\", \"Phi-4\"],\n",
+"    value=\"Phi-3\",  # default value\n",
+"    description=\"Select Model:\",\n",
+")\n",
+"\n",
+"model"
+]
+},
+{
+"cell_type": "markdown",
+"id": "ccca9ef6",
+"metadata": {},
+"source": [
+"### Download target and draft models"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 2,
 "id": "74bb9f96",
 "metadata": {},
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
 "import huggingface_hub as hf_hub\n",
 "\n",
-"draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
-"target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
+"if model.value == \"Phi-4\":\n",
+"    target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n",
+"    draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n",
+"elif model.value == \"Phi-3\":\n",
+"    target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
+"    draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
+"else:\n",
+"    print(f\"Model {model.value} is not supported in this demo.\")\n",
 "\n",
 "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
 "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
@@ -131,7 +188,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 2,
+"execution_count": 3,
 "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
 "metadata": {
 "tags": []
@@ -140,17 +197,16 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "37ad0b345de94225892c9d47519a9164",
+"model_id": "093ab5068fc542a0826fbd4f8a8d97b8",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
 "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
 ]
 },
-"execution_count": 2,
 "metadata": {},
-"output_type": "execute_result"
+"output_type": "display_data"
 }
 ],
 "source": [
@@ -165,8 +221,7 @@
 "from notebook_utils import device_widget\n",
 "\n",
 "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
-"\n",
-"device\n",
+"display(device)\n",
 "\n",
 "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
 "from notebook_utils import collect_telemetry\n",
@@ -200,10 +255,13 @@
 "import openvino_genai as ov_genai\n",
 "import time\n",
 "\n",
+"print(\"Initializing Auto-Regressive pipeline...\")\n",
 "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
 "\n",
 "config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 330\n",
+"config.max_new_tokens = 200\n",
+"config.apply_chat_template = False\n",
+"\n",
 "prompt = '''<s>\n",
 "\n",
 "def prime_fib(n: int):\n",
@@ -272,19 +330,12 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"scheduler_config = ov_genai.SchedulerConfig()\n",
-"# cache params\n",
-"scheduler_config.cache_size = 0\n",
-"scheduler_config.num_kv_blocks = 2048 // 8\n",
-"scheduler_config.max_num_batched_tokens = 2048\n",
-"\n",
 "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
+"config.num_assistant_tokens = 5\n",
 "\n",
-"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
+"print(\"Initializing Speculative-Decoding pipeline...\")\n",
+"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
 "\n",
-"config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 330\n",
-"config.num_assistant_tokens = 5\n",
 "start_time = time.perf_counter()\n",
 "result = pipe.generate(prompt, config, streamer=streamer)\n",
 "end_time = time.perf_counter()"
@@ -318,11 +369,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 100\n",
+"config.num_assistant_tokens = 0\n",
 "config.assistant_confidence_threshold = 0.05\n",
+"\n",
 "start_time = time.perf_counter()\n",
-"result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n",
+"result = pipe.generate(prompt, config, streamer)\n",
 "end_time = time.perf_counter()"
 ]
 },
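This cell now reuses the existing `config` rather than building a fresh one, so `num_assistant_tokens = 0` explicitly clears the fixed draft length before the confidence threshold takes over, and the same code `prompt` replaces the old "Sun is yellow because" string so the two modes are directly comparable. The two knobs trade off roughly as follows (a sketch; that a zero threshold disables dynamic mode is an assumption):

```python
# Static speculation: the draft always proposes a fixed block of tokens,
# and the target verifies the whole block each step.
config.num_assistant_tokens = 5
config.assistant_confidence_threshold = 0.0  # assumed: 0 disables dynamic mode

# Dynamic speculation: no fixed block; the draft keeps proposing tokens
# only while its confidence in each proposed token stays above 0.05.
config.num_assistant_tokens = 0
config.assistant_confidence_threshold = 0.05
```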
@@ -356,22 +407,22 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 4,
 "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "b0f65ad3139a477282c002eafe409d94",
+"model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
 "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
 ]
 },
-"execution_count": 3,
+"execution_count": 4,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -402,7 +453,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 9,
 "id": "13f03634",
 "metadata": {},
 "outputs": [
@@ -444,22 +495,24 @@
 },
 {
 "cell_type": "code",
-"execution_count": 6,
+"execution_count": null,
 "id": "1f4ea9e5",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
+"Initializing Auto-Regressive pipeline...\n",
+"Running Warmup...\n",
 "Running Auto-Regressive generation...\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]"
+"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]"
 ]
 },
 {
@@ -479,10 +532,10 @@
 {
 "data": {
 "text/plain": [
-"9"
+"27"
 ]
 },
-"execution_count": 6,
+"execution_count": 10,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -492,12 +545,21 @@
 "import time\n",
 "from tqdm import tqdm\n",
 "\n",
-"print(\"Running Auto-Regressive generation...\")\n",
+"print(\"Initializing Auto-Regressive pipeline...\")\n",
 "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
 "\n",
 "config = ov_genai.GenerationConfig()\n",
+"config.apply_chat_template = False\n",
 "config.max_new_tokens = 330\n",
+"if data_type.value == \"Code\":\n",
+"    config.max_new_tokens = 128\n",
+"\n",
+"# warmup\n",
+"print(\"Running Warmup...\")\n",
+"for i in range(10):\n",
+"    pipe.generate(\"this is a warmup prompt\", config)\n",
 "\n",
+"print(\"Running Auto-Regressive generation...\")\n",
 "times_auto_regressive = []\n",
 "for prompt in tqdm(prompts):\n",
 "    start_time = time.perf_counter()\n",
@@ -523,22 +585,24 @@
 },
 {
 "cell_type": "code",
-"execution_count": 7,
+"execution_count": null,
 "id": "d73e9f37",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
+"Initializing Speculative-Decoding pipeline...\n",
+"Running Warmup...\n",
 "Running Speculative Decoding generation...\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]"
+"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]"
 ]
 },
 {
@@ -557,20 +621,16 @@
 }
 ],
 "source": [
-"scheduler_config = ov_genai.SchedulerConfig()\n",
-"# cache params\n",
-"scheduler_config.cache_size = 0\n",
-"scheduler_config.num_kv_blocks = 2048 // 8\n",
-"scheduler_config.max_num_batched_tokens = 2048\n",
-"\n",
+"config.num_assistant_tokens = 5\n",
 "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
 "\n",
-"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
-"\n",
-"config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 330\n",
-"config.num_assistant_tokens = 5\n",
+"print(\"Initializing Speculative-Decoding pipeline...\")\n",
+"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
 "\n",
+"# warmup\n",
+"print(\"Running Warmup...\")\n",
+"for i in range(10):\n",
+"    pipe.generate(\"this is a warmup prompt\", config)\n",
 "\n",
 "times_speculative_decoding = []\n",
 "print(\"Running Speculative Decoding generation...\")\n",
@@ -593,15 +653,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 8,
+"execution_count": 12,
 "id": "ad898772",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"average speedup: 2.23\n"
+"average speedup: 1.09\n"
 ]
 }
 ],
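The recorded figure drops from 2.23 to 1.09 simply because the stored outputs were regenerated for this commit; the stderr timings above (5.40 s/it baseline vs 5.01 s/it speculative) are consistent with a ratio of about 1.08. The cell body is outside this hunk, so the formula below is an assumption; the natural reading is an average of per-prompt ratios over the two timing lists.

```python
# Assumed computation of the printed "average speedup" (the actual cell
# body is not shown in this diff): mean of per-prompt time ratios.
avg_speedup = sum(
    ar / sd for ar, sd in zip(times_auto_regressive, times_speculative_decoding)
) / len(times_auto_regressive)
print(f"average speedup: {avg_speedup:.2f}")
```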
@@ -627,7 +687,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.12.7"
+"version": "3.12.9"
 },
 "openvino_notebooks": {
 "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
