Commit a1912c0

Revert "Update speculative-sampling notebook to allow Phi-4 model to be selected" (#2940)
Reverts #2936
1 parent: e2c65dc · commit: a1912c0

File tree: 2 files changed, +49 -110 lines
notebooks/speculative-sampling/speculative-sampling.ipynb

Lines changed: 48 additions & 110 deletions
@@ -75,7 +75,7 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets "
+"%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets"
 ]
 },
@@ -88,80 +88,23 @@
 "[back to top ⬆️](#Table-of-contents:)\n",
 "\n",
 "As example, we will use already converted LLMs from [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
-"You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 to be used for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as draft.\n",
+"You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As example we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft.\n",
 "\n",
 "In case, if you want run own models, you should convert them using [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by OpenVINO integration. More details about model preparation can be found in [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
 ]
 },
-{
-"cell_type": "markdown",
-"id": "61ecfe17",
-"metadata": {},
-"source": [
-"### Select model\n",
-"[back to top ⬆️](#Table-of-contents:)"
-]
-},
 {
 "cell_type": "code",
 "execution_count": 1,
-"id": "fe934261",
-"metadata": {},
-"outputs": [
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "b65944bfc43c4acebea6ec5ebd78f981",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')"
-]
-},
-"execution_count": 1,
-"metadata": {},
-"output_type": "execute_result"
-}
-],
-"source": [
-"import ipywidgets as widgets\n",
-"\n",
-"model = widgets.Dropdown(\n",
-" options=['Phi-3', 'Phi-4'],\n",
-" value='Phi-3', # default value\n",
-" description='Select Model:',\n",
-")\n",
-"\n",
-"model"
-]
-},
-{
-"cell_type": "markdown",
-"id": "ccca9ef6",
-"metadata": {},
-"source": [
-"### Download target and draft models"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 2,
 "id": "74bb9f96",
 "metadata": {},
 "outputs": [],
 "source": [
 "from pathlib import Path\n",
 "import huggingface_hub as hf_hub\n",
 "\n",
-"if model.value == \"Phi-4\":\n",
-" target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n",
-" draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n",
-"elif model.value == \"Phi-3\":\n",
-" target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
-" draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
-"else:\n",
-" print(f\"Model {model} is not supported in this demo.\")\n",
+"draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
+"target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
 "\n",
 "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
 "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
@@ -188,7 +131,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 3,
+"execution_count": 2,
 "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
 "metadata": {
 "tags": []
@@ -197,16 +140,17 @@
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "093ab5068fc542a0826fbd4f8a8d97b8",
+"model_id": "37ad0b345de94225892c9d47519a9164",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
 "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
 ]
 },
+"execution_count": 2,
 "metadata": {},
-"output_type": "display_data"
+"output_type": "execute_result"
 }
 ],
 "source": [
@@ -221,7 +165,8 @@
 "from notebook_utils import device_widget\n",
 "\n",
 "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
-"display(device)\n",
+"\n",
+"device\n",
 "\n",
 "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
 "from notebook_utils import collect_telemetry\n",
@@ -255,13 +200,10 @@
 "import openvino_genai as ov_genai\n",
 "import time\n",
 "\n",
-"print(\"Initializing Auto-Regressive pipeline...\")\n",
 "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
 "\n",
 "config = ov_genai.GenerationConfig()\n",
-"config.max_new_tokens = 200\n",
-"config.apply_chat_template = False\n",
-"\n",
+"config.max_new_tokens = 330\n",
 "prompt = '''<s>\n",
 "\n",
 "def prime_fib(n: int):\n",
@@ -330,13 +272,19 @@
 "metadata": {},
 "outputs": [],
 "source": [
+"scheduler_config = ov_genai.SchedulerConfig()\n",
+"# cache params\n",
+"scheduler_config.cache_size = 0\n",
+"scheduler_config.num_kv_blocks = 2048 // 8\n",
+"scheduler_config.max_num_batched_tokens = 2048\n",
 "\n",
 "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
-"config.num_assistant_tokens = 5\n",
 "\n",
-"print(\"Initializing Speculative-Decoding pipeline...\")\n",
-"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
+"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
 "\n",
+"config = ov_genai.GenerationConfig()\n",
+"config.max_new_tokens = 330\n",
+"config.num_assistant_tokens = 5\n",
 "start_time = time.perf_counter()\n",
 "result = pipe.generate(prompt, config, streamer=streamer)\n",
 "end_time = time.perf_counter()"
@@ -370,11 +318,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"config.num_assistant_tokens = 0\n",
+"config = ov_genai.GenerationConfig()\n",
+"config.max_new_tokens = 100\n",
 "config.assistant_confidence_threshold = 0.05\n",
-"\n",
 "start_time = time.perf_counter()\n",
-"result = pipe.generate(prompt, config, streamer)\n",
+"result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n",
 "end_time = time.perf_counter()"
 ]
 },
@@ -408,22 +356,22 @@
 },
 {
 "cell_type": "code",
-"execution_count": 4,
+"execution_count": 3,
 "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
 "metadata": {},
 "outputs": [
 {
 "data": {
 "application/vnd.jupyter.widget-view+json": {
-"model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879",
+"model_id": "b0f65ad3139a477282c002eafe409d94",
 "version_major": 2,
 "version_minor": 0
 },
 "text/plain": [
 "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
 ]
 },
-"execution_count": 4,
+"execution_count": 3,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -454,7 +402,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 4,
 "id": "13f03634",
 "metadata": {},
 "outputs": [
@@ -496,24 +444,22 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 6,
 "id": "1f4ea9e5",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Initializing Auto-Regressive pipline...\n",
-"Running Warmup...\n",
 "Running Auto-Regressive generation...\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]"
+"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]"
 ]
 },
 {
@@ -533,10 +479,10 @@
 {
 "data": {
 "text/plain": [
-"27"
+"9"
 ]
 },
-"execution_count": 10,
+"execution_count": 6,
 "metadata": {},
 "output_type": "execute_result"
 }
@@ -546,21 +492,12 @@
 "import time\n",
 "from tqdm import tqdm\n",
 "\n",
-"print(\"Initializing Auto-Regressive pipline...\")\n",
+"print(\"Running Auto-Regressive generation...\")\n",
 "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
 "\n",
 "config = ov_genai.GenerationConfig()\n",
-"config.apply_chat_template = False\n",
 "config.max_new_tokens = 330\n",
-"if data_type.value == \"Code\":\n",
-" config.max_new_tokens = 128\n",
-" \n",
-"# warmup\n",
-"print(\"Running Warmup...\")\n",
-"for i in range(10):\n",
-" pipe.generate(\"this is a warmup prompt\", config)\n",
-" \n",
-"print(\"Running Auto-Regressive generation...\")\n",
+"\n",
 "times_auto_regressive = []\n",
 "for prompt in tqdm(prompts):\n",
 " start_time = time.perf_counter()\n",
@@ -586,24 +523,22 @@
 },
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 7,
 "id": "d73e9f37",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"Initializing Speculative-Decoding pipline...\n",
-"Running Warmup...\n",
 "Running Speculative Decoding generation...\n"
 ]
 },
 {
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]"
+"100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]"
 ]
 },
 {
@@ -622,17 +557,20 @@
 }
 ],
 "source": [
+"scheduler_config = ov_genai.SchedulerConfig()\n",
+"# cache params\n",
+"scheduler_config.cache_size = 0\n",
+"scheduler_config.num_kv_blocks = 2048 // 8\n",
+"scheduler_config.max_num_batched_tokens = 2048\n",
 "\n",
-"config.num_assistant_tokens = 5\n",
 "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
 "\n",
-"print(\"Initializing Speculative-Decoding pipline...\")\n",
-"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
+"pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
+"\n",
+"config = ov_genai.GenerationConfig()\n",
+"config.max_new_tokens = 330\n",
+"config.num_assistant_tokens = 5\n",
 "\n",
-"# warmup\n",
-"print(\"Running Warmup...\")\n",
-"for i in range(10):\n",
-" pipe.generate(\"this is a warmup prompt\", config)\n",
 "\n",
 "times_speculative_decoding = []\n",
 "print(\"Running Speculative Decoding generation...\")\n",
@@ -655,15 +593,15 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 8,
 "id": "ad898772",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"average speedup: 1.09\n"
+"average speedup: 2.23\n"
 ]
 }
 ],
@@ -689,7 +627,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.12.9"
+"version": "3.12.7"
 },
 "openvino_notebooks": {
 "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",

utils/notebook_utils.py

Lines changed: 1 addition & 0 deletions
@@ -47,6 +47,7 @@ def device_widget(default="AUTO", exclude=None, added=None, description="Device:
     )
     return device
 
+
 def quantization_widget(default=True):
     import ipywidgets as widgets
 