|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 |
| - "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets" |
| 78 | + "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets" |
79 | 79 | ]
|
80 | 80 | },
|
81 | 81 | {
|
|
88 | 88 | "[back to top ⬆️](#Table-of-contents:)\n",
|
89 | 89 | "\n",
|
90 | 90 | "As an example, we will use already converted LLMs from the [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
|
91 |
| - "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As example we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft.\n", |
| 91 | + "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 to be used for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as draft.\n", |
92 | 92 | "\n",
|
93 | 93 | "If you want to run your own models, convert them using the [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library, accelerated by the OpenVINO integration. More details about model preparation can be found in the [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
|
94 | 94 | ]
|
95 | 95 | },
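For reference, a minimal conversion sketch using the Optimum Intel Python API (assuming `optimum-intel` with the OpenVINO extras is installed; the model id here is only an illustration):

```python
# Minimal sketch: export a Hugging Face causal LM to OpenVINO IR with
# Optimum Intel and save it locally. Note that openvino_genai additionally
# needs the tokenizer converted to OpenVINO IR; see the linked guide.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "microsoft/Phi-3-mini-4k-instruct"  # illustrative id
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True)
ov_model.save_pretrained("Phi-3-mini-4k-instruct-ov")
AutoTokenizer.from_pretrained(model_id).save_pretrained("Phi-3-mini-4k-instruct-ov")
```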
|
| 96 | + { |
| 97 | + "cell_type": "markdown", |
| 98 | + "id": "61ecfe17", |
| 99 | + "metadata": {}, |
| 100 | + "source": [ |
| 101 | + "### Select model\n", |
| 102 | + "[back to top ⬆️](#Table-of-contents:)" |
| 103 | + ] |
| 104 | + }, |
96 | 105 | {
|
97 | 106 | "cell_type": "code",
|
98 |
| - "execution_count": 1, |
| 107 | + "execution_count": null, |
| 108 | + "id": "fe934261", |
| 109 | + "metadata": {}, |
| 110 | + "outputs": [ |
| 111 | + { |
| 112 | + "data": { |
| 113 | + "application/vnd.jupyter.widget-view+json": { |
| 114 | + "model_id": "b65944bfc43c4acebea6ec5ebd78f981", |
| 115 | + "version_major": 2, |
| 116 | + "version_minor": 0 |
| 117 | + }, |
| 118 | + "text/plain": [ |
| 119 | + "Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')" |
| 120 | + ] |
| 121 | + }, |
| 122 | + "execution_count": 1, |
| 123 | + "metadata": {}, |
| 124 | + "output_type": "execute_result" |
| 125 | + } |
| 126 | + ], |
| 127 | + "source": [ |
| 128 | + "import ipywidgets as widgets\n", |
| 129 | + "\n", |
| 130 | + "model = widgets.Dropdown(\n", |
| 131 | + " options=[\"Phi-3\", \"Phi-4\"],\n", |
| 132 | + " value=\"Phi-3\", # default value\n", |
| 133 | + " description=\"Select Model:\",\n", |
| 134 | + ")\n", |
| 135 | + "\n", |
| 136 | + "model" |
| 137 | + ] |
| 138 | + }, |
| 139 | + { |
| 140 | + "cell_type": "markdown", |
| 141 | + "id": "ccca9ef6", |
| 142 | + "metadata": {}, |
| 143 | + "source": [ |
| 144 | + "### Download target and draft models" |
| 145 | + ] |
| 146 | + }, |
| 147 | + { |
| 148 | + "cell_type": "code", |
| 149 | + "execution_count": 2, |
99 | 150 | "id": "74bb9f96",
|
100 | 151 | "metadata": {},
|
101 | 152 | "outputs": [],
|
102 | 153 | "source": [
|
103 | 154 | "from pathlib import Path\n",
|
104 | 155 | "import huggingface_hub as hf_hub\n",
|
105 | 156 | "\n",
|
106 |
| - "draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n", |
107 |
| - "target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n", |
| 157 | + "if model.value == \"Phi-4\":\n", |
| 158 | + " target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n", |
| 159 | + " draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n", |
| 160 | + "elif model.value == \"Phi-3\":\n", |
| 161 | + " target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n", |
| 162 | + " draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n", |
| 163 | + "else:\n", |
| 164 | + "    print(f\"Model {model.value} is not supported in this demo.\")\n",
108 | 165 | "\n",
|
109 | 166 | "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
|
110 | 167 | "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
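The download itself is elided from this hunk; a sketch of what that step can look like with `huggingface_hub` (assumption only, the actual cell may differ):

```python
# Sketch: fetch both models from the Hub into the local folders defined
# above, skipping the download when a folder is already present.
if not draft_model_path.exists():
    hf_hub.snapshot_download(draft_model_id, local_dir=draft_model_path)
if not target_model_path.exists():
    hf_hub.snapshot_download(target_model_id, local_dir=target_model_path)
```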
|
|
131 | 188 | },
|
132 | 189 | {
|
133 | 190 | "cell_type": "code",
|
134 |
| - "execution_count": 2, |
| 191 | + "execution_count": 3, |
135 | 192 | "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
|
136 | 193 | "metadata": {
|
137 | 194 | "tags": []
|
|
140 | 197 | {
|
141 | 198 | "data": {
|
142 | 199 | "application/vnd.jupyter.widget-view+json": {
|
143 |
| - "model_id": "37ad0b345de94225892c9d47519a9164", |
| 200 | + "model_id": "093ab5068fc542a0826fbd4f8a8d97b8", |
144 | 201 | "version_major": 2,
|
145 | 202 | "version_minor": 0
|
146 | 203 | },
|
147 | 204 | "text/plain": [
|
148 | 205 | "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
|
149 | 206 | ]
|
150 | 207 | },
|
151 |
| - "execution_count": 2, |
152 | 208 | "metadata": {},
|
153 |
| - "output_type": "execute_result" |
| 209 | + "output_type": "display_data" |
154 | 210 | }
|
155 | 211 | ],
|
156 | 212 | "source": [
|
|
165 | 221 | "from notebook_utils import device_widget\n",
|
166 | 222 | "\n",
|
167 | 223 | "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
|
168 |
| - "\n", |
169 |
| - "device\n", |
| 224 | + "display(device)\n", |
170 | 225 | "\n",
|
171 | 226 | "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
|
172 | 227 | "from notebook_utils import collect_telemetry\n",
|
|
200 | 255 | "import openvino_genai as ov_genai\n",
|
201 | 256 | "import time\n",
|
202 | 257 | "\n",
|
| 258 | + "print(\"Initializing Auto-Regressive pipeline...\")\n", |
203 | 259 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
204 | 260 | "\n",
|
205 | 261 | "config = ov_genai.GenerationConfig()\n",
|
206 |
| - "config.max_new_tokens = 330\n", |
| 262 | + "config.max_new_tokens = 200\n", |
| 263 | + "config.apply_chat_template = False\n", |
| 264 | + "\n", |
207 | 265 | "prompt = '''<s>\n",
|
208 | 266 | "\n",
|
209 | 267 | "def prime_fib(n: int):\n",
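Since `apply_chat_template = False` feeds the prompt verbatim, the timing measures raw completion speed. A quick throughput estimate from the wall-clock numbers (a sketch, assuming the cell ends with the usual `start_time`/`end_time` pair as in the speculative cell below):

```python
# Sketch: rough decode throughput; assumes the run emitted close to
# config.max_new_tokens new tokens.
elapsed = end_time - start_time
print(f"~{config.max_new_tokens / elapsed:.1f} tokens/s ({elapsed:.2f} s total)")
```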
|
|
272 | 330 | "metadata": {},
|
273 | 331 | "outputs": [],
|
274 | 332 | "source": [
|
275 |
| - "scheduler_config = ov_genai.SchedulerConfig()\n", |
276 |
| - "# cache params\n", |
277 |
| - "scheduler_config.cache_size = 0\n", |
278 |
| - "scheduler_config.num_kv_blocks = 2048 // 8\n", |
279 |
| - "scheduler_config.max_num_batched_tokens = 2048\n", |
280 |
| - "\n", |
281 | 333 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
| 334 | + "config.num_assistant_tokens = 5\n", |
282 | 335 | "\n",
|
283 |
| - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", |
| 336 | + "print(\"Initializing Speculative-Decoding pipeline...\")\n", |
| 337 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n", |
284 | 338 | "\n",
|
285 |
| - "config = ov_genai.GenerationConfig()\n", |
286 |
| - "config.max_new_tokens = 330\n", |
287 |
| - "config.num_assistant_tokens = 5\n", |
288 | 339 | "start_time = time.perf_counter()\n",
|
289 | 340 | "result = pipe.generate(prompt, config, streamer=streamer)\n",
|
290 | 341 | "end_time = time.perf_counter()"
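With `num_assistant_tokens = 5`, the draft model proposes five candidate tokens per step and the target validates them in a single pass, keeping the longest matching prefix. A toy illustration of that acceptance rule (not the library's internals):

```python
def accept_prefix(draft_tokens, target_tokens):
    # Keep draft tokens only while they match what the target model
    # would have produced itself (greedy validation).
    accepted = []
    for d, t in zip(draft_tokens, target_tokens):
        if d != t:
            break
        accepted.append(d)
    return accepted

# The draft guessed 5 tokens; the target agrees with the first 3.
print(accept_prefix([11, 42, 7, 99, 99], [11, 42, 7, 8, 9]))  # -> [11, 42, 7]
```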
|
|
318 | 369 | "metadata": {},
|
319 | 370 | "outputs": [],
|
320 | 371 | "source": [
|
321 |
| - "config = ov_genai.GenerationConfig()\n", |
322 |
| - "config.max_new_tokens = 100\n", |
| 372 | + "config.num_assistant_tokens = 0  # unset the fixed window; use dynamic drafting\n",
323 | 373 | "config.assistant_confidence_threshold = 0.05\n",
|
| 374 | + "\n", |
324 | 375 | "start_time = time.perf_counter()\n",
|
325 |
| - "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n", |
| 376 | + "result = pipe.generate(prompt, config, streamer)\n", |
326 | 377 | "end_time = time.perf_counter()"
|
327 | 378 | ]
|
328 | 379 | },
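Here `num_assistant_tokens = 0` hands control to `assistant_confidence_threshold`: the draft keeps proposing tokens only while its own per-token confidence stays above 0.05, so the speculation window adapts per step. A toy sketch of that gating (illustration only):

```python
def draft_window(token_probs, threshold=0.05):
    # Stop proposing candidates once the draft's confidence in the
    # next token drops below the threshold.
    proposed = 0
    for p in token_probs:
        if p < threshold:
            break
        proposed += 1
    return proposed

print(draft_window([0.9, 0.6, 0.2, 0.03, 0.5]))  # -> 3 tokens this step
```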
|
|
356 | 407 | },
|
357 | 408 | {
|
358 | 409 | "cell_type": "code",
|
359 |
| - "execution_count": 3, |
| 410 | + "execution_count": 4, |
360 | 411 | "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
|
361 | 412 | "metadata": {},
|
362 | 413 | "outputs": [
|
363 | 414 | {
|
364 | 415 | "data": {
|
365 | 416 | "application/vnd.jupyter.widget-view+json": {
|
366 |
| - "model_id": "b0f65ad3139a477282c002eafe409d94", |
| 417 | + "model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879", |
367 | 418 | "version_major": 2,
|
368 | 419 | "version_minor": 0
|
369 | 420 | },
|
370 | 421 | "text/plain": [
|
371 | 422 | "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
|
372 | 423 | ]
|
373 | 424 | },
|
374 |
| - "execution_count": 3, |
| 425 | + "execution_count": 4, |
375 | 426 | "metadata": {},
|
376 | 427 | "output_type": "execute_result"
|
377 | 428 | }
|
|
402 | 453 | },
|
403 | 454 | {
|
404 | 455 | "cell_type": "code",
|
405 |
| - "execution_count": 4, |
| 456 | + "execution_count": 9, |
406 | 457 | "id": "13f03634",
|
407 | 458 | "metadata": {},
|
408 | 459 | "outputs": [
|
|
444 | 495 | },
|
445 | 496 | {
|
446 | 497 | "cell_type": "code",
|
447 |
| - "execution_count": 6, |
| 498 | + "execution_count": null, |
448 | 499 | "id": "1f4ea9e5",
|
449 | 500 | "metadata": {},
|
450 | 501 | "outputs": [
|
451 | 502 | {
|
452 | 503 | "name": "stdout",
|
453 | 504 | "output_type": "stream",
|
454 | 505 | "text": [
|
| 506 | + "Initializing Auto-Regressive pipeline...\n",
| 507 | + "Running Warmup...\n", |
455 | 508 | "Running Auto-Regressive generation...\n"
|
456 | 509 | ]
|
457 | 510 | },
|
458 | 511 | {
|
459 | 512 | "name": "stderr",
|
460 | 513 | "output_type": "stream",
|
461 | 514 | "text": [
|
462 |
| - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]" |
| 515 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]" |
463 | 516 | ]
|
464 | 517 | },
|
465 | 518 | {
|
|
479 | 532 | {
|
480 | 533 | "data": {
|
481 | 534 | "text/plain": [
|
482 |
| - "9" |
| 535 | + "27" |
483 | 536 | ]
|
484 | 537 | },
|
485 |
| - "execution_count": 6, |
| 538 | + "execution_count": 10, |
486 | 539 | "metadata": {},
|
487 | 540 | "output_type": "execute_result"
|
488 | 541 | }
|
|
492 | 545 | "import time\n",
|
493 | 546 | "from tqdm import tqdm\n",
|
494 | 547 | "\n",
|
495 |
| - "print(\"Running Auto-Regressive generation...\")\n", |
| 548 | + "print(\"Initializing Auto-Regressive pipeline...\")\n",
496 | 549 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
497 | 550 | "\n",
|
498 | 551 | "config = ov_genai.GenerationConfig()\n",
|
| 552 | + "config.apply_chat_template = False\n", |
499 | 553 | "config.max_new_tokens = 330\n",
|
| 554 | + "if data_type.value == \"Code\":\n", |
| 555 | + " config.max_new_tokens = 128\n", |
| 556 | + "\n", |
| 557 | + "# warmup\n", |
| 558 | + "print(\"Running Warmup...\")\n", |
| 559 | + "for _ in range(10):\n",
| 560 | + " pipe.generate(\"this is a warmup prompt\", config)\n", |
500 | 561 | "\n",
|
| 562 | + "print(\"Running Auto-Regressive generation...\")\n", |
501 | 563 | "times_auto_regressive = []\n",
|
502 | 564 | "for prompt in tqdm(prompts):\n",
|
503 | 565 | " start_time = time.perf_counter()\n",
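After the loop (its continuation is elided here), the collected per-prompt latencies can be summarized along these lines (a sketch using the names above):

```python
# Sketch: mean wall-clock latency per prompt for the auto-regressive run.
mean_ar = sum(times_auto_regressive) / len(times_auto_regressive)
print(f"Auto-regressive: {mean_ar:.2f} s/prompt over {len(times_auto_regressive)} prompts")
```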
|
|
523 | 585 | },
|
524 | 586 | {
|
525 | 587 | "cell_type": "code",
|
526 |
| - "execution_count": 7, |
| 588 | + "execution_count": null, |
527 | 589 | "id": "d73e9f37",
|
528 | 590 | "metadata": {},
|
529 | 591 | "outputs": [
|
530 | 592 | {
|
531 | 593 | "name": "stdout",
|
532 | 594 | "output_type": "stream",
|
533 | 595 | "text": [
|
| 596 | + "Initializing Speculative-Decoding pipeline...\n",
| 597 | + "Running Warmup...\n", |
534 | 598 | "Running Speculative Decoding generation...\n"
|
535 | 599 | ]
|
536 | 600 | },
|
537 | 601 | {
|
538 | 602 | "name": "stderr",
|
539 | 603 | "output_type": "stream",
|
540 | 604 | "text": [
|
541 |
| - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]" |
| 605 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]" |
542 | 606 | ]
|
543 | 607 | },
|
544 | 608 | {
|
|
557 | 621 | }
|
558 | 622 | ],
|
559 | 623 | "source": [
|
560 |
| - "scheduler_config = ov_genai.SchedulerConfig()\n", |
561 |
| - "# cache params\n", |
562 |
| - "scheduler_config.cache_size = 0\n", |
563 |
| - "scheduler_config.num_kv_blocks = 2048 // 8\n", |
564 |
| - "scheduler_config.max_num_batched_tokens = 2048\n", |
565 |
| - "\n", |
| 624 | + "config.num_assistant_tokens = 5\n", |
566 | 625 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
567 | 626 | "\n",
|
568 |
| - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", |
569 |
| - "\n", |
570 |
| - "config = ov_genai.GenerationConfig()\n", |
571 |
| - "config.max_new_tokens = 330\n", |
572 |
| - "config.num_assistant_tokens = 5\n", |
| 627 | + "print(\"Initializing Speculative-Decoding pipeline...\")\n",
| 628 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n", |
573 | 629 | "\n",
|
| 630 | + "# warmup\n", |
| 631 | + "print(\"Running Warmup...\")\n", |
| 632 | + "for _ in range(10):\n",
| 633 | + " pipe.generate(\"this is a warmup prompt\", config)\n", |
574 | 634 | "\n",
|
575 | 635 | "times_speculative_decoding = []\n",
|
576 | 636 | "print(\"Running Speculative Decoding generation...\")\n",
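The "average speedup" printed below is plausibly the mean of per-prompt ratios between the two timing lists; a sketch of that computation (assumption: the elided cell does roughly this):

```python
# Sketch: per-prompt speedup of speculative decoding over auto-regressive.
speedups = [ar / sd for ar, sd in zip(times_auto_regressive, times_speculative_decoding)]
print(f"average speedup: {sum(speedups) / len(speedups):.2f}")
```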
|
|
593 | 653 | },
|
594 | 654 | {
|
595 | 655 | "cell_type": "code",
|
596 |
| - "execution_count": 8, |
| 656 | + "execution_count": 12, |
597 | 657 | "id": "ad898772",
|
598 | 658 | "metadata": {},
|
599 | 659 | "outputs": [
|
600 | 660 | {
|
601 | 661 | "name": "stdout",
|
602 | 662 | "output_type": "stream",
|
603 | 663 | "text": [
|
604 |
| - "average speedup: 2.23\n" |
| 664 | + "average speedup: 1.09\n" |
605 | 665 | ]
|
606 | 666 | }
|
607 | 667 | ],
|
|
627 | 687 | "name": "python",
|
628 | 688 | "nbconvert_exporter": "python",
|
629 | 689 | "pygments_lexer": "ipython3",
|
630 |
| - "version": "3.12.7" |
| 690 | + "version": "3.12.9" |
631 | 691 | },
|
632 | 692 | "openvino_notebooks": {
|
633 | 693 | "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
|
|