|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 |
| - "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets" |
| 78 | + "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets " |
79 | 79 | ]
|
80 | 80 | },
|
81 | 81 | {
|
|
88 | 88 | "[back to top ⬆️](#Table-of-contents:)\n",
|
89 | 89 | "\n",
|
90 | 90 | "As example, we will use already converted LLMs from [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
|
91 |
| - "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As example we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft.\n", |
| 91 | + "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 to be used for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as draft.\n", |
92 | 92 | "\n",
|
93 | 93 | "In case, if you want run own models, you should convert them using [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by OpenVINO integration. More details about model preparation can be found in [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
|
94 | 94 | ]
|
95 | 95 | },
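| + {
| + "cell_type": "markdown",
| + "id": "a7c3e9d2",
| + "metadata": {},
| + "source": [
| + "A minimal conversion sketch using the `optimum.intel` Python API (the model id and output directory below are illustrative examples, not the models used in this notebook):\n",
| + "\n",
| + "```python\n",
| + "from optimum.intel import OVModelForCausalLM\n",
| + "\n",
| + "# Export a Hugging Face model to OpenVINO IR on the fly and save it locally\n",
| + "ov_model = OVModelForCausalLM.from_pretrained(\"microsoft/Phi-3-mini-4k-instruct\", export=True)\n",
| + "ov_model.save_pretrained(\"Phi-3-mini-4k-instruct-ov\")\n",
| + "```\n",
| + "\n",
| + "The same export can also be done from the command line with `optimum-cli export openvino`."
| + ]
| + },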
|
| 96 | + { |
| 97 | + "cell_type": "markdown", |
| 98 | + "id": "61ecfe17", |
| 99 | + "metadata": {}, |
| 100 | + "source": [ |
| 101 | + "### Select model\n", |
| 102 | + "[back to top ⬆️](#Table-of-contents:)" |
| 103 | + ] |
| 104 | + }, |
96 | 105 | {
|
97 | 106 | "cell_type": "code",
|
98 | 107 | "execution_count": 1,
|
| 108 | + "id": "fe934261", |
| 109 | + "metadata": {}, |
| 110 | + "outputs": [ |
| 111 | + { |
| 112 | + "data": { |
| 113 | + "application/vnd.jupyter.widget-view+json": { |
| 114 | + "model_id": "b65944bfc43c4acebea6ec5ebd78f981", |
| 115 | + "version_major": 2, |
| 116 | + "version_minor": 0 |
| 117 | + }, |
| 118 | + "text/plain": [ |
| 119 | + "Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')" |
| 120 | + ] |
| 121 | + }, |
| 122 | + "execution_count": 1, |
| 123 | + "metadata": {}, |
| 124 | + "output_type": "execute_result" |
| 125 | + } |
| 126 | + ], |
| 127 | + "source": [ |
| 128 | + "import ipywidgets as widgets\n", |
| 129 | + "\n", |
| 130 | + "model = widgets.Dropdown(\n", |
| 131 | + " options=['Phi-3', 'Phi-4'],\n", |
| 132 | + " value='Phi-3', # default value\n", |
| 133 | + " description='Select Model:',\n", |
| 134 | + ")\n", |
| 135 | + "\n", |
| 136 | + "model" |
| 137 | + ] |
| 138 | + }, |
| 139 | + { |
| 140 | + "cell_type": "markdown", |
| 141 | + "id": "ccca9ef6", |
| 142 | + "metadata": {}, |
| 143 | + "source": [ |
| 144 | + "### Download target and draft models" |
| 145 | + ] |
| 146 | + }, |
| 147 | + { |
| 148 | + "cell_type": "code", |
| 149 | + "execution_count": 2, |
99 | 150 | "id": "74bb9f96",
|
100 | 151 | "metadata": {},
|
101 | 152 | "outputs": [],
|
102 | 153 | "source": [
|
103 | 154 | "from pathlib import Path\n",
|
104 | 155 | "import huggingface_hub as hf_hub\n",
|
105 | 156 | "\n",
|
106 |
| - "draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n", |
107 |
| - "target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n", |
| 157 | + "if model.value == \"Phi-4\":\n", |
| 158 | + " target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n", |
| 159 | + " draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n", |
| 160 | + "elif model.value == \"Phi-3\":\n", |
| 161 | + " target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n", |
| 162 | + " draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n", |
| 163 | + "else:\n", |
| 164 | + " print(f\"Model {model} is not supported in this demo.\")\n", |
108 | 165 | "\n",
|
109 | 166 | "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
|
110 | 167 | "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
|
|
131 | 188 | },
|
132 | 189 | {
|
133 | 190 | "cell_type": "code",
|
134 |
| - "execution_count": 2, |
| 191 | + "execution_count": 3, |
135 | 192 | "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
|
136 | 193 | "metadata": {
|
137 | 194 | "tags": []
|
|
140 | 197 | {
|
141 | 198 | "data": {
|
142 | 199 | "application/vnd.jupyter.widget-view+json": {
|
143 |
| - "model_id": "37ad0b345de94225892c9d47519a9164", |
| 200 | + "model_id": "093ab5068fc542a0826fbd4f8a8d97b8", |
144 | 201 | "version_major": 2,
|
145 | 202 | "version_minor": 0
|
146 | 203 | },
|
147 | 204 | "text/plain": [
|
148 | 205 | "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
|
149 | 206 | ]
|
150 | 207 | },
|
151 |
| - "execution_count": 2, |
152 | 208 | "metadata": {},
|
153 |
| - "output_type": "execute_result" |
| 209 | + "output_type": "display_data" |
154 | 210 | }
|
155 | 211 | ],
|
156 | 212 | "source": [
|
|
165 | 221 | "from notebook_utils import device_widget\n",
|
166 | 222 | "\n",
|
167 | 223 | "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
|
168 |
| - "\n", |
169 |
| - "device\n", |
| 224 | + "display(device)\n", |
170 | 225 | "\n",
|
171 | 226 | "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
|
172 | 227 | "from notebook_utils import collect_telemetry\n",
|
|
200 | 255 | "import openvino_genai as ov_genai\n",
|
201 | 256 | "import time\n",
|
202 | 257 | "\n",
|
| 258 | + "print(\"Initializing Auto-Regressive pipeline...\")\n", |
203 | 259 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
204 | 260 | "\n",
|
205 | 261 | "config = ov_genai.GenerationConfig()\n",
|
206 |
| - "config.max_new_tokens = 330\n", |
| 262 | + "config.max_new_tokens = 200\n", |
| 263 | + "config.apply_chat_template = False\n", |
| 264 | + "\n", |
207 | 265 | "prompt = '''<s>\n",
|
208 | 266 | "\n",
|
209 | 267 | "def prime_fib(n: int):\n",
|
|
272 | 330 | "metadata": {},
|
273 | 331 | "outputs": [],
|
274 | 332 | "source": [
|
275 |
| - "scheduler_config = ov_genai.SchedulerConfig()\n", |
276 |
| - "# cache params\n", |
277 |
| - "scheduler_config.cache_size = 0\n", |
278 |
| - "scheduler_config.num_kv_blocks = 2048 // 8\n", |
279 |
| - "scheduler_config.max_num_batched_tokens = 2048\n", |
280 | 333 | "\n",
|
281 | 334 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
| 335 | + "config.num_assistant_tokens = 5\n", |
282 | 336 | "\n",
|
283 |
| - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", |
| 337 | + "print(\"Initializing Speculative-Decoding pipeline...\")\n", |
| 338 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n", |
284 | 339 | "\n",
|
285 |
| - "config = ov_genai.GenerationConfig()\n", |
286 |
| - "config.max_new_tokens = 330\n", |
287 |
| - "config.num_assistant_tokens = 5\n", |
288 | 340 | "start_time = time.perf_counter()\n",
|
289 | 341 | "result = pipe.generate(prompt, config, streamer=streamer)\n",
|
290 | 342 | "end_time = time.perf_counter()"
|
|
318 | 370 | "metadata": {},
|
319 | 371 | "outputs": [],
|
320 | 372 | "source": [
|
321 |
| - "config = ov_genai.GenerationConfig()\n", |
322 |
| - "config.max_new_tokens = 100\n", |
| 373 | + "config.num_assistant_tokens = 0\n", |
323 | 374 | "config.assistant_confidence_threshold = 0.05\n",
|
| 375 | + "\n", |
324 | 376 | "start_time = time.perf_counter()\n",
|
325 |
| - "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n", |
| 377 | + "result = pipe.generate(prompt, config, streamer)\n", |
326 | 378 | "end_time = time.perf_counter()"
|
327 | 379 | ]
|
328 | 380 | },
|
|
356 | 408 | },
|
357 | 409 | {
|
358 | 410 | "cell_type": "code",
|
359 |
| - "execution_count": 3, |
| 411 | + "execution_count": 4, |
360 | 412 | "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
|
361 | 413 | "metadata": {},
|
362 | 414 | "outputs": [
|
363 | 415 | {
|
364 | 416 | "data": {
|
365 | 417 | "application/vnd.jupyter.widget-view+json": {
|
366 |
| - "model_id": "b0f65ad3139a477282c002eafe409d94", |
| 418 | + "model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879", |
367 | 419 | "version_major": 2,
|
368 | 420 | "version_minor": 0
|
369 | 421 | },
|
370 | 422 | "text/plain": [
|
371 | 423 | "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
|
372 | 424 | ]
|
373 | 425 | },
|
374 |
| - "execution_count": 3, |
| 426 | + "execution_count": 4, |
375 | 427 | "metadata": {},
|
376 | 428 | "output_type": "execute_result"
|
377 | 429 | }
|
|
402 | 454 | },
|
403 | 455 | {
|
404 | 456 | "cell_type": "code",
|
405 |
| - "execution_count": 4, |
| 457 | + "execution_count": 9, |
406 | 458 | "id": "13f03634",
|
407 | 459 | "metadata": {},
|
408 | 460 | "outputs": [
|
|
444 | 496 | },
|
445 | 497 | {
|
446 | 498 | "cell_type": "code",
|
447 |
| - "execution_count": 6, |
| 499 | + "execution_count": null, |
448 | 500 | "id": "1f4ea9e5",
|
449 | 501 | "metadata": {},
|
450 | 502 | "outputs": [
|
451 | 503 | {
|
452 | 504 | "name": "stdout",
|
453 | 505 | "output_type": "stream",
|
454 | 506 | "text": [
|
| 507 | + "Initializing Auto-Regressive pipline...\n", |
| 508 | + "Running Warmup...\n", |
455 | 509 | "Running Auto-Regressive generation...\n"
|
456 | 510 | ]
|
457 | 511 | },
|
458 | 512 | {
|
459 | 513 | "name": "stderr",
|
460 | 514 | "output_type": "stream",
|
461 | 515 | "text": [
|
462 |
| - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]" |
| 516 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]" |
463 | 517 | ]
|
464 | 518 | },
|
465 | 519 | {
|
|
479 | 533 | {
|
480 | 534 | "data": {
|
481 | 535 | "text/plain": [
|
482 |
| - "9" |
| 536 | + "27" |
483 | 537 | ]
|
484 | 538 | },
|
485 |
| - "execution_count": 6, |
| 539 | + "execution_count": 10, |
486 | 540 | "metadata": {},
|
487 | 541 | "output_type": "execute_result"
|
488 | 542 | }
|
|
492 | 546 | "import time\n",
|
493 | 547 | "from tqdm import tqdm\n",
|
494 | 548 | "\n",
|
495 |
| - "print(\"Running Auto-Regressive generation...\")\n", |
| 549 | + "print(\"Initializing Auto-Regressive pipline...\")\n", |
496 | 550 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
497 | 551 | "\n",
|
498 | 552 | "config = ov_genai.GenerationConfig()\n",
|
| 553 | + "config.apply_chat_template = False\n", |
499 | 554 | "config.max_new_tokens = 330\n",
|
500 |
| - "\n", |
| 555 | + "if data_type.value == \"Code\":\n", |
| 556 | + " config.max_new_tokens = 128\n", |
| 557 | + " \n", |
| 558 | + "# warmup\n", |
| 559 | + "print(\"Running Warmup...\")\n", |
| 560 | + "for i in range(10):\n", |
| 561 | + " pipe.generate(\"this is a warmup prompt\", config)\n", |
| 562 | + " \n", |
| 563 | + "print(\"Running Auto-Regressive generation...\")\n", |
501 | 564 | "times_auto_regressive = []\n",
|
502 | 565 | "for prompt in tqdm(prompts):\n",
|
503 | 566 | " start_time = time.perf_counter()\n",
|
|
523 | 586 | },
|
524 | 587 | {
|
525 | 588 | "cell_type": "code",
|
526 |
| - "execution_count": 7, |
| 589 | + "execution_count": null, |
527 | 590 | "id": "d73e9f37",
|
528 | 591 | "metadata": {},
|
529 | 592 | "outputs": [
|
530 | 593 | {
|
531 | 594 | "name": "stdout",
|
532 | 595 | "output_type": "stream",
|
533 | 596 | "text": [
|
| 597 | + "Initializing Speculative-Decoding pipline...\n", |
| 598 | + "Running Warmup...\n", |
534 | 599 | "Running Speculative Decoding generation...\n"
|
535 | 600 | ]
|
536 | 601 | },
|
537 | 602 | {
|
538 | 603 | "name": "stderr",
|
539 | 604 | "output_type": "stream",
|
540 | 605 | "text": [
|
541 |
| - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]" |
| 606 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]" |
542 | 607 | ]
|
543 | 608 | },
|
544 | 609 | {
|
|
557 | 622 | }
|
558 | 623 | ],
|
559 | 624 | "source": [
|
560 |
| - "scheduler_config = ov_genai.SchedulerConfig()\n", |
561 |
| - "# cache params\n", |
562 |
| - "scheduler_config.cache_size = 0\n", |
563 |
| - "scheduler_config.num_kv_blocks = 2048 // 8\n", |
564 |
| - "scheduler_config.max_num_batched_tokens = 2048\n", |
565 | 625 | "\n",
|
| 626 | + "config.num_assistant_tokens = 5\n", |
566 | 627 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
567 | 628 | "\n",
|
568 |
| - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n", |
569 |
| - "\n", |
570 |
| - "config = ov_genai.GenerationConfig()\n", |
571 |
| - "config.max_new_tokens = 330\n", |
572 |
| - "config.num_assistant_tokens = 5\n", |
| 629 | + "print(\"Initializing Speculative-Decoding pipline...\")\n", |
| 630 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n", |
573 | 631 | "\n",
|
| 632 | + "# warmup\n", |
| 633 | + "print(\"Running Warmup...\")\n", |
| 634 | + "for i in range(10):\n", |
| 635 | + " pipe.generate(\"this is a warmup prompt\", config)\n", |
574 | 636 | "\n",
|
575 | 637 | "times_speculative_decoding = []\n",
|
576 | 638 | "print(\"Running Speculative Decoding generation...\")\n",
|
|
593 | 655 | },
|
594 | 656 | {
|
595 | 657 | "cell_type": "code",
|
596 |
| - "execution_count": 8, |
| 658 | + "execution_count": 12, |
597 | 659 | "id": "ad898772",
|
598 | 660 | "metadata": {},
|
599 | 661 | "outputs": [
|
600 | 662 | {
|
601 | 663 | "name": "stdout",
|
602 | 664 | "output_type": "stream",
|
603 | 665 | "text": [
|
604 |
| - "average speedup: 2.23\n" |
| 666 | + "average speedup: 1.09\n" |
605 | 667 | ]
|
606 | 668 | }
|
607 | 669 | ],
|
|
627 | 689 | "name": "python",
|
628 | 690 | "nbconvert_exporter": "python",
|
629 | 691 | "pygments_lexer": "ipython3",
|
630 |
| - "version": "3.12.7" |
| 692 | + "version": "3.12.9" |
631 | 693 | },
|
632 | 694 | "openvino_notebooks": {
|
633 | 695 | "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
|
|