|
75 | 75 | "metadata": {},
|
76 | 76 | "outputs": [],
|
77 | 77 | "source": [
|
78 | | - "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets ipywidgets "
| 78 | + "%pip install --pre -U openvino-genai --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly huggingface_hub datasets"
79 | 79 | ]
|
80 | 80 | },
|
81 | 81 | {
|
|
88 | 88 | "[back to top ⬆️](#Table-of-contents:)\n",
|
89 | 89 | "\n",
|
90 | 90 | "As example, we will use already converted LLMs from [OpenVINO collection](https://huggingface.co/collections/OpenVINO/llm-6687aaa2abca3bbcec71a9bd).\n",
|
91 | | - "You can find OpenVINO optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). You can choose either Phi-3 or Phi-4 to be used for demonstrating FastDraft performance. For Phi-3 we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as draft. For Phi-4 we will use [Phi-4-mini-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-4-mini-instruct-int4-ov) as target model and [Phi-4-mini-FastDraft-120M-int8-ov](https://huggingface.co/OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov) as draft.\n",
| 91 | + "OpenVINO-optimized FastDraft models can be found in this [collection](https://huggingface.co/collections/OpenVINO/speculative-decoding-draft-models-673f5d944d58b29ba6e94161). As an example, we will use [Phi-3-mini-4k-instruct-int4-ov](https://huggingface.co/OpenVINO/Phi-3-mini-4k-instruct-int4-ov) as the target model and [Phi-3-mini-FastDraft-50M-int8-ov](https://huggingface.co/OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov) as the draft.\n",
92 | 92 | "\n",
|
93 | 93 | "In case, if you want run own models, you should convert them using [Hugging Face Optimum](https://huggingface.co/docs/optimum/intel/openvino/export) library accelerated by OpenVINO integration. More details about model preparation can be found in [OpenVINO LLM inference guide](https://docs.openvino.ai/2024/learn-openvino/llm_inference_guide/llm-inference-native-ov.html#convert-hugging-face-tokenizer-and-model-to-openvino-ir-format)"
|
94 | 94 | ]
|
95 | 95 | },
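
The conversion step referenced in the cell above is not performed in this notebook. With Optimum Intel it is roughly the following sketch ("your-org/your-model" and the output directory are placeholders; the linked guide additionally covers converting the tokenizer to OpenVINO format, which openvino-genai pipelines require):

```python
# Sketch, assuming Optimum Intel is installed: export a Hugging Face
# model to OpenVINO IR so it can later be loaded by openvino_genai.
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

model_id = "your-org/your-model"  # placeholder model id
ov_model = OVModelForCausalLM.from_pretrained(model_id, export=True)
ov_model.save_pretrained("your-model-ov")
AutoTokenizer.from_pretrained(model_id).save_pretrained("your-model-ov")
```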
|
96 | | - {
97 | | - "cell_type": "markdown",
98 | | - "id": "61ecfe17",
99 | | - "metadata": {},
100 | | - "source": [
101 | | - "### Select model\n",
102 | | - "[back to top ⬆️](#Table-of-contents:)"
103 | | - ]
104 | | - },
105 | 96 | {
|
106 | 97 | "cell_type": "code",
|
107 | 98 | "execution_count": 1,
|
108 | | - "id": "fe934261",
109 | | - "metadata": {},
110 | | - "outputs": [
111 | | - {
112 | | - "data": {
113 | | - "application/vnd.jupyter.widget-view+json": {
114 | | - "model_id": "b65944bfc43c4acebea6ec5ebd78f981",
115 | | - "version_major": 2,
116 | | - "version_minor": 0
117 | | - },
118 | | - "text/plain": [
119 | | - "Dropdown(description='Select Model:', options=('Phi-3', 'Phi-4'), value='Phi-3')"
120 | | - ]
121 | | - },
122 | | - "execution_count": 1,
123 | | - "metadata": {},
124 | | - "output_type": "execute_result"
125 | | - }
126 | | - ],
127 | | - "source": [
128 | | - "import ipywidgets as widgets\n",
129 | | - "\n",
130 | | - "model = widgets.Dropdown(\n",
131 | | - " options=['Phi-3', 'Phi-4'],\n",
132 | | - " value='Phi-3', # default value\n",
133 | | - " description='Select Model:',\n",
134 | | - ")\n",
135 | | - "\n",
136 | | - "model"
137 | | - ]
138 | | - },
139 | | - {
140 | | - "cell_type": "markdown",
141 | | - "id": "ccca9ef6",
142 | | - "metadata": {},
143 | | - "source": [
144 | | - "### Download target and draft models"
145 | | - ]
146 | | - },
147 | | - {
148 | | - "cell_type": "code",
149 | | - "execution_count": 2,
150 | 99 | "id": "74bb9f96",
|
151 | 100 | "metadata": {},
|
152 | 101 | "outputs": [],
|
153 | 102 | "source": [
|
154 | 103 | "from pathlib import Path\n",
|
155 | 104 | "import huggingface_hub as hf_hub\n",
|
156 | 105 | "\n",
|
157 | | - "if model.value == \"Phi-4\":\n",
158 | | - " target_model_id = \"OpenVINO/Phi-4-mini-instruct-int4-ov\"\n",
159 | | - " draft_model_id = \"OpenVINO/Phi-4-mini-FastDraft-120M-int8-ov\"\n",
160 | | - "elif model.value == \"Phi-3\":\n",
161 | | - " target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
162 | | - " draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
163 | | - "else:\n",
164 | | - " print(f\"Model {model} is not supported in this demo.\")\n",
| 106 | + "draft_model_id = \"OpenVINO/Phi-3-mini-FastDraft-50M-int8-ov\"\n",
| 107 | + "target_model_id = \"OpenVINO/Phi-3-mini-4k-instruct-int4-ov\"\n",
165 | 108 | "\n",
|
166 | 109 | "draft_model_path = Path(draft_model_id.split(\"/\")[-1])\n",
|
167 | 110 | "target_model_path = Path(target_model_id.split(\"/\")[-1])\n",
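
The download call itself sits in the collapsed portion of this hunk. With huggingface_hub it is presumably along these lines (a sketch, not the verbatim notebook code):

```python
# Fetch both OpenVINO IR repos into local folders whose names match
# draft_model_path / target_model_path built in the cell above.
import huggingface_hub as hf_hub

for model_id, model_path in [
    (draft_model_id, draft_model_path),
    (target_model_id, target_model_path),
]:
    if not model_path.exists():
        hf_hub.snapshot_download(model_id, local_dir=model_path)
```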
|
|
188 | 131 | },
|
189 | 132 | {
|
190 | 133 | "cell_type": "code",
|
191 | | - "execution_count": 3,
| 134 | + "execution_count": 2,
192 | 135 | "id": "6ddd57de-9f41-403c-bccc-8d3118654a24",
|
193 | 136 | "metadata": {
|
194 | 137 | "tags": []
|
|
197 | 140 | {
|
198 | 141 | "data": {
|
199 | 142 | "application/vnd.jupyter.widget-view+json": {
|
200 | | - "model_id": "093ab5068fc542a0826fbd4f8a8d97b8",
| 143 | + "model_id": "37ad0b345de94225892c9d47519a9164",
201 | 144 | "version_major": 2,
|
202 | 145 | "version_minor": 0
|
203 | 146 | },
|
204 | 147 | "text/plain": [
|
205 | 148 | "Dropdown(description='Device:', options=('CPU', 'GPU'), value='CPU')"
|
206 | 149 | ]
|
207 | 150 | },
|
| 151 | + "execution_count": 2,
208 | 152 | "metadata": {},
|
209 | | - "output_type": "display_data"
| 153 | + "output_type": "execute_result"
210 | 154 | }
|
211 | 155 | ],
|
212 | 156 | "source": [
|
|
221 | 165 | "from notebook_utils import device_widget\n",
|
222 | 166 | "\n",
|
223 | 167 | "device = device_widget(default=\"CPU\", exclude=[\"NPU\", \"AUTO\"])\n",
|
224 | | - "display(device)\n",
| 168 | + "\n",
| 169 | + "device\n",
225 | 170 | "\n",
|
226 | 171 | "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n",
|
227 | 172 | "from notebook_utils import collect_telemetry\n",
|
|
255 | 200 | "import openvino_genai as ov_genai\n",
|
256 | 201 | "import time\n",
|
257 | 202 | "\n",
|
258 | | - "print(\"Initializing Auto-Regressive pipeline...\")\n",
259 | 203 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
260 | 204 | "\n",
|
261 | 205 | "config = ov_genai.GenerationConfig()\n",
|
262 | | - "config.max_new_tokens = 200\n",
263 | | - "config.apply_chat_template = False\n",
264 | | - "\n",
| 206 | + "config.max_new_tokens = 330\n",
265 | 207 | "prompt = '''<s>\n",
|
266 | 208 | "\n",
|
267 | 209 | "def prime_fib(n: int):\n",
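
Unescaped from the notebook JSON, the updated baseline cell reads as follows (the prime_fib prompt continues in the collapsed hunk and is truncated here; `streamer` is defined earlier in the notebook, outside this diff):

```python
import time

import openvino_genai as ov_genai

# Plain auto-regressive baseline: target model only, no draft.
pipe = ov_genai.LLMPipeline(target_model_path, device.value)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 330

prompt = """<s>

def prime_fib(n: int):"""  # truncated; the full prompt is in the notebook

start_time = time.perf_counter()
result = pipe.generate(prompt, config, streamer=streamer)
end_time = time.perf_counter()
```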
|
|
330 | 272 | "metadata": {},
|
331 | 273 | "outputs": [],
|
332 | 274 | "source": [
|
| 275 | + "scheduler_config = ov_genai.SchedulerConfig()\n",
| 276 | + "# cache params\n",
| 277 | + "scheduler_config.cache_size = 0\n",
| 278 | + "scheduler_config.num_kv_blocks = 2048 // 8\n",
| 279 | + "scheduler_config.max_num_batched_tokens = 2048\n",
333 | 280 | "\n",
|
334 | 281 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
335 | | - "config.num_assistant_tokens = 5\n",
336 | 282 | "\n",
|
337 | | - "print(\"Initializing Speculative-Decoding pipeline...\")\n",
338 | | - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
| 283 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
339 | 284 | "\n",
|
| 285 | + "config = ov_genai.GenerationConfig()\n",
| 286 | + "config.max_new_tokens = 330\n",
| 287 | + "config.num_assistant_tokens = 5\n",
340 | 288 | "start_time = time.perf_counter()\n",
|
341 | 289 | "result = pipe.generate(prompt, config, streamer=streamer)\n",
|
342 | 290 | "end_time = time.perf_counter()"
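
For readability, the speculative cell after this change flattens to the snippet below. The SchedulerConfig values are copied verbatim from the diff; my reading of the GenAI scheduler options is that cache_size = 0 defers to the explicit num_kv_blocks (2048 // 8 = 256 blocks) rather than sizing the KV cache in GB:

```python
scheduler_config = ov_genai.SchedulerConfig()
scheduler_config.cache_size = 0              # don't size the KV cache in GB...
scheduler_config.num_kv_blocks = 2048 // 8   # ...use an explicit block count
scheduler_config.max_num_batched_tokens = 2048

draft_model = ov_genai.draft_model(draft_model_path, device.value)
pipe = ov_genai.LLMPipeline(
    target_model_path,
    device.value,
    draft_model=draft_model,
    scheduler_config=scheduler_config,
)

config = ov_genai.GenerationConfig()
config.max_new_tokens = 330
config.num_assistant_tokens = 5  # fixed draft window: 5 candidate tokens per step

start_time = time.perf_counter()
result = pipe.generate(prompt, config, streamer=streamer)
end_time = time.perf_counter()
```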
|
|
370 | 318 | "metadata": {},
|
371 | 319 | "outputs": [],
|
372 | 320 | "source": [
|
373 | | - "config.num_assistant_tokens = 0\n",
| 321 | + "config = ov_genai.GenerationConfig()\n",
| 322 | + "config.max_new_tokens = 100\n",
374 | 323 | "config.assistant_confidence_threshold = 0.05\n",
|
375 | | - "\n",
376 | 324 | "start_time = time.perf_counter()\n",
|
377 | | - "result = pipe.generate(prompt, config, streamer)\n",
| 325 | + "result = pipe.generate([\"Sun is yellow because\"], config, streamer)\n",
378 | 326 | "end_time = time.perf_counter()"
|
379 | 327 | ]
|
380 | 328 | },
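
The dynamic variant above swaps the fixed draft window for a confidence rule: with assistant_confidence_threshold set (instead of num_assistant_tokens), the draft keeps proposing tokens for as long as its own token confidence stays above the threshold, so the window adapts per step. Flattened:

```python
config = ov_genai.GenerationConfig()
config.max_new_tokens = 100
# Adaptive draft window; used in place of num_assistant_tokens.
config.assistant_confidence_threshold = 0.05

start_time = time.perf_counter()
result = pipe.generate(["Sun is yellow because"], config, streamer)
end_time = time.perf_counter()
```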
|
|
408 | 356 | },
|
409 | 357 | {
|
410 | 358 | "cell_type": "code",
|
411 | | - "execution_count": 4,
| 359 | + "execution_count": 3,
412 | 360 | "id": "64a36dc1-958c-4f7e-baba-efa89a2d9a8f",
|
413 | 361 | "metadata": {},
|
414 | 362 | "outputs": [
|
415 | 363 | {
|
416 | 364 | "data": {
|
417 | 365 | "application/vnd.jupyter.widget-view+json": {
|
418 | | - "model_id": "ac15c0165c8e4a1c92ad8c1da9bc7879",
| 366 | + "model_id": "b0f65ad3139a477282c002eafe409d94",
419 | 367 | "version_major": 2,
|
420 | 368 | "version_minor": 0
|
421 | 369 | },
|
422 | 370 | "text/plain": [
|
423 | 371 | "Dropdown(description='Data type:', options=('Code', 'Text'), value='Code')"
|
424 | 372 | ]
|
425 | 373 | },
|
426 | | - "execution_count": 4,
| 374 | + "execution_count": 3,
427 | 375 | "metadata": {},
|
428 | 376 | "output_type": "execute_result"
|
429 | 377 | }
|
|
454 | 402 | },
|
455 | 403 | {
|
456 | 404 | "cell_type": "code",
|
457 | | - "execution_count": 9,
| 405 | + "execution_count": 4,
458 | 406 | "id": "13f03634",
|
459 | 407 | "metadata": {},
|
460 | 408 | "outputs": [
|
|
496 | 444 | },
|
497 | 445 | {
|
498 | 446 | "cell_type": "code",
|
499 | | - "execution_count": null,
| 447 | + "execution_count": 6,
500 | 448 | "id": "1f4ea9e5",
|
501 | 449 | "metadata": {},
|
502 | 450 | "outputs": [
|
503 | 451 | {
|
504 | 452 | "name": "stdout",
|
505 | 453 | "output_type": "stream",
|
506 | 454 | "text": [
|
507 | | - "Initializing Auto-Regressive pipline...\n",
508 | | - "Running Warmup...\n",
509 | 455 | "Running Auto-Regressive generation...\n"
|
510 | 456 | ]
|
511 | 457 | },
|
512 | 458 | {
|
513 | 459 | "name": "stderr",
|
514 | 460 | "output_type": "stream",
|
515 | 461 | "text": [
|
516 | | - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:29<00:00, 5.40s/it]"
| 462 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:26<00:00, 5.32s/it]"
517 | 463 | ]
|
518 | 464 | },
|
519 | 465 | {
|
|
533 | 479 | {
|
534 | 480 | "data": {
|
535 | 481 | "text/plain": [
|
536 | | - "27"
| 482 | + "9"
537 | 483 | ]
|
538 | 484 | },
|
539 | | - "execution_count": 10,
| 485 | + "execution_count": 6,
540 | 486 | "metadata": {},
|
541 | 487 | "output_type": "execute_result"
|
542 | 488 | }
|
|
546 | 492 | "import time\n",
|
547 | 493 | "from tqdm import tqdm\n",
|
548 | 494 | "\n",
|
549 | | - "print(\"Initializing Auto-Regressive pipline...\")\n",
| 495 | + "print(\"Running Auto-Regressive generation...\")\n",
550 | 496 | "pipe = ov_genai.LLMPipeline(target_model_path, device.value)\n",
|
551 | 497 | "\n",
|
552 | 498 | "config = ov_genai.GenerationConfig()\n",
|
553 | | - "config.apply_chat_template = False\n",
554 | 499 | "config.max_new_tokens = 330\n",
|
555 | | - "if data_type.value == \"Code\":\n",
556 | | - " config.max_new_tokens = 128\n",
557 | | - " \n",
558 | | - "# warmup\n",
559 | | - "print(\"Running Warmup...\")\n",
560 | | - "for i in range(10):\n",
561 | | - " pipe.generate(\"this is a warmup prompt\", config)\n",
562 | | - " \n",
563 | | - "print(\"Running Auto-Regressive generation...\")\n",
| 500 | + "\n",
564 | 501 | "times_auto_regressive = []\n",
|
565 | 502 | "for prompt in tqdm(prompts):\n",
|
566 | 503 | " start_time = time.perf_counter()\n",
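
The tail of this timing loop is collapsed in the diff; the body presumably completes as in this sketch, one wall-clock measurement per prompt:

```python
times_auto_regressive = []
for prompt in tqdm(prompts):
    start_time = time.perf_counter()
    pipe.generate(prompt, config)  # output text is not needed for timing
    end_time = time.perf_counter()
    times_auto_regressive.append(end_time - start_time)
```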
|
|
586 | 523 | },
|
587 | 524 | {
|
588 | 525 | "cell_type": "code",
|
589 | | - "execution_count": null,
| 526 | + "execution_count": 7,
590 | 527 | "id": "d73e9f37",
|
591 | 528 | "metadata": {},
|
592 | 529 | "outputs": [
|
593 | 530 | {
|
594 | 531 | "name": "stdout",
|
595 | 532 | "output_type": "stream",
|
596 | 533 | "text": [
|
597 | | - "Initializing Speculative-Decoding pipline...\n",
598 | | - "Running Warmup...\n",
599 | 534 | "Running Speculative Decoding generation...\n"
|
600 | 535 | ]
|
601 | 536 | },
|
602 | 537 | {
|
603 | 538 | "name": "stderr",
|
604 | 539 | "output_type": "stream",
|
605 | 540 | "text": [
|
606 | | - "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [04:10<00:00, 5.01s/it]"
| 541 | + "100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [01:52<00:00, 2.25s/it]"
607 | 542 | ]
|
608 | 543 | },
|
609 | 544 | {
|
|
622 | 557 | }
|
623 | 558 | ],
|
624 | 559 | "source": [
|
| 560 | + "scheduler_config = ov_genai.SchedulerConfig()\n",
| 561 | + "# cache params\n",
| 562 | + "scheduler_config.cache_size = 0\n",
| 563 | + "scheduler_config.num_kv_blocks = 2048 // 8\n",
| 564 | + "scheduler_config.max_num_batched_tokens = 2048\n",
625 | 565 | "\n",
|
626 | | - "config.num_assistant_tokens = 5\n",
627 | 566 | "draft_model = ov_genai.draft_model(draft_model_path, device.value)\n",
|
628 | 567 | "\n",
|
629 | | - "print(\"Initializing Speculative-Decoding pipline...\")\n",
630 | | - "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model)\n",
| 568 | + "pipe = ov_genai.LLMPipeline(target_model_path, device.value, draft_model=draft_model, scheduler_config=scheduler_config)\n",
| 569 | + "\n",
| 570 | + "config = ov_genai.GenerationConfig()\n",
| 571 | + "config.max_new_tokens = 330\n",
| 572 | + "config.num_assistant_tokens = 5\n",
631 | 573 | "\n",
|
632 | | - "# warmup\n",
633 | | - "print(\"Running Warmup...\")\n",
634 | | - "for i in range(10):\n",
635 | | - " pipe.generate(\"this is a warmup prompt\", config)\n",
636 | 574 | "\n",
|
637 | 575 | "times_speculative_decoding = []\n",
|
638 | 576 | "print(\"Running Speculative Decoding generation...\")\n",
|
|
655 | 593 | },
|
656 | 594 | {
|
657 | 595 | "cell_type": "code",
|
658 | | - "execution_count": 12,
| 596 | + "execution_count": 8,
659 | 597 | "id": "ad898772",
|
660 | 598 | "metadata": {},
|
661 | 599 | "outputs": [
|
662 | 600 | {
|
663 | 601 | "name": "stdout",
|
664 | 602 | "output_type": "stream",
|
665 | 603 | "text": [
|
666 | | - "average speedup: 1.09\n"
| 604 | + "average speedup: 2.23\n"
667 | 605 | ]
|
668 | 606 | }
|
669 | 607 | ],
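
The cell that prints this number is unchanged and therefore outside the diff. One plausible reconstruction, assuming a mean of per-prompt ratios over the two timing lists collected above (the notebook's own formula may differ):

```python
speedups = [
    t_ar / t_sd
    for t_ar, t_sd in zip(times_auto_regressive, times_speculative_decoding)
]
print(f"average speedup: {sum(speedups) / len(speedups):.2f}")
```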
|
|
689 | 627 | "name": "python",
|
690 | 628 | "nbconvert_exporter": "python",
|
691 | 629 | "pygments_lexer": "ipython3",
|
692 | | - "version": "3.12.9"
| 630 | + "version": "3.12.7"
693 | 631 | },
|
694 | 632 | "openvino_notebooks": {
|
695 | 633 | "imageUrl": "https://github.com/user-attachments/assets/eb999dea-d98b-42bb-835e-28d3054e1a84",
|
|