Skip to content

Commit 062f7d1

Browse files
add Minicpmo notebook (#2851)
Exclude TTS
1 parent 91a37ff commit 062f7d1

File tree

7 files changed

+3763
-0
lines changed

7 files changed

+3763
-0
lines changed

.ci/ignore_treon_docker.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,4 @@ notebooks/glm4-v/glm4-v.ipynb
8989
notebooks/gemma3/gemma3.ipynb
9090
notebooks/omniparser/omniparser.ipynb
9191
notebooks/olmocr-pdf-vlm/olmocr-pdf-vlm.ipynb
92+
notebooks/minicpm-o-omnimodal-chatbot/minicpm-o-omnimodal-chatbot.ipynb

.ci/skipped_notebooks.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,3 +542,9 @@
542542
- python:
543543
- '3.9'
544544
- '3.10'
545+
- notebook: notebooks/minicpm-o-omnimodal-chatbot/minicpm-o-omnimodal-chatbot.ipynb
546+
skips:
547+
- os:
548+
- macos-13
549+
- ubuntu-22.04
550+
- windows-2019

.ci/spellcheck/.pyspelling.wordlist.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ chatbots
9393
chatglm
9494
ChatGLM
9595
ChatGPT
96+
ChatTTS
9697
chinese
9798
CIN
9899
ckpt
@@ -637,8 +638,12 @@ OCRBench
637638
OCRv
638639
odometry
639640
olmOCR
641+
Omni
642+
omni
640643
OmniGen
641644
OmniGen's
645+
Omnimodal
646+
omnimodal
642647
OmniParser
643648
OMZ
644649
OneFormer
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Omnimodal assistant with MiniCPM-o 2.6 and OpenVINO
2+
3+
MiniCPM-o 2.6 is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.6, and introduces new features for real-time speech conversation and multimodal live streaming.
4+
5+
More details about the model can be found in the [model card](https://huggingface.co/openbmb/MiniCPM-o-2_6) and the original [repo](https://github.com/OpenBMB/MiniCPM-O).
6+
7+
In this tutorial we consider how to convert and optimize the MiniCPM-o 2.6 model for creating an omnimodal chatbot. Additionally, we demonstrate how to apply a stateful transformation to the LLM part and model optimization techniques such as weight compression using [NNCF](https://github.com/openvinotoolkit/nncf)
8+
9+
10+
## Notebook contents
11+
The tutorial consists of the following steps:
12+
13+
- Install requirements
14+
- Download PyTorch model
15+
- Convert model to OpenVINO Intermediate Representation (IR)
16+
- Compress Language Model weights
17+
- Prepare Inference Pipeline using OpenVINO GenAI
18+
- Run OpenVINO model inference
19+
- Launch Interactive demo
20+
21+
In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image. The image below shows an example of the model's output.
22+
![Image](https://github.com/user-attachments/assets/83a1ff80-e87c-47bd-921e-30ff3c4424fa)
23+
24+
25+
## Installation instructions
26+
This is a self-contained example that relies solely on its own code.<br/>
27+
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
28+
For details, please refer to [Installation Guide](../../README.md).
29+
30+
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/minicpm-o-omnimodal-chatbot/README.md" />
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
from copy import deepcopy
2+
from typing import Dict, List
3+
from PIL import Image
4+
import librosa
5+
import gradio as gr
6+
7+
# Lower-case, dot-prefixed file extensions accepted from the multimodal textbox.
# They are checked with str.endswith, so every entry MUST start with a dot —
# otherwise any filename merely ending in those letters would match.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
# Fixed: "flac" was missing its leading dot, so endswith matched any path
# ending in the letters "flac", not only ".flac" files.
AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".wma")

# Initial values for the "Sampling Parameters" sliders in the demo UI.
# NOTE(review): temperature defaults to 0.0 while do_sample is True — the bot()
# callback recomputes do_sample as (temperature > 0), so greedy decoding is
# effectively used until the user raises the temperature.
DEFAULT_SAMPLING_PARAMS = {
    "top_p": 0.0,
    "top_k": 1,
    "temperature": 0.0,
    "do_sample": True,
    "num_beams": 1,
    "repetition_penalty": 1.2,
}
# Upper bound (and default) for the "Max New Tokens" slider.
MAX_NEW_TOKENS = 512
19+
20+
21+
def check_messages(history, message, audio):
    """
    Validate user input and append it to the gradio chat history.

    Parameters
    ----------
    history : list
        Gradio chatbot history (list of ``{"role", "content", "metadata"}`` dicts).
    message : dict
        Value of the ``gr.MultimodalTextbox``: ``{"text": str, "files": [paths]}``.
    audio : str or None
        File path from the ``gr.Audio`` widget (microphone/upload), if any.

    Returns
    -------
    tuple
        ``(history, cleared-and-disabled MultimodalTextbox, None)`` — the last
        ``None`` resets the audio widget.

    Raises
    ------
    gr.Error
        On empty input, unsupported file type, more than one image/audio,
        or an audio clip that is empty or longer than 60 seconds.
    """
    has_text = message["text"] and message["text"].strip()
    has_files = len(message["files"]) > 0
    has_audio = audio is not None

    if not (has_text or has_files or has_audio):
        raise gr.Error("Message is empty")

    audios = []
    images = []

    # Sort attachments into audio/image buckets, rejecting anything else.
    for file_msg in message["files"]:
        if file_msg.endswith(AUDIO_EXTENSIONS):
            duration = librosa.get_duration(filename=file_msg)
            if duration > 60:
                raise gr.Error("Audio file too long. For efficiency we recommend to use audio < 60s")
            if duration == 0:
                raise gr.Error("Audio file too short")
            audios.append(file_msg)
        elif file_msg.endswith(IMAGE_EXTENSIONS):
            images.append(file_msg)
        else:
            filename = file_msg.split("/")[-1]
            # Fixed: the original f-string had no placeholder and always
            # printed "(unknown)" instead of the offending filename.
            raise gr.Error(f"Unsupported file type: {filename}. It should be an image or audio file.")

    if len(audios) > 1:
        raise gr.Error("Please upload only one audio file.")

    if len(images) > 1:
        raise gr.Error("Please upload only one image file.")

    # A recorded clip and an uploaded audio file are mutually exclusive.
    if audio is not None:
        if len(audios) > 0:
            raise gr.Error("Please upload only one audio file or record audio.")
        audios.append(audio)

    # Append the message to the history; media entries are tagged via
    # metadata["title"] so history2messages() can load them later.
    for image in images:
        history.append({"role": "user", "content": (image,), "metadata": {"title": "image"}})

    for audio in audios:
        history.append({"role": "user", "content": (audio,), "metadata": {"title": "audio"}})

    if message["text"]:
        history.append({"role": "user", "content": message["text"], "metadata": {}})

    return history, gr.MultimodalTextbox(value=None, interactive=False), None
68+
69+
70+
def history2messages(history: List[Dict]) -> List[Dict]:
    """
    Transform gradio chatbot history into chat messages for ``ov_model.chat``.

    Consecutive user entries (text, image, audio) are merged into a single
    user message whose ``content`` is a list; media entries (tagged by
    ``metadata["title"]``) are loaded with ``PIL.Image.open`` / ``librosa.load``.
    Assistant entries are passed through unchanged.

    Parameters
    ----------
    history : List[Dict]
        Gradio history items of the form ``{"role", "content", "metadata"}``.

    Returns
    -------
    List[Dict]
        ``[{"role": "user", "content": [...]}, {"role": "assistant", ...}, ...]``
    """
    messages = []
    cur_message = dict()
    for item in history:
        if item["role"] == "assistant":
            # Flush any pending user message before the assistant turn.
            # Fixed: the original also did
            #   cur_message["content"].append(cur_message["content"])
            # which appended the content list into itself, producing a
            # self-referential list in every flushed user message.
            if len(cur_message) > 0:
                messages.append(deepcopy(cur_message))
                cur_message = dict()
            messages.append({"role": "assistant", "content": item["content"]})
            continue

        if "role" not in cur_message:
            cur_message["role"] = "user"
        if "content" not in cur_message:
            cur_message["content"] = []

        if "metadata" not in item:
            item["metadata"] = {"title": None}
        if item["metadata"].get("title") is None:
            # Plain text entry.
            cur_message["content"].append(item["content"])
        elif item["metadata"]["title"] == "image":
            # content is a 1-tuple holding the image file path.
            cur_message["content"].append(Image.open(item["content"][0]))
        elif item["metadata"]["title"] == "audio":
            # librosa.load returns (waveform, sample_rate).
            cur_message["content"].append(librosa.load(item["content"][0]))
    # Flush a trailing user message that has no assistant reply yet.
    if len(cur_message) > 0:
        messages.append(cur_message)
    return messages
102+
103+
104+
def make_demo(ov_model, processor):
    """
    Build the gradio Blocks demo for chatting with the OpenVINO MiniCPM-o model.

    Parameters:
        ov_model: converted model exposing a streaming ``chat(msgs, tokenizer,
            sampling, stream, **generation_config)`` API — assumed to yield
            text chunks; TODO confirm against the model wrapper.
        processor: HF-style processor; only ``processor.tokenizer`` is used.

    Returns:
        gr.Blocks: the assembled demo (caller launches it).
    """
    def bot(
        history: list,
        top_p: float,
        top_k: int,
        temperature: float,
        repetition_penalty: float,
        max_new_tokens: int = MAX_NEW_TOKENS,
        regenerate: bool = False,
    ):
        # Generator callback: streams the assistant reply into the last
        # history entry, yielding the updated history after each chunk.

        # On "Regenerate", drop the previous assistant reply and redo it.
        if history and regenerate:
            history = history[:-1]

        if not history:
            return history

        msg = history2messages(history)
        generation_config = {
            "top_p": top_p,
            "top_k": top_k,
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens,
            # Greedy decoding when temperature is 0 (the UI default).
            "do_sample": temperature > 0,
        }
        # Placeholder entry that the streamed text is written into.
        history.append({"role": "assistant", "content": ""})
        res = ov_model.chat(
            msgs=msg,
            tokenizer=processor.tokenizer,
            sampling=True,
            stream=True,
            **generation_config,
        )
        generated_text = ""
        for new_text in res:
            generated_text += new_text
            history[-1]["content"] = generated_text
            yield history

    def change_state(state):
        # Toggle visibility of the sampling-parameters group and flip the flag.
        return gr.update(visible=not state), not state

    def reset_user_input():
        # Clears the textbox value. NOTE(review): defined but never wired to
        # an event below — presumably kept for future use.
        return gr.update(value="")

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🪐 Chat with OpenVINO MiniCPM-o")
        chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height="48vh")

        # Tracks whether the sampling-parameters group is currently visible.
        sampling_params_group_hidden_state = gr.State(False)

        with gr.Row(equal_height=True):
            chat_input = gr.MultimodalTextbox(
                file_count="multiple",
                placeholder="Enter your prompt or upload image/audio here, then press ENTER...",
                show_label=False,
                scale=8,
                file_types=["image", "audio"],
                interactive=True,
                # stop_btn=True,
            )
        with gr.Row(equal_height=True):
            # Microphone/upload recorder, capped at 30 s clips.
            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", scale=1, max_length=30)
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, min_width=150):
                with gr.Row(equal_height=True):
                    regenerate_btn = gr.Button("Regenerate", variant="primary")
                    clear_btn = gr.ClearButton([chat_input, audio_input, chatbot])

        with gr.Row():
            sampling_params_toggle_btn = gr.Button("Sampling Parameters")

        # Hidden by default; shown via the toggle button above.
        with gr.Group(visible=False) as sampling_params_group:
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature")
                repetition_penalty = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
                    label="Repetition Penalty",
                )

            with gr.Row():
                top_p = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["top_p"], label="Top-p")
                top_k = gr.Slider(minimum=0, maximum=1000, value=DEFAULT_SAMPLING_PARAMS["top_k"], label="Top-k")

            with gr.Row():
                max_new_tokens = gr.Slider(
                    minimum=1,
                    maximum=MAX_NEW_TOKENS,
                    value=MAX_NEW_TOKENS,
                    label="Max New Tokens",
                    interactive=True,
                )

        sampling_params_toggle_btn.click(
            change_state,
            sampling_params_group_hidden_state,
            [sampling_params_group, sampling_params_group_hidden_state],
        )
        # Event chain: validate input -> stream bot reply -> re-enable textbox.
        chat_msg = chat_input.submit(
            check_messages,
            [chatbot, chat_input, audio_input],
            [chatbot, chat_input, audio_input],
        )

        bot_msg = chat_msg.then(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens],
            outputs=chatbot,
        )

        bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

        # gr.State(True) supplies regenerate=True so bot() redoes the last reply.
        regenerate_btn.click(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens, gr.State(True)],
            outputs=chatbot,
        )
    return demo

0 commit comments

Comments
 (0)