Skip to content

Commit 062f7d1

Browse files
add Minicpmo notebook (#2851)
Exclude TTS
1 parent 91a37ff commit 062f7d1

File tree

7 files changed

+3763
-0
lines changed

7 files changed

+3763
-0
lines changed

.ci/ignore_treon_docker.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,3 +89,4 @@ notebooks/glm4-v/glm4-v.ipynb
8989
notebooks/gemma3/gemma3.ipynb
9090
notebooks/omniparser/omniparser.ipynb
9191
notebooks/olmocr-pdf-vlm/olmocr-pdf-vlm.ipynb
92+
notebooks/minicpm-o-omnimodal-chatbot/minicpm-o-omnimodal-chatbot.ipynb

.ci/skipped_notebooks.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -542,3 +542,9 @@
542542
- python:
543543
- '3.9'
544544
- '3.10'
545+
- notebook: notebooks/minicpm-o-omnimodal-chatbot/minicpm-o-omnimodal-chatbot.ipynb
546+
skips:
547+
- os:
548+
- macos-13
549+
- ubuntu-22.04
550+
- windows-2019

.ci/spellcheck/.pyspelling.wordlist.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ chatbots
9393
chatglm
9494
ChatGLM
9595
ChatGPT
96+
ChatTTS
9697
chinese
9798
CIN
9899
ckpt
@@ -637,8 +638,12 @@ OCRBench
637638
OCRv
638639
odometry
639640
olmOCR
641+
Omni
642+
omni
640643
OmniGen
641644
OmniGen's
645+
Omnimodal
646+
omnimodal
642647
OmniParser
643648
OMZ
644649
OneFormer
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Omnimodal assistant with MiniCPM-o 2.6 and OpenVINO
2+
3+
MiniCPM-o 2.6 is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.6, and introduces new features for real-time speech conversation and multimodal live streaming.
4+
5+
More details about the model can be found in the [model card](https://huggingface.co/openbmb/MiniCPM-o-2_6) and the original [repo](https://github.com/OpenBMB/MiniCPM-O).
6+
7+
In this tutorial we consider how to convert and optimize the MiniCPM-o 2.6 model for creating an omnimodal chatbot. Additionally, we demonstrate how to apply a stateful transformation to the LLM part and model optimization techniques such as weight compression using [NNCF](https://github.com/openvinotoolkit/nncf)
8+
9+
10+
## Notebook contents
11+
The tutorial consists of the following steps:
12+
13+
- Install requirements
14+
- Download PyTorch model
15+
- Convert model to OpenVINO Intermediate Representation (IR)
16+
- Compress Language Model weights
17+
- Prepare Inference Pipeline using OpenVINO GenAI
18+
- Run OpenVINO model inference
19+
- Launch Interactive demo
20+
21+
In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image. The image below shows an example of the model's output.
22+
![Image](https://github.com/user-attachments/assets/83a1ff80-e87c-47bd-921e-30ff3c4424fa)
23+
24+
25+
## Installation instructions
26+
This is a self-contained example that relies solely on its own code.<br/>
27+
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
28+
For details, please refer to [Installation Guide](../../README.md).
29+
30+
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/minicpm-o-omnimodal-chatbot/README.md" />
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
1+
from copy import deepcopy
2+
from typing import Dict, List
3+
from PIL import Image
4+
import librosa
5+
import gradio as gr
6+
7+
# Lower-case, dot-prefixed file extensions accepted from the multimodal textbox.
# They are checked with str.endswith, so every entry MUST start with a dot —
# otherwise any filename merely ending in those letters would match.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
# Fixed: "flac" was missing its leading dot, so endswith matched any path
# ending in the letters "flac", not only ".flac" files.
AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".wma")

# Initial values for the "Sampling Parameters" sliders in the demo UI.
# NOTE(review): temperature defaults to 0.0 while do_sample is True — the bot()
# callback recomputes do_sample as (temperature > 0), so greedy decoding is
# effectively used until the user raises the temperature.
DEFAULT_SAMPLING_PARAMS = {
    "top_p": 0.0,
    "top_k": 1,
    "temperature": 0.0,
    "do_sample": True,
    "num_beams": 1,
    "repetition_penalty": 1.2,
}
# Upper bound (and default) for the "Max New Tokens" slider.
MAX_NEW_TOKENS = 512
19+
20+
21+
def check_messages(history, message, audio):
    """
    Validate user input and append it to the gradio chat history.

    Parameters
    ----------
    history : list
        Gradio chatbot history (list of ``{"role", "content", "metadata"}`` dicts).
    message : dict
        Value of the ``gr.MultimodalTextbox``: ``{"text": str, "files": [paths]}``.
    audio : str or None
        File path from the ``gr.Audio`` widget (microphone/upload), if any.

    Returns
    -------
    tuple
        ``(history, cleared-and-disabled MultimodalTextbox, None)`` — the last
        ``None`` resets the audio widget.

    Raises
    ------
    gr.Error
        On empty input, unsupported file type, more than one image/audio,
        or an audio clip that is empty or longer than 60 seconds.
    """
    has_text = message["text"] and message["text"].strip()
    has_files = len(message["files"]) > 0
    has_audio = audio is not None

    if not (has_text or has_files or has_audio):
        raise gr.Error("Message is empty")

    audios = []
    images = []

    # Sort attachments into audio/image buckets, rejecting anything else.
    for file_msg in message["files"]:
        if file_msg.endswith(AUDIO_EXTENSIONS):
            duration = librosa.get_duration(filename=file_msg)
            if duration > 60:
                raise gr.Error("Audio file too long. For efficiency we recommend to use audio < 60s")
            if duration == 0:
                raise gr.Error("Audio file too short")
            audios.append(file_msg)
        elif file_msg.endswith(IMAGE_EXTENSIONS):
            images.append(file_msg)
        else:
            filename = file_msg.split("/")[-1]
            # Fixed: the original f-string had no placeholder and always
            # printed "(unknown)" instead of the offending filename.
            raise gr.Error(f"Unsupported file type: {filename}. It should be an image or audio file.")

    if len(audios) > 1:
        raise gr.Error("Please upload only one audio file.")

    if len(images) > 1:
        raise gr.Error("Please upload only one image file.")

    # A recorded clip and an uploaded audio file are mutually exclusive.
    if audio is not None:
        if len(audios) > 0:
            raise gr.Error("Please upload only one audio file or record audio.")
        audios.append(audio)

    # Append the message to the history; media entries are tagged via
    # metadata["title"] so history2messages() can load them later.
    for image in images:
        history.append({"role": "user", "content": (image,), "metadata": {"title": "image"}})

    for audio in audios:
        history.append({"role": "user", "content": (audio,), "metadata": {"title": "audio"}})

    if message["text"]:
        history.append({"role": "user", "content": message["text"], "metadata": {}})

    return history, gr.MultimodalTextbox(value=None, interactive=False), None
68+
69+
70+
def history2messages(history: List[Dict]) -> List[Dict]:
    """
    Transform gradio chatbot history into chat messages for ``ov_model.chat``.

    Consecutive user entries (text, image, audio) are merged into a single
    user message whose ``content`` is a list; media entries (tagged by
    ``metadata["title"]``) are loaded with ``PIL.Image.open`` / ``librosa.load``.
    Assistant entries are passed through unchanged.

    Parameters
    ----------
    history : List[Dict]
        Gradio history items of the form ``{"role", "content", "metadata"}``.

    Returns
    -------
    List[Dict]
        ``[{"role": "user", "content": [...]}, {"role": "assistant", ...}, ...]``
    """
    messages = []
    cur_message = dict()
    for item in history:
        if item["role"] == "assistant":
            # Flush any pending user message before the assistant turn.
            # Fixed: the original also did
            #   cur_message["content"].append(cur_message["content"])
            # which appended the content list into itself, producing a
            # self-referential list in every flushed user message.
            if len(cur_message) > 0:
                messages.append(deepcopy(cur_message))
                cur_message = dict()
            messages.append({"role": "assistant", "content": item["content"]})
            continue

        if "role" not in cur_message:
            cur_message["role"] = "user"
        if "content" not in cur_message:
            cur_message["content"] = []

        if "metadata" not in item:
            item["metadata"] = {"title": None}
        if item["metadata"].get("title") is None:
            # Plain text entry.
            cur_message["content"].append(item["content"])
        elif item["metadata"]["title"] == "image":
            # content is a 1-tuple holding the image file path.
            cur_message["content"].append(Image.open(item["content"][0]))
        elif item["metadata"]["title"] == "audio":
            # librosa.load returns (waveform, sample_rate).
            cur_message["content"].append(librosa.load(item["content"][0]))
    # Flush a trailing user message that has no assistant reply yet.
    if len(cur_message) > 0:
        messages.append(cur_message)
    return messages
102+
103+
104+
def make_demo(ov_model, processor):
    """
    Build the gradio Blocks demo for chatting with the OpenVINO MiniCPM-o model.

    Parameters:
        ov_model: converted model exposing a streaming ``chat(msgs, tokenizer,
            sampling, stream, **generation_config)`` API — assumed to yield
            text chunks; TODO confirm against the model wrapper.
        processor: HF-style processor; only ``processor.tokenizer`` is used.

    Returns:
        gr.Blocks: the assembled demo (caller launches it).
    """
    def bot(
        history: list,
        top_p: float,
        top_k: int,
        temperature: float,
        repetition_penalty: float,
        max_new_tokens: int = MAX_NEW_TOKENS,
        regenerate: bool = False,
    ):
        # Generator callback: streams the assistant reply into the last
        # history entry, yielding the updated history after each chunk.

        # On "Regenerate", drop the previous assistant reply and redo it.
        if history and regenerate:
            history = history[:-1]

        if not history:
            return history

        msg = history2messages(history)
        generation_config = {
            "top_p": top_p,
            "top_k": top_k,
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens,
            # Greedy decoding when temperature is 0 (the UI default).
            "do_sample": temperature > 0,
        }
        # Placeholder entry that the streamed text is written into.
        history.append({"role": "assistant", "content": ""})
        res = ov_model.chat(
            msgs=msg,
            tokenizer=processor.tokenizer,
            sampling=True,
            stream=True,
            **generation_config,
        )
        generated_text = ""
        for new_text in res:
            generated_text += new_text
            history[-1]["content"] = generated_text
            yield history

    def change_state(state):
        # Toggle visibility of the sampling-parameters group and flip the flag.
        return gr.update(visible=not state), not state

    def reset_user_input():
        # Clears the textbox value. NOTE(review): defined but never wired to
        # an event below — presumably kept for future use.
        return gr.update(value="")

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🪐 Chat with OpenVINO MiniCPM-o")
        chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height="48vh")

        # Tracks whether the sampling-parameters group is currently visible.
        sampling_params_group_hidden_state = gr.State(False)

        with gr.Row(equal_height=True):
            chat_input = gr.MultimodalTextbox(
                file_count="multiple",
                placeholder="Enter your prompt or upload image/audio here, then press ENTER...",
                show_label=False,
                scale=8,
                file_types=["image", "audio"],
                interactive=True,
                # stop_btn=True,
            )
        with gr.Row(equal_height=True):
            # Microphone/upload recorder, capped at 30 s clips.
            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", scale=1, max_length=30)
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, min_width=150):
                with gr.Row(equal_height=True):
                    regenerate_btn = gr.Button("Regenerate", variant="primary")
                    clear_btn = gr.ClearButton([chat_input, audio_input, chatbot])

        with gr.Row():
            sampling_params_toggle_btn = gr.Button("Sampling Parameters")

        # Hidden by default; shown via the toggle button above.
        with gr.Group(visible=False) as sampling_params_group:
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature")
                repetition_penalty = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
                    label="Repetition Penalty",
                )

            with gr.Row():
                top_p = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["top_p"], label="Top-p")
                top_k = gr.Slider(minimum=0, maximum=1000, value=DEFAULT_SAMPLING_PARAMS["top_k"], label="Top-k")

            with gr.Row():
                max_new_tokens = gr.Slider(
                    minimum=1,
                    maximum=MAX_NEW_TOKENS,
                    value=MAX_NEW_TOKENS,
                    label="Max New Tokens",
                    interactive=True,
                )

        sampling_params_toggle_btn.click(
            change_state,
            sampling_params_group_hidden_state,
            [sampling_params_group, sampling_params_group_hidden_state],
        )
        # Event chain: validate input -> stream bot reply -> re-enable textbox.
        chat_msg = chat_input.submit(
            check_messages,
            [chatbot, chat_input, audio_input],
            [chatbot, chat_input, audio_input],
        )

        bot_msg = chat_msg.then(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens],
            outputs=chatbot,
        )

        bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

        # gr.State(True) supplies regenerate=True so bot() redoes the last reply.
        regenerate_btn.click(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens, gr.State(True)],
            outputs=chatbot,
        )
    return demo

0 commit comments

Comments
 (0)