|
| 1 | +from copy import deepcopy |
| 2 | +from typing import Dict, List |
| 3 | +from PIL import Image |
| 4 | +import librosa |
| 5 | +import gradio as gr |
| 6 | + |
# File extensions accepted by the multimodal textbox.  Matching is done with
# str.endswith, so every entry must include the leading dot.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
# Bug fix: "flac" previously lacked its leading dot, which made the suffix
# check also match any filename merely ending in "flac" (e.g. "notflac").
AUDIO_EXTENSIONS = (".mp3", ".wav", ".flac", ".m4a", ".wma")

# Defaults for the "Sampling Parameters" panel sliders.  Note that bot()
# recomputes do_sample from the temperature slider at generation time, so the
# value here is only an initial default.
DEFAULT_SAMPLING_PARAMS = {
    "top_p": 0.0,
    "top_k": 1,
    "temperature": 0.0,
    "do_sample": True,
    "num_beams": 1,
    "repetition_penalty": 1.2,
}
# Upper bound (and default) for the "Max New Tokens" slider.
MAX_NEW_TOKENS = 512
| 19 | + |
| 20 | + |
def check_messages(history, message, audio):
    """Validate a submitted multimodal message and append it to the history.

    Parameters
    ----------
    history : list
        Gradio ``Chatbot`` history (``type="messages"`` dicts).
    message : dict
        ``MultimodalTextbox`` value: ``{"text": str, "files": [path, ...]}``.
    audio : str | None
        Filepath from the microphone/upload ``Audio`` component, if any.

    Returns
    -------
    tuple
        ``(history, cleared textbox, cleared audio)``; the textbox is returned
        non-interactive so the user cannot type while the bot is responding.

    Raises
    ------
    gr.Error
        If the message is empty, a file type is unsupported, an audio clip is
        too long or empty, or more than one image/audio is supplied.
    """
    has_text = message["text"] and message["text"].strip()
    has_files = len(message["files"]) > 0
    has_audio = audio is not None

    if not (has_text or has_files or has_audio):
        raise gr.Error("Message is empty")

    audios = []
    images = []

    for file_msg in message["files"]:
        # Case-insensitive suffix check so uploads like ".WAV"/".PNG" are
        # accepted as well.
        lower_name = file_msg.lower()
        if lower_name.endswith(AUDIO_EXTENSIONS):
            # NOTE(review): ``filename=`` is deprecated in librosa >= 0.10
            # (renamed to ``path=``) — confirm the pinned librosa version.
            duration = librosa.get_duration(filename=file_msg)
            if duration > 60:
                raise gr.Error("Audio file too long. For efficiency we recommend to use audio < 60s")
            if duration == 0:
                raise gr.Error("Audio file too short")
            audios.append(file_msg)
        elif lower_name.endswith(IMAGE_EXTENSIONS):
            images.append(file_msg)
        else:
            # Bug fix: the offending filename was computed but never shown —
            # the message previously printed the literal "(unknown)".
            filename = file_msg.split("/")[-1]
            raise gr.Error(f"Unsupported file type: {filename}. It should be an image or audio file.")

    if len(audios) > 1:
        raise gr.Error("Please upload only one audio file.")

    if len(images) > 1:
        raise gr.Error("Please upload only one image file.")

    # Recorded audio and uploaded audio are mutually exclusive.
    if audio is not None:
        if len(audios) > 0:
            raise gr.Error("Please upload only one audio file or record audio.")
        audios.append(audio)

    # Append each attachment as its own history entry; the "title" metadata
    # tells history2messages() how to load the file later.  (Loop variables
    # renamed so the recorded-audio parameter is not shadowed.)
    for image_path in images:
        history.append({"role": "user", "content": (image_path,), "metadata": {"title": "image"}})

    for audio_path in audios:
        history.append({"role": "user", "content": (audio_path,), "metadata": {"title": "audio"}})

    if message["text"]:
        history.append({"role": "user", "content": message["text"], "metadata": {}})

    return history, gr.MultimodalTextbox(value=None, interactive=False), None
| 68 | + |
| 69 | + |
def history2messages(history: List[Dict]) -> List[Dict]:
    """Transform gradio chat history into model chat messages.

    Consecutive user entries (text / image / audio) are merged into a single
    user message whose ``content`` is a list; each assistant entry first
    flushes the pending user message and is then appended as-is.

    Parameters
    ----------
    history : List[Dict]
        Gradio ``type="messages"`` history.  File entries produced by
        ``check_messages`` carry ``metadata["title"]`` of ``"image"`` or
        ``"audio"`` and store the file path as ``content[0]``.

    Returns
    -------
    List[Dict]
        Messages for ``ov_model.chat(msgs=...)``: images opened with PIL,
        audio decoded with librosa.
    """
    messages = []
    cur_message = {}
    for item in history:
        if item["role"] == "assistant":
            # Flush any accumulated user content before the assistant turn.
            # Bug fix: the content list was previously appended to itself
            # here, producing a self-referential user message.
            if cur_message:
                messages.append(deepcopy(cur_message))
                cur_message = {}
            messages.append({"role": "assistant", "content": item["content"]})
            continue

        cur_message.setdefault("role", "user")
        content = cur_message.setdefault("content", [])

        # Treat a missing or None metadata dict as "plain text"; unlike the
        # original, this does not mutate the caller's history items.
        metadata = item.get("metadata") or {}
        title = metadata.get("title")
        if title is None:
            content.append(item["content"])
        elif title == "image":
            content.append(Image.open(item["content"][0]))
        elif title == "audio":
            # librosa.load returns a (waveform, sample_rate) tuple.
            # NOTE(review): the model may expect only the waveform at a fixed
            # sample rate — confirm against ov_model.chat's contract.
            content.append(librosa.load(item["content"][0]))
    # Trailing user content with no assistant reply yet.
    if cur_message:
        messages.append(cur_message)
    return messages
| 102 | + |
| 103 | + |
def make_demo(ov_model, processor):
    """Build the Gradio Blocks chat demo around an OpenVINO MiniCPM-o model.

    Parameters
    ----------
    ov_model :
        Model object exposing ``chat(msgs=..., tokenizer=..., sampling=...,
        stream=True, **generation_config)`` that returns an iterator of text
        chunks when ``stream=True``.
    processor :
        Processor whose ``tokenizer`` attribute is handed to ``ov_model.chat``.

    Returns
    -------
    The assembled ``gr.Blocks`` demo; the caller is expected to ``.launch()`` it.
    """

    def bot(
        history: list,
        top_p: float,
        top_k: int,
        temperature: float,
        repetition_penalty: float,
        max_new_tokens: int = MAX_NEW_TOKENS,
        regenerate: bool = False,
    ):
        """Stream the assistant reply, yielding the updated history per chunk."""

        # On "Regenerate", drop the last entry so the previous turn is re-answered.
        if history and regenerate:
            history = history[:-1]

        if not history:
            # NOTE(review): bot is a generator, so this ``return`` just ends
            # the stream without emitting an update — confirm intended.
            return history

        msg = history2messages(history)
        generation_config = {
            "top_p": top_p,
            "top_k": top_k,
            "temperature": temperature,
            "repetition_penalty": repetition_penalty,
            "max_new_tokens": max_new_tokens,
            # Greedy decoding when the temperature slider sits at 0.
            "do_sample": temperature > 0,
        }
        # Placeholder assistant entry that the streaming loop fills in below.
        history.append({"role": "assistant", "content": ""})
        res = ov_model.chat(
            msgs=msg,
            tokenizer=processor.tokenizer,
            sampling=True,
            stream=True,
            **generation_config,
        )
        generated_text = ""
        for new_text in res:
            # Accumulate chunks and re-yield the whole history so the Chatbot
            # component re-renders the growing assistant message.
            generated_text += new_text
            history[-1]["content"] = generated_text
            yield history

    def change_state(state):
        # Toggle visibility of the sampling-parameters panel and flip the flag.
        return gr.update(visible=not state), not state

    def reset_user_input():
        # NOTE(review): defined but never wired to any event below — confirm
        # whether this is dead code or intended for a future handler.
        return gr.update(value="")

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🪐 Chat with OpenVINO MiniCPM-o")
        chatbot = gr.Chatbot(elem_id="chatbot", bubble_full_width=False, type="messages", height="48vh")

        # Tracks whether the sampling-parameters panel is currently visible.
        sampling_params_group_hidden_state = gr.State(False)

        with gr.Row(equal_height=True):
            chat_input = gr.MultimodalTextbox(
                file_count="multiple",
                placeholder="Enter your prompt or upload image/audio here, then press ENTER...",
                show_label=False,
                scale=8,
                file_types=["image", "audio"],
                interactive=True,
                # stop_btn=True,
            )
        with gr.Row(equal_height=True):
            # Recorded clips are capped at 30s here; uploaded audio files are
            # separately capped at 60s inside check_messages().
            audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath", scale=1, max_length=30)
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, min_width=150):
                with gr.Row(equal_height=True):
                    regenerate_btn = gr.Button("Regenerate", variant="primary")
                    clear_btn = gr.ClearButton([chat_input, audio_input, chatbot])

        with gr.Row():
            sampling_params_toggle_btn = gr.Button("Sampling Parameters")

        with gr.Group(visible=False) as sampling_params_group:
            with gr.Row():
                temperature = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["temperature"], label="Temperature")
                repetition_penalty = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=DEFAULT_SAMPLING_PARAMS["repetition_penalty"],
                    label="Repetition Penalty",
                )

            with gr.Row():
                top_p = gr.Slider(minimum=0, maximum=1, value=DEFAULT_SAMPLING_PARAMS["top_p"], label="Top-p")
                top_k = gr.Slider(minimum=0, maximum=1000, value=DEFAULT_SAMPLING_PARAMS["top_k"], label="Top-k")

            with gr.Row():
                max_new_tokens = gr.Slider(
                    minimum=1,
                    maximum=MAX_NEW_TOKENS,
                    value=MAX_NEW_TOKENS,
                    label="Max New Tokens",
                    interactive=True,
                )

        # Show/hide the sampling-parameters panel.
        sampling_params_toggle_btn.click(
            change_state,
            sampling_params_group_hidden_state,
            [sampling_params_group, sampling_params_group_hidden_state],
        )
        # Submit pipeline: validate/append the user message, then stream the
        # bot reply, then re-enable the (disabled) input box.
        chat_msg = chat_input.submit(
            check_messages,
            [chatbot, chat_input, audio_input],
            [chatbot, chat_input, audio_input],
        )

        bot_msg = chat_msg.then(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens],
            outputs=chatbot,
        )

        bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

        # gr.State(True) passes regenerate=True so bot() re-answers the last turn.
        regenerate_btn.click(
            bot,
            inputs=[chatbot, top_p, top_k, temperature, repetition_penalty, max_new_tokens, gr.State(True)],
            outputs=chatbot,
        )
    return demo
0 commit comments