diff --git a/.ci/ignore_treon_docker.txt b/.ci/ignore_treon_docker.txt
index d27f40d53ef..54a16ead594 100644
--- a/.ci/ignore_treon_docker.txt
+++ b/.ci/ignore_treon_docker.txt
@@ -86,4 +86,5 @@ notebooks/omniparser/omniparser.ipynb
 notebooks/olmocr-pdf-vlm/olmocr-pdf-vlm.ipynb
 notebooks/minicpm-o-omnimodal-chatbot/minicpm-o-omnimodal-chatbot.ipynb
 notebooks/kokoro/kokoro.ipynb
-notebooks/qwen2.5-omni-chatbot/qwen2.5-omni-chatbot.ipynb
\ No newline at end of file
+notebooks/qwen2.5-omni-chatbot/qwen2.5-omni-chatbot.ipynb
+notebooks/intern-video2-classiciation/intern-video2-classification.ipynb
\ No newline at end of file
diff --git a/.ci/skipped_notebooks.yml b/.ci/skipped_notebooks.yml
index 1f0624755d7..557af667a74 100644
--- a/.ci/skipped_notebooks.yml
+++ b/.ci/skipped_notebooks.yml
@@ -530,9 +530,15 @@
         - macos-13
         - ubuntu-22.04
         - windows-2019
-- notebook: "notebooks/deepseek-vl2/deepseek-vl2.ipynb"
+- notebook: notebooks/deepseek-vl2/deepseek-vl2.ipynb
   skips:
     - os:
         - macos-13
         - ubuntu-22.04
-        - windows-2019
\ No newline at end of file
+        - windows-2019
+- notebook: notebooks/intern-video2-classiciation/intern-video2-classification.ipynb
+  skips:
+    - os:
+        - macos-13
+        - ubuntu-22.04
+        - windows-2019
diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt
index c3e7eaa800c..b05d680384a 100644
--- a/.ci/spellcheck/.pyspelling.wordlist.txt
+++ b/.ci/spellcheck/.pyspelling.wordlist.txt
@@ -85,6 +85,7 @@ BLACKBOX
 boolean
 CatVTON
 CentOS
+centric
 CFG
 charlist
 charlists
@@ -403,6 +404,7 @@ intel
 interactable
 InternLM
 internlm
+InternVideo
 Interpolative
 interpretable
 invertible
@@ -1074,6 +1076,7 @@ vec
 VegaRT
 verovio
 videpth
+ViFM
 VIO
 virtualenv
 VisCPM
diff --git a/notebooks/intern-video2-classiciation/README.md b/notebooks/intern-video2-classiciation/README.md
new file mode 100644
index 00000000000..569424ff4a5
--- /dev/null
+++ b/notebooks/intern-video2-classiciation/README.md
@@ -0,0 +1,26 @@
+# Video Classification with InternVideo2 and OpenVINO
+
+InternVideo2 is a family of video foundation models (ViFM) that achieves state-of-the-art results in video recognition, video-text tasks, and video-centric dialogue.
+You can find more information about the model in the [model card](https://huggingface.co/OpenGVLab/InternVideo2-Stage2_6B), the [paper](https://arxiv.org/pdf/2403.15377), and the original [repository](https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2/multi_modality).
+
+In this tutorial, we consider how to convert, optimize, and run the InternVideo2 Stage2 model for video classification using OpenVINO.
+
+## Notebook contents
+The tutorial consists of the following steps:
+
+- Install requirements
+- Convert and optimize the model
+- Run OpenVINO model inference
+- Launch the interactive demo
+
+In this demonstration, you'll create a text-to-video retrieval pipeline that finds the most suitable text caption for the video content.
+
+The image below illustrates an example of the model inference result.
+![example.png](https://github.com/user-attachments/assets/6720efe0-ab24-4d73-a22f-a8a0499558d8)
+
+## Installation instructions
+This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
+For details, please refer to the [Installation Guide](../../README.md).
+
+
diff --git a/notebooks/intern-video2-classiciation/gradio_helper.py b/notebooks/intern-video2-classiciation/gradio_helper.py
new file mode 100644
index 00000000000..1bbb0e1f8ff
--- /dev/null
+++ b/notebooks/intern-video2-classiciation/gradio_helper.py
@@ -0,0 +1,16 @@
+import gradio as gr
+
+
+def make_demo(classify):
+    demo = gr.Interface(
+        classify,
+        [
+            gr.Video(label="Video"),
+            gr.Textbox(label="Labels", info="Comma-separated list of class labels"),
+        ],
+        gr.Label(label="Result"),
+        examples=[["coco.mp4", "airplane, dog, car"]],
+        allow_flagging="never",
+    )
+
+    return demo
diff --git a/notebooks/intern-video2-classiciation/intern-video2-classification.ipynb b/notebooks/intern-video2-classiciation/intern-video2-classification.ipynb
new file mode 100644
index 00000000000..1b853400a03
--- /dev/null
+++ b/notebooks/intern-video2-classiciation/intern-video2-classification.ipynb
@@ -0,0 +1,739 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Video Classification with InternVideo2 and OpenVINO\n",
+    "\n",
+    "InternVideo2 is a family of video foundation models (ViFM) that achieves state-of-the-art results in video recognition, video-text tasks, and video-centric dialogue.\n",
+    "You can find more information about the model in the [model card](https://huggingface.co/OpenGVLab/InternVideo2-Stage2_6B), the [paper](https://arxiv.org/pdf/2403.15377), and the original [repository](https://github.com/OpenGVLab/InternVideo/tree/main/InternVideo2/multi_modality).\n",
+    "\n",
+    "In this tutorial, we consider how to convert, optimize, and run the InternVideo2 Stage2 model for video classification using OpenVINO.\n",
+    "\n",
+    "#### Table of contents:\n",
+    "\n",
+    "- [Prerequisites](#Prerequisites)\n",
+    "- [Convert model to OpenVINO Intermediate Representation](#Convert-model-to-OpenVINO-Intermediate-Representation)\n",
+    "    - [Compress model weights](#Compress-model-weights)\n",
+    "- [Prepare model inference pipeline](#Prepare-model-inference-pipeline)\n",
+    "    - [Select inference device](#Select-inference-device)\n",
+    "- [Run model inference](#Run-model-inference)\n",
+    "- [Interactive demo](#Interactive-demo)\n",
+    "\n",
+    "\n",
+    "### Installation Instructions\n",
+    "\n",
+    "This is a self-contained example that relies solely on its own code.\n",
+    "\n",
+    "We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.\n",
+    "For details, please refer to the [Installation Guide](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/README.md#-installation-guide).\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "[back to top ⬆️](#Table-of-contents:)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -q \"torch>=2.1\" \"torchvision\" \"opencv-python\" \"transformers>=4.45\" \"einops>=0.7.0\" \"timm>=0.5.4\" \"gradio>=4.19\" --extra-index-url https://download.pytorch.org/whl/cpu\n",
+    "%pip install -q \"openvino>=2025.1.0\" \"nncf>=2.16.0\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import requests\n",
+    "from pathlib import Path\n",
+    "\n",
+    "if not Path(\"ov_internvideo_helper.py\").exists():\n",
+    "    r = requests.get(\n",
+    "        url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/intern-video2-classiciation/ov_internvideo_helper.py\"\n",
+    "    )\n",
+    "    open(\"ov_internvideo_helper.py\", \"w\").write(r.text)\n",
+    "\n",
+    "if not Path(\"gradio_helper.py\").exists():\n",
+    "    r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/intern-video2-classiciation/gradio_helper.py\")\n",
+    "    open(\"gradio_helper.py\", \"w\").write(r.text)\n",
+    "\n",
+    "if not Path(\"notebook_utils.py\").exists():\n",
+    "    r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n",
+    "    open(\"notebook_utils.py\", \"w\").write(r.text)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Convert model to OpenVINO Intermediate Representation\n",
+    "[back to top ⬆️](#Table-of-contents:)\n",
+    "\n",
+    "\n",
+    "InternVideo2 is a PyTorch model. OpenVINO supports PyTorch models via conversion to OpenVINO Intermediate Representation (IR). [OpenVINO model conversion API](https://docs.openvino.ai/2024/openvino-workflow/model-preparation.html#convert-a-model-with-python-convert-model) should be used for these purposes. The `ov.convert_model` function accepts an original PyTorch model instance and example input for tracing and returns an `ov.Model` object representing this model in the OpenVINO framework. The converted model can be saved on disk with the `ov.save_model` function or loaded directly on a device using `core.compile_model`.\n",
+    "\n",
+    "The model consists of two parts:\n",
+    "* **Vision Encoder** for converting video frames into the embedding space\n",
+    "* **Text Encoder** for converting text labels into the embedding space\n",
+    "\n",
+    "The model performs a text-to-video retrieval task by comparing the similarity between text and vision features. To preserve the flexibility of the original model, we will convert each part separately.\n",
+    "The script `ov_internvideo_helper.py` contains helper functions for model conversion; please check its content if you are interested in the conversion details.",
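+    "\n",
+    "\n",
+    "For illustration only (this snippet is not executed in the notebook; a tiny `torch.nn.Linear` stands in for the real model), the generic conversion flow described above looks like this:\n",
+    "\n",
+    "```python\n",
+    "import torch\n",
+    "import openvino as ov\n",
+    "\n",
+    "torch_model = torch.nn.Linear(4, 2).eval()  # stand-in for any torch.nn.Module\n",
+    "example_input = torch.zeros(1, 4)  # example input used for tracing\n",
+    "\n",
+    "ov_model = ov.convert_model(torch_model, example_input=example_input)\n",
+    "ov.save_model(ov_model, \"model.xml\")  # serialize to OpenVINO IR\n",
+    "compiled_model = ov.Core().compile_model(\"model.xml\", \"CPU\")\n",
+    "```"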
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from ov_internvideo_helper import convert_internvideo\n", + "\n", + "# Read more about telemetry collection at https://github.com/openvinotoolkit/openvino_notebooks?tab=readme-ov-file#-telemetry\n", + "from notebook_utils import collect_telemetry\n", + "\n", + "collect_telemetry(\"intern-video2-classification.ipynb\")\n", + "\n", + "# Uncomment the line to see model conversion code\n", + "# ??convert_internvideo" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Compress model weights\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "For reducing memory consumption, weights compression optimization can be applied using [NNCF](https://github.com/openvinotoolkit/nncf). \n", + "\n", + "
\n", + " Click here for more details about weight compression\n", + "Weight compression aims to reduce the memory footprint of a model. It can also lead to significant performance improvement for large memory-bound models, such as Large Language Models (LLMs). LLMs and other models, which require extensive memory to store the weights during inference, can benefit from weight compression in the following ways:\n", + "\n", + "* enabling the inference of exceptionally large models that cannot be accommodated in the memory of the device;\n", + "\n", + "* improving the inference performance of the models by reducing the latency of the memory access when computing the operations with weights, for example, Linear layers.\n", + "\n", + "[Neural Network Compression Framework (NNCF)](https://github.com/openvinotoolkit/nncf) provides 4-bit / 8-bit mixed weight quantization as a compression method primarily designed to optimize LLMs. The main difference between weights compression and full model quantization (post-training quantization) is that activations remain floating-point in the case of weights compression which leads to a better accuracy. Weight compression for LLMs provides a solid inference performance improvement which is on par with the performance of the full model quantization. In addition, weight compression is data-free and does not require a calibration dataset, making it easy to use.\n", + "\n", + "`nncf.compress_weights` function can be used for performing weights compression. The function accepts an OpenVINO model and other compression parameters. Compared to INT8 compression, INT4 compression improves performance even more, but introduces a minor drop in prediction quality.\n", + "\n", + "More details about weights compression, can be found in [OpenVINO documentation](https://docs.openvino.ai/2024/openvino-workflow/model-optimization-guide/weight-compression.html).\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "01a14cfe1d4349d5b154565aa579aad9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Model format:', options=('FP16', 'INT8', 'INT4'), value='FP16')" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ipywidgets as widgets\n", + "\n", + "model_format = widgets.Dropdown(\n", + " options=[\"FP16\", \"INT8\", \"INT4\"],\n", + " default=\"INT4\",\n", + " description=\"Model format:\",\n", + ")\n", + "\n", + "model_id = \"OpenGVLab/InternVideo2-Stage2_6B\"\n", + "model_format" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ea/work/my_optimum_intel/optimum_env_new/lib/python3.11/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers\n", + " warnings.warn(f\"Importing from {__name__} is deprecated, please import via timm.layers\", FutureWarning)\n", + "/home/ea/.cache/huggingface/modules/transformers_modules/InternVideo2-Stage2_6B/modeling_internvideo2.py:508: FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n", + " @torch.cuda.amp.autocast(enabled=False)\n", + "/home/ea/work/my_optimum_intel/optimum_env_new/lib/python3.11/site-packages/transformers/configuration_utils.py:311: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FusedMLP of flash_attn is not installed!!!\n", + "DropoutAddRMSNorm of flash_attn is not installed!!!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n", + " - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes\n", + " - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n", + " - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n", + "Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']\n", + "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of BertForMaskedLM were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['bert.encoder.layer.19.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.19.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.19.crossattention.output.dense.bias', 'bert.encoder.layer.19.crossattention.output.dense.weight', 'bert.encoder.layer.19.crossattention.self.key.bias', 'bert.encoder.layer.19.crossattention.self.key.weight', 'bert.encoder.layer.19.crossattention.self.query.bias', 'bert.encoder.layer.19.crossattention.self.query.weight', 'bert.encoder.layer.19.crossattention.self.value.bias', 'bert.encoder.layer.19.crossattention.self.value.weight', 'bert.encoder.layer.20.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.20.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.20.crossattention.output.dense.bias', 'bert.encoder.layer.20.crossattention.output.dense.weight', 'bert.encoder.layer.20.crossattention.self.key.bias', 'bert.encoder.layer.20.crossattention.self.key.weight', 'bert.encoder.layer.20.crossattention.self.query.bias', 'bert.encoder.layer.20.crossattention.self.query.weight', 'bert.encoder.layer.20.crossattention.self.value.bias', 'bert.encoder.layer.20.crossattention.self.value.weight', 'bert.encoder.layer.21.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.21.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.21.crossattention.output.dense.bias', 'bert.encoder.layer.21.crossattention.output.dense.weight', 'bert.encoder.layer.21.crossattention.self.key.bias', 'bert.encoder.layer.21.crossattention.self.key.weight', 'bert.encoder.layer.21.crossattention.self.query.bias', 'bert.encoder.layer.21.crossattention.self.query.weight', 'bert.encoder.layer.21.crossattention.self.value.bias', 'bert.encoder.layer.21.crossattention.self.value.weight', 'bert.encoder.layer.22.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.22.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.22.crossattention.output.dense.bias', 'bert.encoder.layer.22.crossattention.output.dense.weight', 'bert.encoder.layer.22.crossattention.self.key.bias', 'bert.encoder.layer.22.crossattention.self.key.weight', 'bert.encoder.layer.22.crossattention.self.query.bias', 'bert.encoder.layer.22.crossattention.self.query.weight', 'bert.encoder.layer.22.crossattention.self.value.bias', 'bert.encoder.layer.22.crossattention.self.value.weight', 'bert.encoder.layer.23.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.23.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.23.crossattention.output.dense.bias', 'bert.encoder.layer.23.crossattention.output.dense.weight', 'bert.encoder.layer.23.crossattention.self.key.bias', 'bert.encoder.layer.23.crossattention.self.key.weight', 'bert.encoder.layer.23.crossattention.self.query.bias', 'bert.encoder.layer.23.crossattention.self.query.weight', 'bert.encoder.layer.23.crossattention.self.value.bias', 'bert.encoder.layer.23.crossattention.self.value.weight', 'cls.predictions.decoder.weight']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "data": { + 
"application/vnd.jupyter.widget-view+json": { + "model_id": "06b22b2abc634783bca0a9565cd7b969", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/13 [00:00= fnum + step = len(vid_list) // fnum + vid_list = vid_list[::step][:fnum] + vid_list = [cv2.resize(x[:, :, ::-1], target_size) for x in vid_list] + vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list] + vid_tube = np.concatenate(vid_tube, axis=1) + vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3)) + vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float() + return vid_tube + + +def vid2tensor(path: str, fnum: int = 8, target_size: tuple = (224, 224), device=torch.device("cuda")): + video = cv2.VideoCapture(path) + frames = [x for x in _frame_from_video(video)] + return frames2tensor(frames, fnum, target_size, device) + + +def get_text_feat_dict(texts, clip, text_feat_d={}): + for t in texts: + feat = clip.get_txt_feat(t) + text_feat_d[t] = feat + return text_feat_d + + +def get_vid_feat(frames, vlm): + return vlm.get_vid_features(frames) + + +def retrieve_text(frames, texts, vlm, topk: int = 5): + + config = vlm._config + + fn = config.num_frames + size_t = config.size_t + frames_tensor = frames2tensor(frames, fnum=fn, target_size=(size_t, size_t), device=torch.device("cpu")) + vid_feat = vlm.get_vid_feat(frames_tensor) + + text_feat_d = {} + text_feat_d = get_text_feat_dict(texts, vlm, text_feat_d) + text_feats = [text_feat_d[t] for t in texts] + text_feats_tensor = torch.cat(text_feats, 0) + + probs, idxs = vlm.predict_label(vid_feat, text_feats_tensor, top=topk) + + ret_texts = [texts[i] for i in idxs.long().numpy()[0].tolist()] + return ret_texts, probs.float().numpy()[0] + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + +PRETRAINED_VOCAB_FILES_MAP = { + "vocab_file": { + "bert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt", + "bert-large-uncased": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt", + "bert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt", + "bert-large-cased": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt", + "bert-base-multilingual-uncased": "https://huggingface.co/bert-base-multilingual-uncased/resolve/main/vocab.txt", + "bert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt", + "bert-base-chinese": "https://huggingface.co/bert-base-chinese/resolve/main/vocab.txt", + "bert-base-german-cased": "https://huggingface.co/bert-base-german-cased/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking": "https://huggingface.co/bert-large-uncased-whole-word-masking/resolve/main/vocab.txt", + "bert-large-cased-whole-word-masking": "https://huggingface.co/bert-large-cased-whole-word-masking/resolve/main/vocab.txt", + "bert-large-uncased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", + "bert-large-cased-whole-word-masking-finetuned-squad": "https://huggingface.co/bert-large-cased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt", + "bert-base-cased-finetuned-mrpc": "https://huggingface.co/bert-base-cased-finetuned-mrpc/resolve/main/vocab.txt", + "bert-base-german-dbmdz-cased": "https://huggingface.co/bert-base-german-dbmdz-cased/resolve/main/vocab.txt", + "bert-base-german-dbmdz-uncased": "https://huggingface.co/bert-base-german-dbmdz-uncased/resolve/main/vocab.txt", + 
"TurkuNLP/bert-base-finnish-cased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-cased-v1/resolve/main/vocab.txt", + "TurkuNLP/bert-base-finnish-uncased-v1": "https://huggingface.co/TurkuNLP/bert-base-finnish-uncased-v1/resolve/main/vocab.txt", + "wietsedv/bert-base-dutch-cased": "https://huggingface.co/wietsedv/bert-base-dutch-cased/resolve/main/vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "bert-base-uncased": 512, + "bert-large-uncased": 512, + "bert-base-cased": 512, + "bert-large-cased": 512, + "bert-base-multilingual-uncased": 512, + "bert-base-multilingual-cased": 512, + "bert-base-chinese": 512, + "bert-base-german-cased": 512, + "bert-large-uncased-whole-word-masking": 512, + "bert-large-cased-whole-word-masking": 512, + "bert-large-uncased-whole-word-masking-finetuned-squad": 512, + "bert-large-cased-whole-word-masking-finetuned-squad": 512, + "bert-base-cased-finetuned-mrpc": 512, + "bert-base-german-dbmdz-cased": 512, + "bert-base-german-dbmdz-uncased": 512, + "TurkuNLP/bert-base-finnish-cased-v1": 512, + "TurkuNLP/bert-base-finnish-uncased-v1": 512, + "wietsedv/bert-base-dutch-cased": 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + "bert-base-uncased": {"do_lower_case": True}, + "bert-large-uncased": {"do_lower_case": True}, + "bert-base-cased": {"do_lower_case": False}, + "bert-large-cased": {"do_lower_case": False}, + "bert-base-multilingual-uncased": {"do_lower_case": True}, + "bert-base-multilingual-cased": {"do_lower_case": False}, + "bert-base-chinese": {"do_lower_case": False}, + "bert-base-german-cased": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking": {"do_lower_case": False}, + "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True}, + "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False}, + "bert-base-cased-finetuned-mrpc": {"do_lower_case": False}, + "bert-base-german-dbmdz-cased": {"do_lower_case": False}, + "bert-base-german-dbmdz-uncased": {"do_lower_case": True}, + "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False}, + "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True}, + "wietsedv/bert-base-dutch-cased": {"do_lower_case": False}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BasicTokenizer(object): + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + Args: + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. 
If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see + WordPieceTokenizer. + Args: + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + :func:`PreTrainedTokenizer.tokenize`) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if never_split is not None and text in never_split: + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. 
Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`. + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer. Based on WordPiece. + This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. + Users should refer to this superclass for more information regarding those methods. + Args: + vocab_file (:obj:`str`): + File containing the vocabulary. + do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to do basic tokenization before WordPiece. + never_split (:obj:`Iterable`, `optional`): + Collection of tokens which will never be split during tokenization. Only has an effect when + :obj:`do_basic_tokenize=True` + unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. 
It is also used as the last + token of a sequence built with special tokens. + pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`): + Whether or not to tokenize Chinese characters. + This should likely be deactivated for Japanese (see this `issue + `__). + strip_accents: (:obj:`bool`, `optional`): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for :obj:`lowercase` (as in the original BERT). + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file) + ) + self.vocab = load_vocab(vocab_file) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None) -> list[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + - single sequence: ``[CLS] X `` + - pair of sequences: ``[CLS] A [SEP] B [SEP]`` + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False) -> list[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` method. + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. 
+ token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None) -> list[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + :: + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s). + Args: + token_ids_0 (:obj:`List[int]`): + List of IDs. + token_ids_1 (:obj:`List[int]`, `optional`): + Optional second list of IDs for sequence pairs. + Returns: + :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given + sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join(save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + print( + "Saving vocabulary to {}: vocabulary indices are not consecutive." 
+ " Please check that the vocabulary is not corrupted!".format(vocab_file) + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +def patch_model_code(model_dir): + modeling_file = model_dir / "modeling_internvideo2.py" + orig_modeling_file = model_dir / "orig_modeling_internvideo2.py" + if not orig_modeling_file.exists(): + modeling_file.rename(orig_modeling_file) + with orig_modeling_file.open("r") as in_f: + content = in_f.read() + content = content.replace( + "self.tokenizer = BertTokenizer.from_pretrained(self._config.model.text_encoder.pretrained, local_files_only=True, use_safetensors=True)", + "self.tokenizer = BertTokenizer.from_pretrained(self._config.model.text_encoder.pretrained, use_safetensors=True)", + ) + content = content.replace( + "from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func\nfrom flash_attn.bert_padding import unpad_input, pad_input", + "try:\n from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func\n from flash_attn.bert_padding import unpad_input, pad_input\n flash_attn_available=True\nexcept:\n flash_attn_available = False", + ) + content = content.replace("self.use_flash_attn = use_flash_attn", "self.use_flash_attn = use_flash_attn and flash_attn_available") + with modeling_file.open("w") as out_f: + out_f.write(content) + orig_config_file = model_dir / "orig_config.json" + config_file = model_dir / "config.json" + if not orig_config_file.exists(): + config_file.rename(orig_config_file) + with orig_config_file.open("r") as in_f: + content = in_f.read() + configs_dir = model_dir / "configs" + content = content.replace('"configs/', f'"{configs_dir.absolute()}/') + with config_file.open("w") as out_f: + out_f.write(content) + + +def convert_internvideo(model_id, output_dir, weights_compression_config=None): + output_dir = Path(output_dir) + model_dir = Path(model_id.split("/")[-1]) + if not (output_dir / VISION_ENCODER_NAME).exists() or not (output_dir / TEXT_ENCODER_NAME).exists(): + output_dir.mkdir(exist_ok=True, parents=True) + print(f"⌛ {model_id} conversion started. 
Be patient, it may takes some time.") + print("⌛ Load Original model") + if not model_dir.exists(): + hf_hub.snapshot_download(model_id, local_dir=model_dir) + patch_model_code(model_dir) + + shutil.copy(model_dir / "config.json", output_dir / "config.json") + shutil.copytree(model_dir / "configs", output_dir / "configs", dirs_exist_ok=True) + shutil.copy(model_dir / "modeling_internvideo2.py", output_dir / "modeling_internvideo2.py") + model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).eval() + model._config.device = torch.device("cpu") + model.tokenizer.save_pretrained(output_dir) + print("✅ Original model successfully loaded") + + if not (output_dir / VISION_ENCODER_NAME).exists(): + print("⌛ Convert Vision Encoder model") + + model.forward = model.get_vid_feat + + def _naive_attn(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + attention_mask = torch.zeros(q.size(-2), k.size(-2)) + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, scale=self.scale, is_causal=False, attn_mask=attention_mask) + x = x.transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def cross_attn_forward(self, x, k=None, v=None): + B, N, C = x.shape + N_k = k.shape[1] + N_v = v.shape[1] + + q_bias, k_bias, v_bias = None, None, None + if self.q_bias is not None: + q_bias = self.q_bias + k_bias = self.k_bias + v_bias = self.v_bias + + q = torch.nn.functional.linear(input=x, weight=self.q.weight, bias=q_bias) + q = q.reshape(B, N, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) # (B, N_head, N_q, dim) + + k = torch.nn.functional.linear(input=k, weight=self.k.weight, bias=k_bias) + k = k.reshape(B, N_k, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) + + v = torch.nn.functional.linear(input=v, weight=self.v.weight, bias=v_bias) + v = v.reshape(B, N_v, 1, self.num_heads, -1).permute(2, 0, 3, 1, 4).squeeze(0) + + x = torch.nn.functional.scaled_dot_product_attention(q, k, v, scale=self.scale, is_causal=False) + + x = x.transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + for block in model.vision_encoder.blocks: + block.with_cp = False + block.attn._naive_attn = types.MethodType(_naive_attn, block.attn) + + model.vision_encoder.clip_projector.cross_attn.forward = types.MethodType(cross_attn_forward, model.vision_encoder.clip_projector.cross_attn) + vision_encoder = ov.convert_model(model, example_input=torch.zeros([1, 4, 3, 224, 224])) + if weights_compression_config is not None: + vision_encoder = nncf.compress_weights(vision_encoder, **weights_compression_config) + ov.save_model(vision_encoder, output_dir / VISION_ENCODER_NAME) + del vision_encoder + cleanup_torchscript_cache() + del model.vision_encoder + gc.collect() + print("✅ Vision Encoder model successfully converted") + + if not (output_dir / TEXT_ENCODER_NAME).exists(): + print("⌛ Convert Text Encoder model") + + def forward(self, input_ids, attention_mask): + text_output = self.get_text_encoder()( + input_ids, + attention_mask=attention_mask, + return_dict=True, + mode="text", + ) + text_embeds = text_output.last_hidden_state + pooled_text_embeds = 
text_embeds[:, 0] + tfeat = self.text_proj(pooled_text_embeds) + return tfeat / tfeat.norm(dim=-1, keepdim=True) + + model.forward = types.MethodType(forward, model) + + attention_mask = torch.ones([2, 40], dtype=torch.long) + attention_mask[:, -10:] = 0 + text_encoder = ov.convert_model(model, example_input={"input_ids": torch.ones([2, 40], dtype=torch.long), "attention_mask": attention_mask}) + + ov.save_model(text_encoder, output_dir / TEXT_ENCODER_NAME) + print("✅ Text Encoder model successfully converted") + print(f"✅ {model_id} model conversion finished. You can find results in {output_dir}") + else: + print(f"✅ {model_id} model already converted. You can find results in {output_dir}") + + +core = ov.Core() + + +class OVInternVideoStage2: + def __init__(self, model_dir, device_map="CPU", ov_config=None): + model_dir = Path(model_dir) + if isinstance(device_map, str): + device = device_map + device_map = {"text_encoder": device.upper(), "vision_encoder": device.upper(), "text_proj": device.upper()} + config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True).to_dict() + self._config = DictToClass(config) if isinstance(config, dict) else config + self._config.device = torch.device("cpu") + self.tokenizer = BertTokenizer.from_pretrained(self._config.model.text_encoder.pretrained, use_safetensors=True) + self.text_encoder = core.compile_model(model_dir / TEXT_ENCODER_NAME, device_map["text_encoder"], config=ov_config) + self.vision_encoder = core.compile_model(model_dir / VISION_ENCODER_NAME, device_map["vision_encoder"], config=ov_config) + + def get_vid_feat(self, frames: torch.Tensor): + """get the video features for the given frames. + + Args: + frames (torch.Tensor): The input frames. Shape: [B,T,C,H,W]. + + Returns: tuple. + - pooled_vision_embeds (torch.Tensor): The pooled output features. Shape: [B,1,C]. + + """ + vfeat = self.vision_encoder(frames)[0] + return torch.from_numpy(vfeat) + + def get_txt_feat(self, text: str): + """get the text features for the given text.""" + text = self.tokenizer( + text, + padding="max_length", + truncation=True, + max_length=self._config.max_txt_l, + return_tensors="pt", + ).to(self._config.device) + return torch.from_numpy(self.text_encoder({"input_ids": text.input_ids, "attention_mask": text.attention_mask})[0]) + + def predict_label(self, vid_feat: torch.Tensor, txt_feat: torch.Tensor, top: int = 5): + label_probs = (100 * vid_feat @ txt_feat.T).softmax(dim=-1) + top_probs, top_labels = label_probs.float().cpu().topk(top, dim=-1) + return top_probs, top_labels
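+
+
+# Usage sketch (illustrative only): "coco.mp4" and the output directory below are placeholders.
+# It shows how the helpers in this module fit together: convert the model to OpenVINO IR once,
+# compile the encoders, and score a video against a set of candidate text labels.
+if __name__ == "__main__":
+    ov_model_dir = Path("InternVideo2-Stage2_6B-ov")
+    convert_internvideo("OpenGVLab/InternVideo2-Stage2_6B", ov_model_dir)
+
+    ov_pipeline = OVInternVideoStage2(ov_model_dir, device_map="CPU")
+    video = cv2.VideoCapture("coco.mp4")
+    frames = [frame for frame in _frame_from_video(video)]
+    labels, probs = retrieve_text(frames, ["airplane", "dog", "car"], ov_pipeline, topk=3)
+    for label, prob in zip(labels, probs):
+        print(f"{label}: {prob:.3f}")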