diff --git a/fengshen/examples/Lyrics/pretrain_lyrics.py b/fengshen/examples/Lyrics/pretrain_lyrics.py new file mode 100644 index 0000000..04c34f6 --- /dev/null +++ b/fengshen/examples/Lyrics/pretrain_lyrics.py @@ -0,0 +1,217 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.loggers import ( + WandbLogger +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.lyrics.modeling_lyrics import LyricsQFromerForPretrain +import fengshen.models.lyrics.groundingdino.transforms as T +from fengshen.models.lyrics.configuration_lyrics import LyricsConfig +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +import argparse +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import numpy as np +from torchvision.transforms import Normalize, Compose, Resize, RandomResizedCrop, InterpolationMode, ToTensor, RandomHorizontalFlip +from PIL import Image +from transformers import BertTokenizer, Blip2Processor +from torch.utils.data._utils.collate import default_collate +import os +import torch + +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + +class TensorObject(object): + def __init__(self, tensor: torch.Tensor): + self.data = tensor + +class Collator(): + def __init__(self, args): + self.transforms = Blip2Processor.from_pretrained(args.model_path) + self.grounding_transforms = T.Compose( + [ + T.RandomResize([800], max_size=1333), + # T.RandomResize([800]), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + self.ram_transforms = Compose([ + Resize((384, 384)), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def __call__(self, inputs): + # samples = [] + image = [] + grounding_image = [] + ram_image = [] + input_captions = [] + input_languages = [] + + ran = None + for (cnt, i) in enumerate(inputs): + if 'npy_path' in i: + instance_image = Image.fromarray(np.load(i['npy_path'])) + elif 'img_path' in i: + try: + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + elif "image" in i and i["image"] is not None: + instance_image = i["image"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "img" in i and i["img"] is not None: + instance_image = i["img"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + else: + raise ValueError('no img path in samples') + + if 'blip_caption' in i: + try: + loc = torch.multinomial(torch.tensor(i['blip_scores']), 1) + caption = i['blip_caption'][loc] + language = 'zh' + except Exception: + caption = '' + print(i) + elif 'caption' in i: + caption = i['caption'] + language = 'en' + elif 'caption_zh' in i: + caption = i['caption_zh'] + language = 'zh' + image.append(self.transforms(instance_image, return_tensors="pt")['pixel_values'][0]) + grounding_image.append(self.grounding_transforms(instance_image, None)[0]) + ram_image.append(self.ram_transforms(instance_image)) + input_captions.append(caption) + input_languages.append(language) + model_inputs = { + "image": torch.stack(image), + "grounding_image": grounding_image, + "ram_image": 
torch.stack(ram_image), + "caption": input_captions, + "language": input_languages, + } + return model_inputs + + +class LyricsQFromer(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('LyricsQFromer') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + # self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path) + tokenizer = BertTokenizer.from_pretrained(os.path.join(args.model_path, 'tokenizer')) + self.model.tokenizer = tokenizer + self.model.box_threshold = 0.25 + self.model.text_threshold = 0.2 + self.model.iou_threshold = 0.6 + + if args.freeze_image_tower: + self.model.vision_model.eval() + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.ram.eval() + for param in self.model.ram.parameters(): + param.requires_grad = False + self.model.grounding_dino.eval() + for param in self.model.grounding_dino.parameters(): + param.requires_grad = False + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + self.steps_per_epoch = self.total_steps // self.trainer.max_epochs + print('Total steps: {}'.format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def training_step(self, batch): + output = self.model(**batch) + self.log('train/loss_itc', output.loss_itc) + self.log('train/loss_itm', output.loss_itm) + self.log('train/loss_lm', output.loss_lm) + self.log('train/loss_mlm', output.loss_mlm) + self.log('train/loss', output.loss) + if self.trainer.global_rank == 0: + if self.trainer.global_step % 1000 == 0: + print('loss_itc:', output.loss_itc) + print('loss_itm:', output.loss_itm) + print('loss_lm:', output.loss_lm) + print('loss_mlm:', output.loss_mlm) + return output.loss + + def validation_step(self, batch, batch_idx): + raise NotImplementedError("validation is not implemented") + + def on_load_checkpoint(self, checkpoint) -> None: + # Compatibility with older lightning versions, which reset the step count to 0 when resuming from a ckpt + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # Also export the weights in HuggingFace format when saving a checkpoint + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = LyricsQFromer.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # wandb_logger = WandbLogger(project="ditto_pretrain") # initialize a WandbLogger object + 
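A minimal sketch of how a collated batch feeds the pretraining model, assuming args points at a valid Lyrics checkpoint and that the sample dict and demo.jpg path (both hypothetical) resolve to real data:

samples = [{'img_path': 'demo.jpg', 'caption': 'a dog running on the grass'}]
batch = Collator(args)(samples)
# batch['image']:           (B, 3, H, W) pixel values for the ViT / Q-Former branch
# batch['grounding_image']: list of variable-size tensors for Grounding DINO
# batch['ram_image']:       (B, 3, 384, 384) tensors for the RAM tagger
module = LyricsQFromer(args)
out = module.model(**batch)
print(out.loss_itc, out.loss_itm, out.loss_lm, out.loss_mlm)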
trainer = Trainer.from_argparse_args(args, + # logger=wandb_logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = LyricsQFromer(args) + collate_fn = Collator(args) + datasets = load_data(args, global_rank=trainer.global_rank) + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + trainer.fit(model, datamoule) + # trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) \ No newline at end of file diff --git a/fengshen/examples/Lyrics/pretrain_lyrics_stage2.py b/fengshen/examples/Lyrics/pretrain_lyrics_stage2.py new file mode 100644 index 0000000..ccd7da8 --- /dev/null +++ b/fengshen/examples/Lyrics/pretrain_lyrics_stage2.py @@ -0,0 +1,311 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.loggers import ( + WandbLogger +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.lyrics.modeling_lyrics import LyricsLMForConditionalGeneration +import fengshen.models.lyrics.groundingdino.transforms as T +from fengshen.models.lyrics.configuration_lyrics import LyricsConfig +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +import argparse +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import numpy as np +from torchvision.transforms import Normalize, Compose, Resize, RandomResizedCrop, InterpolationMode, ToTensor, RandomHorizontalFlip +from PIL import Image +from transformers import BertTokenizer, Blip2Processor, InstructBlipProcessor, InstructBlipForConditionalGeneration, LlamaTokenizer +from torch.utils.data._utils.collate import default_collate +import os +import torch +import random + +# BlipImageProcessor +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + +class TensorObject(object): + def __init__(self, tensor: torch.Tensor): + self.data = tensor + +class Collator(): + def __init__(self, args): + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + self.eos_token = self.processor.tokenizer.eos_token + self.grounding_transforms = T.Compose( + [ + T.RandomResize([800], max_size=1333), + # T.RandomResize([800]), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + self.ram_transforms = Compose([ + Resize((384, 384)), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.max_txt_len = 24 #128 + self.max_output_txt_len = 40 #256 + self.prompts = { + "zh":[ + "在本任务中,您将获得一张图片,您的任务是生成该图片的描述。", + "在这项任务中,您将获得一篇图片。你的任务是用一句话概括这张图片。", + "为给定的图片生成一个适当的描述。", + "本任务中,您将获得一张图片。你的任务是描述它。", + "这张图片的内容是什么。", + "请简单描述一下这张图片。", + ], + "en": [ + 'A short image caption:', + 'A short image description:', + 'A photo of', + 'An image that shows', + 'Write a short description for the image.', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the 
picture.', + ] + } + self.stage = 'first' # 一阶段和二阶段代表有无instruct部分 + + # 只需要写qa拼接逻辑 + def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts): + input_part_targets_len = [] + llm_tokens = {"input_ids": [], "attention_mask": []} + for i in range(input_ids.size(0)): + this_input_ones = input_atts[i].sum() + input_part_targets_len.append(this_input_ones) + llm_tokens['input_ids'].append( + torch.cat([ + input_ids[i][:this_input_ones], + output_ids[i][1:], + input_ids[i][this_input_ones:] + ]) + ) + llm_tokens['attention_mask'].append( + torch.cat([ + input_atts[i][:this_input_ones], + output_atts[i][1:], + input_atts[i][this_input_ones:] + ]) + ) + llm_tokens['input_ids'] = torch.stack(llm_tokens['input_ids']) + llm_tokens['attention_mask'] = torch.stack(llm_tokens['attention_mask']) + return llm_tokens, input_part_targets_len + + def __call__(self, inputs): + # samples = [] + images = [] + grounding_pixel_values = [] + ram_pixel_values = [] + questions = [] + answers = [] + + for (cnt, i) in enumerate(inputs): + if 'npy_path' in i: + instance_image = Image.fromarray(np.load(i['npy_path'])) + elif 'img_path' in i: + try: + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + elif "image" in i and i["image"] is not None: + instance_image = i["image"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "img" in i and i["img"] is not None: + instance_image = i["img"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + else: + raise ValueError('no img path in samples') + + if 'caption' in i: + answer = i['caption'] + ' ' + self.eos_token + prompts = self.prompts['en'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'caption_zh' in i: + answer = i['caption_zh'] + ' ' + self.eos_token + prompts = self.prompts['zh'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'text' in i: + answer = i['text']['answer'] + ' ' + self.eos_token + prompt = i['text']['question'] + # elif 'caption_zh' in i: + # caption = i['caption_zh'] + + images.append(instance_image) + grounding_pixel_values.append(self.grounding_transforms(instance_image, None)[0]) + ram_pixel_values.append(self.ram_transforms(instance_image)) + questions.append(prompt) + answers.append(answer) + + self.processor.tokenizer.truncation_side = "left" + text_input_tokens = self.processor(text=questions, padding="longest", truncation=True, max_length=self.max_txt_len, return_tensors="pt") + # print('text_input_tokens:', text_input_tokens.input_ids) + + self.processor.tokenizer.truncation_side = 'right' + text_output_tokens = self.processor(text=answers, padding="longest", truncation=True, max_length=self.max_output_txt_len, return_tensors="pt") + # print('text_output_tokens:', text_output_tokens.input_ids) + llm_tokens, input_part_targets_len = self.concat_text_input_output( + text_input_tokens.input_ids, + text_input_tokens.attention_mask, + text_output_tokens.input_ids, + text_output_tokens.attention_mask, + ) + + labels = llm_tokens['input_ids'].masked_fill( + llm_tokens['input_ids'] == self.processor.tokenizer.pad_token_id, -100 + ) + for i, l in enumerate(input_part_targets_len): + labels[i][:l] = -100 + # labels = text_input_tokens.input_ids + + images_pixel_values = self.processor.image_processor(images=images, return_tensors="pt") + + model_inputs = { + "pixel_values":images_pixel_values['pixel_values'], + 
"grounding_pixel_values": grounding_pixel_values, + "ram_pixel_values": torch.stack(ram_pixel_values), + "input_ids": llm_tokens['input_ids'], + "attention_mask": llm_tokens['attention_mask'], + "labels": labels, + } + return model_inputs + + +class Lyrics(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Lyrics') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + parser.add_argument('--freeze_qformer', default=False, action='store_true') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + # self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.model = LyricsLMForConditionalGeneration.from_pretrained(args.model_path) + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + + self.model.box_threshold = 0.25 + self.model.text_threshold = 0.2 + self.model.iou_threshold = 0.6 + + if args.freeze_image_tower: + self.model.vision_model.eval() + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.ram.eval() + for param in self.model.ram.parameters(): + param.requires_grad = False + self.model.grounding_dino.eval() + for param in self.model.grounding_dino.parameters(): + param.requires_grad = False + if args.freeze_qformer: + # freeze qformer, minigpt4 + self.model.qformer.eval() + self.model.qformer.requires_grad_(False) + self.model.query_tokens.requires_grad_(False) + self.model.language_model.eval() + self.model.language_model.requires_grad_(False) + + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + self.steps_per_epoch = self.total_steps // self.trainer.max_epochs + print('Total steps: {}' .format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def detokenize(self, token_ids): + toks = self.processor.tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.tokenizer.convert_tokens_to_string(toks) + + def qformer_detokenize(self, token_ids): + toks = self.processor.qformer_tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.qformer_tokenizer.convert_tokens_to_string(toks) + + def training_step(self, batch): + if self.trainer.global_rank == 0: + global SHOW_DATA + if self.trainer.global_step % 1000 == 0: + SHOW_DATA = True + print(f"input_ids: {batch['input_ids'][0]}") + print(f"input: {self.detokenize(batch['input_ids'][0])}") + print(f"labels: {batch['labels'][0]}") + + output = self.model(**batch) + return output.loss + + def validation_step(self, batch, batch_idx): + raise Exception("not impl") + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # 保存的时候把权重按huggingface的形式保存出来 + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + + +if __name__ == '__main__': + 
args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = Lyrics.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # wandb_logger = WandbLogger(project="Lyrics") # 初始化个WandbLogger对象 + trainer = Trainer.from_argparse_args(args, + # logger=wandb_logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = Lyrics(args) + collate_fn = Collator(args) + datasets = load_data(args, global_rank=trainer.global_rank) + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + # trainer.fit(model, datamoule) + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) \ No newline at end of file diff --git a/fengshen/examples/Lyrics/pretrain_lyrics_stage2_instruct.py b/fengshen/examples/Lyrics/pretrain_lyrics_stage2_instruct.py new file mode 100644 index 0000000..0fa31c2 --- /dev/null +++ b/fengshen/examples/Lyrics/pretrain_lyrics_stage2_instruct.py @@ -0,0 +1,369 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.loggers import ( + WandbLogger +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.lyrics.modeling_lyrics import LyricsLMForConditionalGeneration +import fengshen.models.lyrics.groundingdino.transforms as T +from fengshen.models.lyrics.configuration_lyrics import LyricsConfig +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +import argparse +from peft import LoraConfig, get_peft_config, get_peft_model +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import numpy as np +from torchvision.transforms import Normalize, Compose, Resize, RandomResizedCrop, InterpolationMode, ToTensor, RandomHorizontalFlip +from PIL import Image +from transformers import BertTokenizer, Blip2Processor, InstructBlipProcessor, InstructBlipForConditionalGeneration, LlamaTokenizer +from torch.utils.data._utils.collate import default_collate +import os +import torch +import random +from io import BytesIO +from base64 import b64decode + +# BlipImageProcessor +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + +class Collator(): + def __init__(self, args): + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + self.eos_token = self.processor.tokenizer.eos_token + self.grounding_transforms = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + self.ram_transforms = Compose([ + Resize((384, 384)), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.max_txt_len = 24 + self.max_output_txt_len = 40 + self.prompts = { + "zh":[ + "在本任务中,您将获得一张图片,您的任务是生成该图片的描述。", + "在这项任务中,您将获得一篇图片。你的任务是用一句话概括这张图片。", + "为给定的图片生成一个适当的描述。", + "本任务中,您将获得一张图片。你的任务是描述它。", + "这张图片的内容是什么。", + "请简单描述一下这张图片。", + ], + "en": 
[ + 'A short image caption:', + 'A short image description:', + 'A photo of', + 'An image that shows', + 'Write a short description for the image.', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the picture.', + ] + } + self.stage = 'second' # 一阶段和二阶段代表有无instruct部分 + + def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts): + input_part_targets_len = [] + llm_tokens = {"input_ids": [], "attention_mask": []} + for i in range(input_ids.size(0)): + this_input_ones = input_atts[i].sum() + input_part_targets_len.append(this_input_ones) + llm_tokens['input_ids'].append( + torch.cat([ + input_ids[i][:this_input_ones], + output_ids[i][1:], + input_ids[i][this_input_ones:] + ]) + ) + llm_tokens['attention_mask'].append( + torch.cat([ + input_atts[i][:this_input_ones], + output_atts[i][1:], + input_atts[i][this_input_ones:] + ]) + ) + llm_tokens['input_ids'] = torch.stack(llm_tokens['input_ids']) + llm_tokens['attention_mask'] = torch.stack(llm_tokens['attention_mask']) + return llm_tokens, input_part_targets_len + + def __call__(self, inputs): + # samples = [] + images = [] + grounding_pixel_values = [] + ram_pixel_values = [] + questions = [] + answers = [] + + for (cnt, i) in enumerate(inputs): + if 'npy_path' in i: + instance_image = Image.fromarray(np.load(i['npy_path'])) + elif 'img_path' in i: + try: + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + elif "image" in i and i["image"] is not None: + instance_image = i["image"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "img" in i and i["img"] is not None: + instance_image = i["img"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "image_base64_str" in i: + try: + instance_image = Image.open(BytesIO(b64decode(i["image_base64_str"][0]))) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + else: + raise ValueError('no img path in samples') + if 1 in instance_image.size: + continue + + if 'caption' in i: + answer = i['caption'] + ' ' + self.eos_token + prompts = self.prompts['en'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'caption_zh' in i: + answer = i['caption_zh'] + ' ' + self.eos_token + prompts = self.prompts['zh'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'text' in i: + answer = i['text'][0]['answer'] + ' ' + self.eos_token + prompt = i['text'][0]['question'] + elif 'instruction' in i: + answer = i['outputs'] + ' ' + self.eos_token + # if random.random() <=0.15: + prompt = i['instruction'] + i['inputs'] + # elif 'caption_zh' in i: + # caption = i['caption_zh'] + elif 'question' in i: + answer = i['answer']+ ' ' + self.eos_token + prompt = i['question'] + + images.append(instance_image) + grounding_pixel_values.append(self.grounding_transforms(instance_image, None)[0]) + ram_pixel_values.append(self.ram_transforms(instance_image)) + questions.append(prompt) + answers.append(answer) + + 
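The splice-and-mask step below is easiest to see on toy tensors; this sketch uses made-up token ids (pad id 0, BOS 1, EOS 2) and mirrors what concat_text_input_output and the label masking produce:

import torch
# question [11, 12] padded to length 3; answer [BOS=1, 21, 22, EOS=2]
input_ids, input_atts = torch.tensor([11, 12, 0]), torch.tensor([1, 1, 0])
output_ids = torch.tensor([1, 21, 22, 2])
n = input_atts.sum()  # 2 real question tokens
spliced = torch.cat([input_ids[:n], output_ids[1:], input_ids[n:]])
# spliced -> tensor([11, 12, 21, 22, 2, 0]): answer (minus its leading BOS) spliced in before the question padding
labels = spliced.masked_fill(spliced == 0, -100)
labels[:n] = -100
# labels  -> tensor([-100, -100, 21, 22, 2, -100]): the LM loss is computed only on the answer tokens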
self.processor.tokenizer.truncation_side = "left" + # print(questions) + # + text_input_tokens = self.processor(text=questions, padding="longest", truncation=True, max_length=self.max_txt_len, return_tensors="pt") + # print('text_input_tokens:', text_input_tokens.input_ids) + + self.processor.tokenizer.truncation_side = 'right' + text_output_tokens = self.processor(text=answers, padding="longest", truncation=True, max_length=self.max_output_txt_len, return_tensors="pt") + # print('text_output_tokens:', text_output_tokens.input_ids) + + llm_tokens, input_part_targets_len = self.concat_text_input_output( + text_input_tokens.input_ids, + text_input_tokens.attention_mask, + text_output_tokens.input_ids, + text_output_tokens.attention_mask, + ) + + labels = llm_tokens['input_ids'].masked_fill( + llm_tokens['input_ids'] == self.processor.tokenizer.pad_token_id, -100 + ) + for i, l in enumerate(input_part_targets_len): + labels[i][:l] = -100 + + images_pixel_values = self.processor.image_processor(images=images, return_tensors="pt") + # images_pixel_values = torch.stack(images_pixel_values['pixel_values']) + model_inputs = { + "pixel_values":images_pixel_values['pixel_values'], + "grounding_pixel_values": grounding_pixel_values, + "ram_pixel_values": torch.stack(ram_pixel_values), + "input_ids": llm_tokens['input_ids'], + "attention_mask": llm_tokens['attention_mask'], + "qformer_input_ids": text_input_tokens.qformer_input_ids, + "qformer_attention_mask": text_input_tokens.qformer_attention_mask, + "labels": labels, + } + return model_inputs + + +class Lyrics(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Lyrics') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + parser.add_argument('--freeze_qformer', default=False, action='store_true') + parser.add_argument('--lora-r', type=int, default=8, + help='curvature.') + parser.add_argument('--inference_mode', type=bool, default=False, + help='The inference mode.') + parser.add_argument('--lora-alpha', type=int, default=32, + help='The initialization coefficient of lora-alpha.') + parser.add_argument('--lora-dropout', type=int, default=0.05, + help='The initialization coefficient of lora_dropout.') + parser.add_argument('--use-lora', action='store_true', help='LORA.') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + # self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.model = LyricsLMForConditionalGeneration.from_pretrained(args.model_path) + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + + self.model.box_threshold = 0.25 + self.model.text_threshold = 0.2 + self.model.iou_threshold = 0.6 + + if args.freeze_image_tower: + self.model.vision_model.eval() + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.ram.eval() + for param in self.model.ram.parameters(): + param.requires_grad = False + self.model.grounding_dino.eval() + for param in self.model.grounding_dino.parameters(): + param.requires_grad = False + # if args.freeze_qformer: + # self.model.qformer.eval() + # self.model.qformer.requires_grad_(False) + # self.model.query_tokens.requires_grad_(False) + # freeze lm + if args.use_lora: + # for param in self.model.parameters(): + # # freeze base model's layers + # param.requires_grad = False + peft_config = 
LoraConfig( + target_modules=r'.*language_model.*\.(q_proj|k_proj|v_proj)', + inference_mode=args.inference_mode, + r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout + ) + self.model = get_peft_model(self.model, peft_config) + # self.model.base_model.model.qformer.train() + # self.model.base_model.model.qformer.requires_grad_(True) + # self.model.base_model.model.query_tokens.requires_grad_(True) + # self.model.base_model.model.language_projection.train() + # self.model.base_model.model.language_projection.requires_grad_(True) + self.model.print_trainable_parameters() + elif args.freeze_qformer: + self.model.language_model.eval() + self.model.language_model.requires_grad_(False) + self.model.qformer.eval() + self.model.qformer.requires_grad_(False) + self.model.query_tokens.requires_grad_(False) + else: + self.model.language_model.eval() + self.model.language_model.requires_grad_(False) + # for name, param in self.model.named_parameters(): + # if param.requires_grad == True: + # print(name) + + + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + self.steps_per_epoch = self.total_steps // self.trainer.max_epochs + print('Total steps: {}'.format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def detokenize(self, token_ids): + toks = self.processor.tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.tokenizer.convert_tokens_to_string(toks) + + def qformer_detokenize(self, token_ids): + toks = self.processor.qformer_tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.qformer_tokenizer.convert_tokens_to_string(toks) + + def training_step(self, batch): + if self.trainer.global_rank == 0: + global SHOW_DATA + if self.trainer.global_step % 1000 == 0: + SHOW_DATA = True + print(f"input_ids: {batch['input_ids'][0]}") + print(f"input: {self.detokenize(batch['input_ids'][0])}") + print(f"labels_id: {batch['labels'][0]}") + print(f"qformer_input_ids: {batch['qformer_input_ids'][0]}") + print(f"qformer_input: {self.qformer_detokenize(batch['qformer_input_ids'][0])}") + + output = self.model(**batch) + return output.loss + + def validation_step(self, batch, batch_idx): + raise NotImplementedError("validation is not implemented") + + def on_load_checkpoint(self, checkpoint) -> None: + # Compatibility with older lightning versions, which reset the step count to 0 when resuming from a ckpt + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # Also export the weights in HuggingFace format when saving a checkpoint + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = Lyrics.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = 
UniversalCheckpoint(args) + + # wandb_logger = WandbLogger(project="Lyrics") # 初始化个WandbLogger对象 + trainer = Trainer.from_argparse_args(args, + # logger=wandb_logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = Lyrics(args) + collate_fn = Collator(args) + datasets = load_data(args, global_rank=trainer.global_rank) + # print(datasets) + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + # trainer.fit(model, datamoule) + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/models/Lyrics/backbone/__init__.py b/fengshen/models/Lyrics/backbone/__init__.py new file mode 100644 index 0000000..c0de392 --- /dev/null +++ b/fengshen/models/Lyrics/backbone/__init__.py @@ -0,0 +1,3 @@ +from fengshen.models.groundedblip.backbone.backbone import Joiner +from fengshen.models.groundedblip.backbone.swin_transformer import SwinTransformer +from fengshen.models.groundedblip.backbone.position_encoding import PositionEmbeddingSineHW \ No newline at end of file diff --git a/fengshen/models/Lyrics/backbone/backbone.py b/fengshen/models/Lyrics/backbone/backbone.py new file mode 100644 index 0000000..2e82c34 --- /dev/null +++ b/fengshen/models/Lyrics/backbone/backbone.py @@ -0,0 +1,53 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +""" +Backbone modules. 
+""" + +from typing import Dict, List + +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from fengshen.models.groundedblip.groundingdino.utils import NestedTensor, clean_state_dict, is_main_process + +from fengshen.models.groundedblip.backbone.swin_transformer import SwinTransformer +from fengshen.models.groundedblip.backbone.position_encoding import PositionEmbeddingSineHW + +class Joiner(nn.Module): + def __init__(self, args): + super().__init__() + self.swintransformer = SwinTransformer(args) + self.position_embedding = PositionEmbeddingSineHW(args.hidden_dim, + temperatureh=args.pe_temperatureh, + temperaturew=args.pe_temperaturew, + normalize=True, + ) + bb_num_channels = self.swintransformer.num_features[4 - len(tuple(args.return_interm_indices)) :] + self.num_channels = bb_num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self.swintransformer(tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + # position encoding + pos.append(self.position_embedding(x).to(x.tensors.dtype)) + + return out, pos diff --git a/fengshen/models/Lyrics/backbone/position_encoding.py b/fengshen/models/Lyrics/backbone/position_encoding.py new file mode 100644 index 0000000..20407fa --- /dev/null +++ b/fengshen/models/Lyrics/backbone/position_encoding.py @@ -0,0 +1,119 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +""" +Various positional encodings for the transformer. +""" +import math + +import torch +from torch import nn + +from fengshen.models.groundedblip.groundingdino.utils import NestedTensor + +class PositionEmbeddingSineHW(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats=64, temperatureh=10000, temperaturew=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats // 2 + self.temperatureh = temperatureh + self.temperaturew = temperaturew + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + + # import ipdb; ipdb.set_trace() + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_tx = self.temperaturew ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_x = x_embed[:, :, :, None] / dim_tx + + dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_ty = self.temperatureh ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + # import ipdb; ipdb.set_trace() + + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. + """ + + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) + return pos diff --git a/fengshen/models/Lyrics/backbone/swin_transformer.py b/fengshen/models/Lyrics/backbone/swin_transformer.py new file mode 100644 index 0000000..bfa6316 --- /dev/null +++ b/fengshen/models/Lyrics/backbone/swin_transformer.py @@ -0,0 +1,788 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# -------------------------------------------------------- +# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py +# -------------------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from fengshen.models.groundedblip.groundingdino.utils import NestedTensor + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + if v.dtype == torch.half: + attn = self.attn_drop(attn).half() + # attn = self.attn_drop(attn) + elif v.dtype == torch.bfloat16: + attn = torch.tensor(attn, dtype=torch.bfloat16) + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. 
Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). 
+ H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + dilation (bool): if True, the output size if 16x downsample, ow 32x downsample. + """ + + def __init__( + self, + args, + ): + super().__init__() + + self.pretrain_img_size = args.pretrain_img_size + self.num_layers = args.num_layers + self.embed_dim = args.embed_dim + self.ape = args.ape + self.patch_norm = args.patch_norm + self.out_indices = args.out_indices + self.frozen_stages = args.frozen_stages + self.dilation = args.dilation + + # if use_checkpoint: + # print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!") + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=args.patch_size, + in_chans=args.in_chans, + embed_dim=args.embed_dim, + norm_layer=nn.LayerNorm if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, args.embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=args.drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, args.drop_path_rate, sum(args.depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + # prepare downsample list + downsamplelist = [PatchMerging for i in range(self.num_layers)] + downsamplelist[-1] = None + num_features = [int(args.embed_dim * 2**i) for i in range(self.num_layers)] + if self.dilation: + downsamplelist[-2] = None + num_features[-1] = int(args.embed_dim * 2 ** (self.num_layers - 1)) // 2 + for i_layer in range(self.num_layers): + layer = BasicLayer( + # dim=int(embed_dim * 2 ** i_layer), + dim=num_features[i_layer], + depth=args.depths[i_layer], + num_heads=args.num_heads[i_layer], + window_size=args.window_size, + mlp_ratio=args.mlp_ratio, + qkv_bias=args.qkv_bias, + qk_scale=args.qk_scale, + drop=args.drop_rate, + attn_drop=args.attn_drop_rate, + drop_path=dpr[sum(args.depths[:i_layer]) : sum(args.depths[: i_layer + 1])], + norm_layer=nn.LayerNorm, + # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + downsample=downsamplelist[i_layer], + use_checkpoint=args.swintransformer_use_checkpoint, + ) + self.layers.append(layer) + + # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in tuple(args.out_indices): + layer = nn.LayerNorm(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if 
self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + # def init_weights(self, pretrained=None): + # """Initialize the weights in backbone. + # Args: + # pretrained (str, optional): Path to pre-trained weights. + # Defaults to None. + # """ + + # def _init_weights(m): + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + + # if isinstance(pretrained, str): + # self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + # elif pretrained is None: + # self.apply(_init_weights) + # else: + # raise TypeError('pretrained must be a str or None') + + def forward_raw(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + # import ipdb; ipdb.set_trace() + + if i in tuple(self.out_indices): + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # outs: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + return tuple(outs) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in tuple(self.out_indices): + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # print(out.shape) + # in: + # torch.Size([2, 3, 1024, 1024]) + # out: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + + # collect for nesttensors + outs_dict = {} + for idx, out_i in enumerate(outs): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] + outs_dict[idx] = NestedTensor(out_i, mask) + # print(out_i.shape) + + return outs_dict + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + 
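A note on the refactor above: `SwinTransformer.__init__` now consumes a single `args`/config namespace, while `build_swin_transformer` below still calls the original keyword-style constructor, so the two are not currently interchangeable. The minimal sketch below (not part of the diff) shows one way the args-driven class could be driven from the `LyricsDetectionConfig` added later in this PR, which carries every field the constructor reads; the import path is an assumption and may not match the final package layout.

```python
# Sketch only, under the assumptions stated above.
import torch
from fengshen.models.lyrics.configuration_lyrics import LyricsDetectionConfig  # assumed path

cfg = LyricsDetectionConfig()      # swin-T style defaults: embed_dim=96, depths=[2, 2, 6, 2]
backbone = SwinTransformer(cfg)    # reads cfg.embed_dim, cfg.depths, cfg.out_indices, ...

feats = backbone.forward_raw(torch.rand(2, 3, 1024, 1024))
for f in feats:
    print(f.shape)                 # one (B, C, H, W) feature map per entry in cfg.out_indices
```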
+def build_swin_transformer(modelname, pretrain_img_size, **kw): + assert modelname in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ] + + model_para_dict = { + "swin_T_224_1k": dict( + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7 + ), + "swin_B_224_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7 + ), + "swin_B_384_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12 + ), + "swin_L_224_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7 + ), + "swin_L_384_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12 + ), + } + kw_cgf = model_para_dict[modelname] + kw_cgf.update(kw) + model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf) + return model + + +if __name__ == "__main__": + model = build_swin_transformer("swin_L_384_22k", 384, dilation=True) + x = torch.rand(2, 3, 1024, 1024) + y = model.forward_raw(x) + import ipdb + + ipdb.set_trace() + x = torch.rand(2, 3, 384, 384) + y = model.forward_raw(x) diff --git a/fengshen/models/Lyrics/configuration_lyrics.py b/fengshen/models/Lyrics/configuration_lyrics.py new file mode 100644 index 0000000..c4764de --- /dev/null +++ b/fengshen/models/Lyrics/configuration_lyrics.py @@ -0,0 +1,616 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BLIP-2 model configuration""" + +import copy +import os +from typing import Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from transformers.utils import logging +from transformers.models.auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +# BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP = { +# "salesforce/blip2-opt-2.7b": "https://huggingface.co/salesforce/blip2-opt-2.7b/resolve/main/config.json", +# } + + +class LyricsVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a + BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + + Example: + + ```python + >>> from transformers import Blip2VisionConfig, Blip2VisionModel + + >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2VisionConfig() + + >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + attention_dropout=0.0, + initializer_range=1e-10, + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + + return cls.from_dict(config_dict, **kwargs) + +class LyricsDetectionConfig(PretrainedConfig): + + model_type = "blip_2_detection_model" + + def __init__( + self, + backbone = "swin_T_224_1k", + position_embedding = "sine", + pe_temperatureh = 20, + pe_temperaturew = 20, + return_interm_indices = [1, 2, 3], + backbone_freeze_keywords = None, + enc_layers = 6, + num_unicoder_layers = 0, + dec_layers = 6, + pre_norm = False, + dim_feedforward = 2048, + hidden_dim = 256, + dropout = 0.0, + nheads = 8, + num_queries = 900, + aux_loss = True, + iter_update = True, + dn_number = 0, + query_dim = 4, + num_patterns = 0, + num_feature_levels = 4, + enc_n_points = 4, + dec_n_points = 4, + learnable_tgt_init = True, + two_stage_type = "standard", + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, + transformer_activation = "relu", + return_intermediate_dec = True, + dec_pred_bbox_embed_share = True, + dn_box_noise_scale = 1.0, + dn_label_noise_ratio = 0.5, + dn_label_coef = 1.0, + dn_bbox_coef = 1.0, + embed_init_tgt = True, + dn_labelbook_size = 2000, + max_text_len = 256, + text_encoder_type = "bert-base-uncased", + use_checkpoint = True, + use_transformer_ckpt = True, + use_text_cross_attention = True, + text_dropout = 0.0, + fusion_dropout = 0.0, + fusion_droppath = 0.1, + sub_sentence_present = True, + pretrain_img_size = 224, + patch_size = 4, + in_chans = 3, + num_layers = 4, + embed_dim = 96, + depths = [2, 2, 6, 2], + num_heads = [3, 6, 12, 24], + window_size = 7, + mlp_ratio = 4.0, + qkv_bias = True, + qk_scale = None, + drop_rate = 0.0, + attn_drop_rate = 0.0, + drop_path_rate = 0.2, + swintransformer_use_checkpoint = False, + ape = False, + patch_norm = True, + out_indices = [1, 2, 3], + frozen_stages = -1, + dilation = True, + **kwargs, + ): + super().__init__(**kwargs) + + self.backbone = backbone + self.position_embedding = position_embedding + self.pe_temperatureh = pe_temperatureh + self.pe_temperaturew = pe_temperaturew + self.return_interm_indices = return_interm_indices + self.backbone_freeze_keywords = backbone_freeze_keywords + self.enc_layers = enc_layers + self.num_unicoder_layers = num_unicoder_layers + self.dec_layers = dec_layers + self.pre_norm = pre_norm + self.dim_feedforward = dim_feedforward + self.hidden_dim = hidden_dim + self.dropout = dropout + self.nheads = nheads + self.num_queries = num_queries + self.aux_loss = aux_loss + self.iter_update = iter_update + self.dn_number = dn_number + self.query_dim = query_dim + self.num_patterns = num_patterns + self.num_feature_levels = num_feature_levels + self.enc_n_points = enc_n_points + self.dec_n_points = dec_n_points + self.learnable_tgt_init = learnable_tgt_init + self.two_stage_type = two_stage_type + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + self.two_stage_class_embed_share = two_stage_class_embed_share + self.transformer_activation = transformer_activation + self.return_intermediate_dec = return_intermediate_dec + self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share + self.dn_box_noise_scale = dn_box_noise_scale + self.dn_label_noise_ratio = dn_label_noise_ratio + self.dn_label_coef = dn_label_coef + self.dn_bbox_coef = dn_bbox_coef + self.embed_init_tgt = embed_init_tgt + self.dn_labelbook_size = dn_labelbook_size + self.max_text_len = max_text_len + self.text_encoder_type = text_encoder_type + self.use_checkpoint = use_checkpoint + self.use_transformer_ckpt = use_transformer_ckpt + self.use_text_cross_attention = use_text_cross_attention + 
self.text_dropout = text_dropout + self.fusion_dropout = fusion_dropout + self.fusion_droppath = fusion_droppath + self.sub_sentence_present = sub_sentence_present + self.pretrain_img_size = pretrain_img_size + self.num_layers = num_layers + self.patch_size = patch_size + self.in_chans = in_chans + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.swintransformer_use_checkpoint = swintransformer_use_checkpoint + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.dilation = dilation + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["detection_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + +class LyricsRAMConfig(PretrainedConfig): + + model_type = "blip_2_ram_model" + + def __init__( + self, + med_config='/med_config.json', + image_size=384, + window_size=12, + vit='swin_l', + vit_grad_ckpt=False, + vit_ckpt_layer=0, + prompt='a picture of ', + threshold=0.68, + delete_tag_index=[], + tag_list='/ram_tag_list.txt', + tag_list_chinese='/ram_tag_list_chinese.txt', + vision_width=1536, + image_res=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + **kwargs, + ): + super().__init__(**kwargs) + + self.med_config = med_config + self.image_size = image_size + self.window_size = window_size + self.vit = vit + self.vit_grad_ckpt = vit_grad_ckpt + self.vit_ckpt_layer = vit_ckpt_layer + self.prompt = prompt + self.threshold = threshold + self.delete_tag_index = delete_tag_index + self.tag_list = tag_list + self.tag_list_chinese = tag_list_chinese + self.vision_width = vision_width + self.image_res = image_res + self.embed_dim =embed_dim + self.depths = depths + self.num_heads = num_heads + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["ram_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + + return cls.from_dict(config_dict, **kwargs) + + +class LyricsQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a + BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
+ + Examples: + + ```python + >>> from transformers import Blip2QFormerConfig, Blip2QFormerModel + + >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2QFormerConfig() + + >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "blip_2_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + cross_attention_frequency=2, + encoder_hidden_size=1408, + detection_encoder_hidden_size=256, + query_length=96, + num_vit_query_tokens=32, + num_dino_query_tokens=64, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + self.detection_encoder_hidden_size = detection_encoder_hidden_size + self.query_length = query_length + self.num_vit_query_tokens = num_vit_query_tokens + self.num_dino_query_tokens = num_dino_query_tokens + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class LyricsConfig(PretrainedConfig): + r""" + [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is + used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import ( + ... Blip2VisionConfig, + ... Blip2QFormerConfig, + ... OPTConfig, + ... Blip2Config, + ... Blip2ForConditionalGeneration, + ... ) + + >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2Config() + + >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2ForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig + + >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations + >>> vision_config = Blip2VisionConfig() + >>> qformer_config = Blip2QFormerConfig() + >>> text_config = OPTConfig() + + >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "blip-2" + is_composition = True + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, detection_config=None, ram_config=None, num_query_tokens=96, image_text_hidden_size=256,**kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the LyricsVisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the LyricsQFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + + if detection_config is None: + detection_config = {} + logger.info("detection_config is None. initializing the LyricsDetectionConfig with default values.") + + if ram_config is None: + ram_config = {} + logger.info("ram_config is None. 
Initializing the LyricsRAMConfig with default values.") + + self.vision_config = LyricsVisionConfig(**vision_config) + self.qformer_config = LyricsQFormerConfig(**qformer_config) + self.detection_config = LyricsDetectionConfig(**detection_config) + self.ram_config = LyricsRAMConfig(**ram_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" + self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + + self.tie_word_embeddings = self.text_config.tie_word_embeddings + self.is_encoder_decoder = self.text_config.is_encoder_decoder + + self.num_query_tokens = num_query_tokens + self.image_text_hidden_size = image_text_hidden_size + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: LyricsVisionConfig, + qformer_config: LyricsQFormerConfig, + text_config: PretrainedConfig, + detection_config: LyricsDetectionConfig, + ram_config: LyricsRAMConfig, + **kwargs, + ): + r""" + Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model + configurations. + + Returns: + [`Blip2Config`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + detection_config=detection_config.to_dict(), + ram_config=ram_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["detection_config"] = self.detection_config.to_dict() + output["ram_config"] = self.ram_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/fengshen/models/Lyrics/groundingdino/bertwarper.py b/fengshen/models/Lyrics/groundingdino/bertwarper.py new file mode 100644 index 0000000..f0cf977 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/bertwarper.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn +from torchvision.ops.boxes import nms +from transformers import BertConfig, BertModel, BertPreTrainedModel +from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions + + +class BertModelWarper(nn.Module): + def __init__(self, bert_model): + super().__init__() + # self.bert = bert_modelc + + self.config = bert_model.config + self.embeddings = bert_model.embeddings + self.encoder = bert_model.encoder + self.pooler = bert_model.pooler + + self.get_extended_attention_mask = bert_model.get_extended_attention_mask + self.invert_attention_mask = bert_model.invert_attention_mask + self.get_head_mask = bert_model.get_head_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + 
return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class TextEncoderShell(nn.Module): + def __init__(self, text_encoder): + super().__init__() + self.text_encoder = text_encoder + self.config = self.text_encoder.config + + def forward(self, **kw): + # feed into text encoder + return self.text_encoder(**kw) + + +def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long) + + +def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + cate_to_token_mask_list = [ + torch.stack(cate_to_token_mask_listi, dim=0) + for cate_to_token_mask_listi in cate_to_token_mask_list + ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn.h b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn.h new file mode 100644 index 0000000..c7408eb --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn.h @@ -0,0 +1,64 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once + +#include "ms_deform_attn_cpu.h" + +#ifdef WITH_CUDA +#include "ms_deform_attn_cuda.h" +#endif + +namespace groundingdino { + +at::Tensor +ms_deform_attn_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_forward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +ms_deform_attn_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_backward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp new file mode 100644 index 0000000..551243f --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp @@ -0,0 +1,43 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include + +#include +#include + +namespace groundingdino { + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} + +} // namespace groundingdino diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.h b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.h new file mode 100644 index 0000000..b2b88e8 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.h @@ -0,0 +1,35 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +namespace groundingdino { + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + +} // namespace groundingdino diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.cu b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.cu new file mode 100644 index 0000000..d04fae8 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.cu @@ -0,0 +1,156 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include +#include "ms_deform_im2col_cuda.cuh" + +#include +#include +#include +#include + +namespace groundingdino { + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + columns.data()); + + })); + } + + output = output.view({batch, num_query, num_heads*channels}); + + return output; +} + + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + 
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto grad_value = at::zeros_like(value); + auto grad_sampling_loc = at::zeros_like(sampling_loc); + auto grad_attn_weight = at::zeros_like(attn_weight); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), + grad_output_g.data(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + grad_value.data() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); + + })); + } + + return { + grad_value, grad_sampling_loc, grad_attn_weight + }; +} + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.h b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.h new file mode 100644 index 0000000..ad1311a --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.h @@ -0,0 +1,33 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +namespace groundingdino { + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_im2col_cuda.cuh b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_im2col_cuda.cuh new file mode 100644 index 0000000..6bc2acb --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_im2col_cuda.cuh @@ -0,0 +1,1327 @@ +/*! +************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************** +* Modified from DCN (https://github.com/msracver/Deformable-ConvNets) +* Copyright (c) 2018 Microsoft +************************************************************************** +*/ + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) +{ + return (N + num_threads - 1) / num_threads; +} + + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= 
width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; 
+ const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + + +template +__global__ void ms_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + col += 
ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockSize; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr 
+= 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockSize/2; s>0; s>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + 
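+        // Reduction strategy: the *_reduce_v1 kernels let thread 0 sum all blockSize
+        // per-thread partials serially, while this *_reduce_v2 kernel folds them with a
+        // pairwise (tree) reduction in log2(blockSize) steps. Both fixed-size variants
+        // assume blockDim.x == blockSize, i.e. one thread per channel of the same
+        // (batch, query, head) sample, which is why thread 0 can store the reduced
+        // gradients directly instead of using atomicAdd. Following the upstream
+        // Deformable-DETR kernels, the host launcher below presumably selects v1 for
+        // channel counts 1-32, v2 for 64-1024, the extern-shared-memory kernels for the
+        // remaining sizes below 1024, and the multi-block or global-memory atomic
+        // kernels once channels exceed 1024.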
data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + 
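+        // each sampling point owns one scalar attention weight and one (x, y) location,
+        // so the output-gradient pointers advance by grad_weight_stride (1) and
+        // grad_loc_stride (2) per point.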
grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 
+ 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + 
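+          // (x, y) partial sums are interleaved in shared memory, hence the doubled
+          // indices xid1 / xid2; the extra fold guarded by `tid + (s << 1) < spre`
+          // just below picks up the leftover element when blockDim.x is not a power of two.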
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, + const scalar_t* data_value, + const int64_t* data_spatial_shapes, + const int64_t* 
data_level_start_index, + const scalar_t* data_sampling_loc, + const scalar_t* data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* data_col) +{ + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +template +void ms_deformable_col2im_cuda(cudaStream_t stream, + const scalar_t* grad_col, + const scalar_t* data_value, + const int64_t * data_spatial_shapes, + const int64_t * data_level_start_index, + const scalar_t * data_sampling_loc, + const scalar_t * data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) + { + if ((channels & 1023) == 0) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_gm + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + else{ + switch(channels) + { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + 
grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + default: + if (channels < 64) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + 
data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/cuda_version.cu b/fengshen/models/Lyrics/groundingdino/csrc/cuda_version.cu new file mode 100644 index 0000000..64569e3 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/cuda_version.cu @@ -0,0 +1,7 @@ +#include + +namespace groundingdino { +int get_cudart_version() { + return CUDART_VERSION; +} +} // namespace groundingdino diff --git a/fengshen/models/Lyrics/groundingdino/csrc/vision.cpp b/fengshen/models/Lyrics/groundingdino/csrc/vision.cpp new file mode 100644 index 0000000..c1f2c50 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/vision.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +#include "MsDeformAttn/ms_deform_attn.h" + +namespace groundingdino { + +#ifdef WITH_CUDA +extern int get_cudart_version(); +#endif + +std::string get_cuda_version() { +#ifdef WITH_CUDA + std::ostringstream oss; + + // copied from + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 + auto printCudaStyleVersion = [&](int v) { + oss << (v / 1000) << "." << (v / 10 % 100); + if (v % 10 != 0) { + oss << "." << (v % 10); + } + }; + printCudaStyleVersion(get_cudart_version()); + return oss.str(); +#else + return std::string("not available"); +#endif +} + +// similar to +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp +std::string get_compiler_version() { + std::ostringstream ss; +#if defined(__GNUC__) +#ifndef __clang__ + { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } +#endif +#endif + +#if defined(__clang_major__) + { + ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." + << __clang_patchlevel__; + } +#endif + +#if defined(_MSC_VER) + { ss << "MSVC " << _MSC_FULL_VER; } +#endif + return ss.str(); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); + m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); +} + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/fuse_modules.py b/fengshen/models/Lyrics/groundingdino/fuse_modules.py new file mode 100644 index 0000000..cbafee1 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/fuse_modules.py @@ -0,0 +1,297 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath + + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
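+
+    Illustrative usage (the shapes here are assumptions, not part of the original docs):
+
+        resizer = FeatureResizer(input_feat_size=768, output_feat_size=256, dropout=0.1)
+        text_feats = torch.randn(2, 195, 768)   # e.g. BERT last_hidden_state
+        resized = resizer(text_feats)           # -> (2, 195, 256)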
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def l1norm(X, dim, eps=1e-8): + """L1-normalize columns of X""" + norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps + X = torch.div(X, norm) + return X + + +def l2norm(X, dim, eps=1e-8): + """L2-normalize columns of X""" + norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps + X = torch.div(X, norm) + return X + + +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): + """ + query: (n_context, queryL, d) + context: (n_context, sourceL, d) + """ + batch_size_q, queryL = query.size(0), query.size(1) + batch_size, sourceL = context.size(0), context.size(1) + + # Get attention + # --> (batch, d, queryL) + queryT = torch.transpose(query, 1, 2) + + # (batch, sourceL, d)(batch, d, queryL) + # --> (batch, sourceL, queryL) + attn = torch.bmm(context, queryT) + if raw_feature_norm == "softmax": + # --> (batch*sourceL, queryL) + attn = attn.view(batch_size * sourceL, queryL) + attn = nn.Softmax()(attn) + # --> (batch, sourceL, queryL) + attn = attn.view(batch_size, sourceL, queryL) + elif raw_feature_norm == "l2norm": + attn = l2norm(attn, 2) + elif raw_feature_norm == "clipped_l2norm": + attn = nn.LeakyReLU(0.1)(attn) + attn = l2norm(attn, 2) + else: + raise ValueError("unknown first norm type:", raw_feature_norm) + # --> (batch, queryL, sourceL) + attn = torch.transpose(attn, 1, 2).contiguous() + # --> (batch*queryL, sourceL) + attn = attn.view(batch_size * queryL, sourceL) + attn = nn.Softmax()(attn * smooth) + # --> (batch, queryL, sourceL) + attn = attn.view(batch_size, queryL, sourceL) + # --> (batch, sourceL, queryL) + attnT = torch.transpose(attn, 1, 2).contiguous() + + # --> (batch, d, sourceL) + contextT = torch.transpose(context, 1, 2) + # (batch x d x sourceL)(batch x sourceL x queryL) + # --> (batch, d, queryL) + weightedContext = torch.bmm(contextT, attnT) + # --> (batch, queryL, d) + weightedContext = torch.transpose(weightedContext, 1, 2) + + return weightedContext, attnT + + +class BiMultiHeadAttention(nn.Module): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.l_proj = nn.Linear(self.l_dim, self.embed_dim) + self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) + + self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) + self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.v_proj.weight) + self.v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.l_proj.weight) + self.l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_v_proj.weight) + self.values_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_l_proj.weight) + self.values_l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_v_proj.weight) + self.out_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_l_proj.weight) + self.out_l_proj.bias.data.fill_(0) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + """_summary_ + + Args: + v (_type_): bs, n_img, dim + l (_type_): bs, n_text, dim + attention_mask_v (_type_, optional): _description_. bs, n_img + attention_mask_l (_type_, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = v.size() + + query_states = self.v_proj(v) * self.scale + key_states = self._shape(self.l_proj(l), -1, bsz) + value_v_states = self._shape(self.values_v_proj(v), -1, bsz) + value_l_states = self._shape(self.values_l_proj(l), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_v_states = value_v_states.view(*proj_shape) + value_l_states = value_l_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + attn_weights_l = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + if self.clamp_min_for_underflow: + attn_weights_l = torch.clamp( + attn_weights_l, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights_l = torch.clamp( + attn_weights_l, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if attention_mask_v is not None: + attention_mask_v = ( + attention_mask_v[:, None, None, 
:].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights_l.masked_fill_(attention_mask_v, float("-inf")) + + attn_weights_l = attn_weights_l.softmax(dim=-1) + + # mask language for vision + if attention_mask_l is not None: + attention_mask_l = ( + attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(attention_mask_l, float("-inf")) + attn_weights_v = attn_weights.softmax(dim=-1) + + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = torch.bmm(attn_probs_v, value_l_states) + attn_output_l = torch.bmm(attn_probs_l, value_v_states) + + if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" + ) + + if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" + ) + + attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output_v = attn_output_v.transpose(1, 2) + attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) + + attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) + attn_output_l = attn_output_l.transpose(1, 2) + attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +# Bi-Direction MHA (text->image, image->text) +class BiAttentionBlock(nn.Module): + def __init__( + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super(BiAttentionBlock, self).__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.temp_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) + self.temp_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + v = self.layer_norm_v(v) + l = self.layer_norm_l(l) + delta_v, delta_l = self.attn( + v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l + ) + # v, l = v + delta_v, l + delta_l + v = v + self.drop_path(self.temp_v * delta_v) + l = l + self.drop_path(self.temp_l * delta_l) + return v, l + + # def forward(self, v:List[torch.Tensor], l, attention_mask_v=None, attention_mask_l=None) diff --git a/fengshen/models/Lyrics/groundingdino/modeling_groundingdino.py b/fengshen/models/Lyrics/groundingdino/modeling_groundingdino.py new file mode 100644 index 0000000..f9af220 --- /dev/null +++ 
b/fengshen/models/Lyrics/groundingdino/modeling_groundingdino.py @@ -0,0 +1,338 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR model and criterion classes. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# ------------------------------------------------------------------------ +import copy +from typing import List + +import torch +import torch.nn.functional as F +from torch import nn +from transformers import AutoTokenizer, BertModel, BertConfig +from fengshen.models.groundedblip.backbone import Joiner, SwinTransformer +from fengshen.models.groundedblip.groundingdino.bertwarper import ( + BertModelWarper, + generate_masks_with_special_tokens_and_transfer_map, +) +from fengshen.models.groundedblip.groundingdino.transformer import Transformer +from fengshen.models.groundedblip.groundingdino.utils import ( + MLP, + ContrastiveEmbed, + NestedTensor, + inverse_sigmoid, + nested_tensor_from_tensor_list, +) + + + +class GroundingDINO(nn.Module): + """This is the Cross-Attention Detector module that performs object detection""" + + def __init__( + self, + args, + # max_text_len=256, + ): + """Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + Conditional DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
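+            Note: in this port the options above are not passed as separate constructor
+            arguments; everything is read from (or built from) a single `args` namespace,
+            e.g. `args.num_queries`, `args.num_feature_levels`, `args.aux_loss`.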
+ """ + super().__init__() + self.num_queries = args.num_queries + self.transformer = Transformer(args) + self.hidden_dim = hidden_dim = self.transformer.d_model + self.num_feature_levels = args.num_feature_levels + self.nheads = args.nheads + self.max_text_len = 256 + self.sub_sentence_present = args.sub_sentence_present + + # setting query dim + self.query_dim = args.query_dim + assert args.query_dim == 4 + + # for dn training + self.num_patterns = args.num_patterns + self.dn_number = args.dn_number + self.dn_box_noise_scale = args.dn_box_noise_scale + self.dn_label_noise_ratio = args.dn_label_noise_ratio + self.dn_labelbook_size = args.dn_labelbook_size + + # bert + self.bert_config = BertConfig.from_pretrained(args.text_encoder_type) + self.tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_type) + self.bert = BertModel(self.bert_config) + self.bert.pooler.dense.weight.requires_grad_(False) + self.bert.pooler.dense.bias.requires_grad_(False) + self.bert = BertModelWarper(bert_model=self.bert) + + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias=True) + nn.init.constant_(self.feat_map.bias.data, 0) + nn.init.xavier_uniform_(self.feat_map.weight.data) + # freeze + + # special tokens + self.specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + + self.backbone = Joiner(args) + + # prepare input projection layers + if self.num_feature_levels > 1: + num_backbone_outs = len(self.backbone.num_channels) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = self.backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + for _ in range(self.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + assert args.two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(self.backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) + + self.aux_loss = args.aux_loss + self.box_pred_damping = box_pred_damping = None + + self.iter_update = args.iter_update + assert args.iter_update, "Why not iter_update?" 
+ + # prepare pred layers + self.dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share + # prepare class & box embed + _class_embed = ContrastiveEmbed() + + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) + + if args.dec_pred_bbox_embed_share: + box_embed_layerlist = [_bbox_embed for i in range(self.transformer.num_decoder_layers)] + else: + box_embed_layerlist = [ + copy.deepcopy(_bbox_embed) for i in range(self.transformer.num_decoder_layers) + ] + class_embed_layerlist = [_class_embed for i in range(self.transformer.num_decoder_layers)] + self.bbox_embed = nn.ModuleList(box_embed_layerlist) + self.class_embed = nn.ModuleList(class_embed_layerlist) + self.transformer.decoder.bbox_embed = self.bbox_embed + self.transformer.decoder.class_embed = self.class_embed + + # two stage + self.two_stage_type = args.two_stage_type + assert args.two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + args.two_stage_type + ) + if args.two_stage_type != "no": + if args.two_stage_bbox_embed_share: + assert args.dec_pred_bbox_embed_share + self.transformer.enc_out_bbox_embed = _bbox_embed + else: + self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed) + + if args.two_stage_class_embed_share: + assert args.dec_pred_bbox_embed_share + self.transformer.enc_out_class_embed = _class_embed + else: + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) + + self.refpoint_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + # init input_proj + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + def set_image_tensor(self, samples: NestedTensor): + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + self.features, self.poss = self.backbone(samples) + + def unset_image_tensor(self): + if hasattr(self, 'features'): + del self.features + if hasattr(self,'poss'): + del self.poss + + def set_image_features(self, features , poss): + self.features = features + self.poss = poss + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) + + def forward(self, samples: NestedTensor, targets: List = None, **kw): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, width, height). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. 
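+
+            Illustrative call (shapes are assumptions; as written the module moves the
+            tokenized text to "cuda", so a GPU is expected):
+
+                out = model(images, captions=["a cat . a dog ."] * batch_size)
+                out["pred_logits"]    # (batch_size, num_queries, max_text_len)
+                out["pred_boxes"]     # (batch_size, num_queries, 4), cxcywh in [0, 1]
+                out["hidden_state"]   # last decoder states concatenated with their reference boxes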
+ """ + if targets is None: + captions = kw["captions"] + else: + captions = [t["caption"] for t in targets] + + # encoder texts + tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to( + "cuda" + ) + ( + text_self_attention_masks, + position_ids, + cate_to_token_mask_list, + ) = generate_masks_with_special_tokens_and_transfer_map( + tokenized, self.specical_tokens, self.tokenizer + ) + + if text_self_attention_masks.shape[1] > self.max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + position_ids = position_ids[:, : self.max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : self.max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.max_text_len] + + # extract text embeddings + if self.sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + else: + # import ipdb; ipdb.set_trace() + tokenized_for_encoder = tokenized + + bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 + + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized.attention_mask.bool() # bs, 195 + # text_token_mask: True for nomask, False for mask + # text_self_attention_masks: True for nomask, False for mask + + if encoded_text.shape[1] > self.max_text_len: + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + + text_dict = { + "encoded_text": encoded_text, # bs, 195, d_model + "text_token_mask": text_token_mask, # bs, 195 + "position_ids": position_ids, # bs, 195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 + } + + # import ipdb; ipdb.set_trace() + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples).to("cuda") + features, poss = self.backbone(samples) + + srcs = [] + masks = [] + for l, feat in enumerate(features): + src, mask = feat.decompose() + srcs.append(self.input_proj[l](src)) + masks.append(mask) + assert mask is not None + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](features[-1].tensors) + else: + src = self.input_proj[l](srcs[-1]) + m = samples.mask + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(NestedTensor(src, mask)).to(src.dtype) + srcs.append(src) + masks.append(mask) + poss.append(pos_l) + + input_query_bbox = input_query_label = attn_mask = dn_meta = None + hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( + srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict + ) + + out = {'hidden_state':torch.cat([hs[-1],reference[-1]], dim=-1)} + + # deformable-detr-like anchor update + outputs_coord_list = [] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + 
inverse_sigmoid(layer_ref_sig)
+            layer_outputs_unsig = layer_outputs_unsig.sigmoid()
+            outputs_coord_list.append(layer_outputs_unsig)
+        outputs_coord_list = torch.stack(outputs_coord_list)
+
+        # output
+        outputs_class = torch.stack(
+            [
+                layer_cls_embed(layer_hs, text_dict)
+                for layer_cls_embed, layer_hs in zip(self.class_embed, hs)
+            ]
+        )
+        # out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]}
+        out['pred_logits'] = outputs_class[-1]
+        out['pred_boxes'] = outputs_coord_list[-1]
+
+        # out = torch.cat([hs[-1],reference[-1]], dim=-1)
+
+        return out
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [
+            {"pred_logits": a, "pred_boxes": b}
+            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
+        ]
diff --git a/fengshen/models/Lyrics/groundingdino/ms_deform_attn.py b/fengshen/models/Lyrics/groundingdino/ms_deform_attn.py
new file mode 100644
index 0000000..e87aa83
--- /dev/null
+++ b/fengshen/models/Lyrics/groundingdino/ms_deform_attn.py
@@ -0,0 +1,421 @@
+# ------------------------------------------------------------------------
+# Grounding DINO
+# url: https://github.com/IDEA-Research/GroundingDINO
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from:
+# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/functions/ms_deform_attn_func.py
+# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+# https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/multi_scale_deform_attn.py
+# ------------------------------------------------------------------------------------------------
+
+import math
+import warnings
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.init import constant_, xavier_uniform_
+
+try:
+    from lyrica import _C
+except Exception:  # the compiled extension is optional; fall back to the pure-PyTorch path
+    warnings.warn("Failed to load custom C++ ops. 
Running on CPU mode Only!") + + +# helpers +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +class MultiScaleDeformableAttnFunction(Function): + @staticmethod + def forward( + ctx, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + ctx.im2col_step = im2col_step + output = _C.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ctx.im2col_step, + ) + ctx.save_for_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = ctx.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = _C.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + ctx.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch( + value: torch.Tensor, + value_spatial_shapes: torch.Tensor, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, +) -> torch.Tensor: + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = ( + value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_) + ) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(bs, num_heads * embed_dims, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class MultiScaleDeformableAttention(nn.Module): + """Multi-Scale Deformable Attention Module used in Deformable-DETR + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dim (int): The embedding dimension of Attention. Default: 256. + num_heads (int): The number of attention heads. Default: 8. + num_levels (int): The number of feature map used in Attention. Default: 4. 
+ num_points (int): The number of sampling points for each query + in each head. Default: 4. + img2col_steps (int): The step used in image_to_column. Defualt: 64. + dropout (float): Dropout layer used in output. Default: 0.1. + batch_first (bool): if ``True``, then the input and output tensor will be + provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)` + """ + + def __init__( + self, + embed_dim: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points: int = 4, + img2col_step: int = 64, + batch_first: bool = False, + ): + super().__init__() + if embed_dim % num_heads != 0: + raise ValueError( + "embed_dim must be divisible by num_heads, but got {} and {}".format( + embed_dim, num_heads + ) + ) + head_dim = embed_dim // num_heads + + self.batch_first = batch_first + + if not _is_power_of_2(head_dim): + warnings.warn( + """ + You'd better set d_model in MSDeformAttn to make sure that + each dim of the attention head a power of 2, which is more efficient. + """ + ) + + self.im2col_step = img2col_step + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.sampling_offsets = nn.Linear(embed_dim, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dim, num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + + self.init_weights() + + def _reset_parameters(self): + return self.init_weights() + + def init_weights(self): + """ + Default initialization for Parameters of Module. + """ + constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * ( + 2.0 * math.pi / self.num_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.num_heads, 1, 1, 2) + .repeat(1, self.num_levels, self.num_points, 1) + ) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.0) + constant_(self.attention_weights.bias.data, 0.0) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.0) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.0) + + def freeze_sampling_offsets(self): + print("Freeze sampling offsets") + self.sampling_offsets.weight.requires_grad = False + self.sampling_offsets.bias.requires_grad = False + + def freeze_attention_weights(self): + print("Freeze attention weights") + self.attention_weights.weight.requires_grad = False + self.attention_weights.bias.requires_grad = False + + def forward( + self, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + value: Optional[torch.Tensor] = None, + query_pos: Optional[torch.Tensor] = None, + key_padding_mask: Optional[torch.Tensor] = None, + reference_points: Optional[torch.Tensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + level_start_index: Optional[torch.Tensor] = None, + **kwargs + ) -> torch.Tensor: + + """Forward Function of MultiScaleDeformableAttention + + Args: + query (torch.Tensor): Query embeddings with shape + `(num_query, bs, embed_dim)` + key (torch.Tensor): Key embeddings with shape + `(num_key, bs, embed_dim)` + value (torch.Tensor): Value embeddings with shape + `(num_key, bs, embed_dim)` + query_pos (torch.Tensor): The position 
embedding for `query`. Default: None.
+            key_padding_mask (torch.Tensor): ByteTensor for `query`, with shape `(bs, num_key)`,
+                indicating which elements within `key` to be ignored in attention.
+            reference_points (torch.Tensor): The normalized reference points
+                with shape `(bs, num_query, num_levels, 2)`,
+                all elements are in range [0, 1], top-left (0, 0),
+                bottom-right (1, 1), including the padding area;
+                or `(N, Length_{query}, num_levels, 4)`, with two additional
+                dimensions `(h, w)` to form reference boxes.
+            spatial_shapes (torch.Tensor): Spatial shape of features in different levels.
+                With shape `(num_levels, 2)`, last dimension represents `(h, w)`.
+            level_start_index (torch.Tensor): The start index of each level. A tensor with
+                shape `(num_levels, )` which can be represented as
+                `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`.
+
+        Returns:
+            torch.Tensor: forward results with shape `(num_query, bs, embed_dim)`
+        """
+
+        if value is None:
+            value = query
+
+        if query_pos is not None:
+            query = query + query_pos
+
+        if not self.batch_first:
+            # change to (bs, num_query, embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], float(0))
+        value = value.view(bs, num_value, self.num_heads, -1)
+        sampling_offsets = self.sampling_offsets(query).view(
+            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2
+        )
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query, self.num_heads, self.num_levels * self.num_points
+        )
+        attention_weights = attention_weights.softmax(-1)
+        attention_weights = attention_weights.view(
+            bs,
+            num_query,
+            self.num_heads,
+            self.num_levels,
+            self.num_points,
+        )
+
+        # bs, num_query, num_heads, num_levels, num_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets
+                / self.num_points
+                * reference_points[:, :, None, :, None, 2:]
+                * 0.5
+            )
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
+                    reference_points.shape[-1]
+                )
+            )
+
+        if torch.cuda.is_available() and value.is_cuda:
+            halffloat = False
+            bhalffloat = False
+            if value.dtype == torch.float16:
+                halffloat = True
+                value = value.float()
+                sampling_locations = sampling_locations.float()
+                attention_weights = attention_weights.float()
+            elif value.dtype == torch.bfloat16:
+                bhalffloat = True
+                value = value.float()
+                sampling_locations = sampling_locations.float()
+                attention_weights = attention_weights.float()
+
+            output = MultiScaleDeformableAttnFunction.apply(
+                value,
+                spatial_shapes,
+                level_start_index,
+                sampling_locations,
+                attention_weights,
+                self.im2col_step,
+            )
+
+            if halffloat:
+                output = output.half()
+            elif bhalffloat:
+                # cast back with .to(); torch.tensor(output, ...) would copy and detach from the graph
+                output = output.to(torch.bfloat16)
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights
+            )
+
+        output = self.output_proj(output)
+
+        if not self.batch_first:
+            output = 
output.permute(1, 0, 2) + + return output + + +def create_dummy_class(klass, dependency, message=""): + """ + When a dependency of a class is not available, create a dummy class which throws ImportError + when used. + + Args: + klass (str): name of the class. + dependency (str): name of the dependency. + message: extra message to print + Returns: + class: a class object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass) + if message: + err = err + " " + message + + class _DummyMetaClass(type): + # throw error on class attribute access + def __getattr__(_, __): # noqa: B902 + raise ImportError(err) + + class _Dummy(object, metaclass=_DummyMetaClass): + # throw error on constructor + def __init__(self, *args, **kwargs): + raise ImportError(err) + + return _Dummy + + +def create_dummy_func(func, dependency, message=""): + """ + When a dependency of a function is not available, create a dummy function which throws + ImportError when used. + + Args: + func (str): name of the function. + dependency (str or list[str]): name(s) of the dependency. + message: extra message to print + Returns: + function: a function object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func) + if message: + err = err + " " + message + + if isinstance(dependency, (list, tuple)): + dependency = ",".join(dependency) + + def _dummy(*args, **kwargs): + raise ImportError(err) + + return _dummy diff --git a/fengshen/models/Lyrics/groundingdino/setup.py b/fengshen/models/Lyrics/groundingdino/setup.py new file mode 100644 index 0000000..12d2328 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/setup.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
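+#
+# Build note (assumed usage; not stated in this patch): the extension declared below is
+# exposed as `lyrica._C`, which ms_deform_attn.py imports. It is typically compiled ahead
+# of training with `python setup.py build install` (or an editable `pip install -e .`).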
+# ------------------------------------------------------------------------------------------------ +# Modified from +# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/setup.py +# https://github.com/facebookresearch/detectron2/blob/main/setup.py +# https://github.com/open-mmlab/mmdetection/blob/master/setup.py +# https://github.com/Oneflow-Inc/libai/blob/main/setup.py +# ------------------------------------------------------------------------------------------------ + +import glob +import os +import subprocess + +import torch +from setuptools import find_packages, setup +from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension + +# groundingdino version info +version = "0.1.0" +package_name = "groundingdino" +cwd = os.path.dirname(os.path.abspath(__file__)) + +requirements = ["torch", "torchvision"] + +torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "csrc") + + main_source = os.path.join(extensions_dir, "vision.cpp") + sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob( + os.path.join(extensions_dir, "*.cu") + ) + + sources = [main_source] + sources + + # We need these variables to build with CUDA when we create the Docker image + # It solves https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/53 + # and https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/84 when running + # inside a Docker container. + am_i_docker = os.environ.get('AM_I_DOCKER', '').casefold() in ['true', '1', 't'] + use_cuda = os.environ.get('BUILD_WITH_CUDA', '').casefold() in ['true', '1', 't'] + + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if (torch.cuda.is_available() and CUDA_HOME is not None) or \ + (am_i_docker and use_cuda): + print("Compiling with CUDA") + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + print("Compiling without CUDA") + define_macros += [("WITH_HIP", None)] + extra_compile_args["nvcc"] = [] + return None + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "lyrica._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +if __name__ == "__main__": + print(f"Building wheel {package_name}-{version}") + + # with open("LICENSE", "r", encoding="utf-8") as f: + # license = f.read() + + + setup( + name="deformable detr cuda operator", + version="0.1.0", + # author="International Digital Economy Academy, Shilong Liu", + # url="https://github.com/IDEA-Research/GroundingDINO", + description="deformable detr cuda operator", + # license=license, + packages=find_packages( + exclude=( + "configs", + "tests", + ) + ), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, + ) diff --git a/fengshen/models/Lyrics/groundingdino/transformer.py b/fengshen/models/Lyrics/groundingdino/transformer.py new file mode 100644 index 0000000..ddbb4d8 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/transformer.py @@ -0,0 +1,908 @@ +# 
------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR Transformer class. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +from typing import Optional + +import torch +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn + +from fengshen.models.groundedblip.groundingdino.utils import inverse_sigmoid + +from fengshen.models.groundedblip.groundingdino.fuse_modules import BiAttentionBlock +from fengshen.models.groundedblip.groundingdino.ms_deform_attn import MultiScaleDeformableAttention as MSDeformAttn +from fengshen.models.groundedblip.groundingdino.transformer_vanilla import TransformerEncoderLayer +from fengshen.models.groundedblip.groundingdino.utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, +) + + +class Transformer(nn.Module): + def __init__( + self, + args, + ): + super().__init__() + self.num_feature_levels = args.num_feature_levels + self.num_encoder_layers = args.enc_layers + self.num_unicoder_layers = args.num_unicoder_layers + self.num_decoder_layers = args.dec_layers + self.num_queries = args.num_queries + assert args.query_dim == 4 + + # choose encoder layer type + encoder_layer = DeformableTransformerEncoderLayer( + args.hidden_dim, args.dim_feedforward, args.dropout, args.transformer_activation, args.num_feature_levels, args.nheads, args.enc_n_points + ) + + text_enhance_layer = TransformerEncoderLayer( + d_model=args.hidden_dim, + nhead=args.nheads // 2, + dim_feedforward=args.dim_feedforward // 2, + dropout=args.text_dropout, + ) + + feature_fusion_layer = BiAttentionBlock( + v_dim=args.hidden_dim, + l_dim=args.hidden_dim, + embed_dim=args.dim_feedforward // 2, + num_heads=args.nheads // 2, + dropout=args.fusion_dropout, + drop_path=args.fusion_droppath, + ) + + encoder_norm = nn.LayerNorm(args.hidden_dim) if args.pre_norm else None + assert encoder_norm is None + self.encoder = TransformerEncoder( + encoder_layer, + args.enc_layers, + d_model=args.hidden_dim, + num_queries=args.num_queries, + text_enhance_layer=text_enhance_layer, + feature_fusion_layer=feature_fusion_layer, + use_checkpoint=args.use_checkpoint, + use_transformer_ckpt=args.use_transformer_ckpt, + ) + + # choose decoder layer type + decoder_layer = DeformableTransformerDecoderLayer( + args.hidden_dim, + args.dim_feedforward, + args.dropout, + args.transformer_activation, + args.num_feature_levels, + args.nheads, + args.dec_n_points, + use_text_cross_attention=args.use_text_cross_attention, + ) + + decoder_norm = nn.LayerNorm(args.hidden_dim) + self.decoder = TransformerDecoder( + decoder_layer, + args.dec_layers, + 
decoder_norm, + return_intermediate=args.return_intermediate_dec, + d_model=args.hidden_dim, + query_dim=args.query_dim, + num_feature_levels=args.num_feature_levels, + ) + + self.d_model = args.hidden_dim + self.nhead = args.nheads + self.dec_layers = args.dec_layers + self.num_queries = args.num_queries # useful for single stage model only + self.num_patterns = args.num_patterns + if not isinstance(args.num_patterns, int): + Warning("num_patterns should be int but {}".format(type(args.num_patterns))) + self.num_patterns = 0 + + if args.num_feature_levels > 1: + if self.num_encoder_layers > 0: + self.level_embed = nn.Parameter(torch.Tensor(args.num_feature_levels, args.hidden_dim)) + else: + self.level_embed = None + + self.learnable_tgt_init = args.learnable_tgt_init + assert args.learnable_tgt_init, "why not learnable_tgt_init" + self.embed_init_tgt = args.embed_init_tgt + if (args.two_stage_type != "no" and args.embed_init_tgt) or (args.two_stage_type == "no"): + self.tgt_embed = nn.Embedding(self.num_queries, args.hidden_dim) + nn.init.normal_(self.tgt_embed.weight.data) + else: + self.tgt_embed = None + + # for two stage + self.two_stage_type = args.two_stage_type + assert args.two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + args.two_stage_type + ) + if args.two_stage_type == "standard": + # anchor selection at the output of encoder + self.enc_output = nn.Linear(args.hidden_dim, args.hidden_dim) + self.enc_output_norm = nn.LayerNorm(args.hidden_dim) + self.two_stage_wh_embedding = None + + if args.two_stage_type == "no": + self.init_ref_points(args.num_queries) # init self.refpoint_embed + + self.enc_out_class_embed = None + self.enc_out_bbox_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + if self.num_feature_levels > 1 and self.level_embed is not None: + nn.init.normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, 4) + + def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None): + """ + Input: + - srcs: List of multi features [bs, ci, hi, wi] + - masks: List of multi masks [bs, hi, wi] + - refpoint_embed: [bs, num_dn, 4]. None in infer + - pos_embeds: List of multi pos embeds [bs, ci, hi, wi] + - tgt: [bs, num_dn, d_model]. 
None in infer + + """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + # import time + # start_time = time.time() + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + + src = src.flatten(2).transpose(1, 2) # bs, hw, c + mask = mask.flatten(1) # bs, hw + pos_embed = pos_embed.flatten(2).transpose(1, 2) # bs, hw, c + if self.num_feature_levels > 1 and self.level_embed is not None: + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + else: + lvl_pos_embed = pos_embed + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + + # end_time = time.time() + # print(end_time-start_time) + + src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device + ) + level_start_index = torch.cat( + (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]) + ) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # two stage + enc_topk_proposals = enc_refpoint_embed = None + # end_time = time.time() + # print(end_time-start_time) + ######################################################### + # Begin Encoder + ######################################################### + memory, memory_text = self.encoder( + src_flatten, + pos=lvl_pos_embed_flatten, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + key_padding_mask=mask_flatten, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . 
False means use the token; True means pad the token
+            position_ids=text_dict["position_ids"],
+            text_self_attention_masks=text_dict["text_self_attention_masks"],
+        )
+        # end_time = time.time()
+        # print(end_time-start_time)
+        #########################################################
+        # End Encoder
+        # - memory: bs, \sum{hw}, c
+        # - mask_flatten: bs, \sum{hw}
+        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
+        # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        #########################################################
+        text_dict["encoded_text"] = memory_text
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     if memory.isnan().any() | memory.isinf().any():
+        #         import ipdb; ipdb.set_trace()
+
+        if self.two_stage_type == "standard":
+            output_memory, output_proposals = gen_encoder_output_proposals(
+                memory, mask_flatten, spatial_shapes
+            )
+            output_memory = self.enc_output_norm(self.enc_output(output_memory))
+            # print(output_memory.size())
+
+            if text_dict is not None:
+                enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
+            else:
+                enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
+
+            topk_logits = enc_outputs_class_unselected.max(-1)[0]
+            # print('topk_logits:', topk_logits.shape)
+            enc_outputs_coord_unselected = (
+                self.enc_out_bbox_embed(output_memory) + output_proposals
+            )  # (bs, \sum{hw}, 4) unsigmoid
+            topk = self.num_queries
+
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq
+            # boxes are initialized from the mask (top-left corner points)
+            # print('topk_proposals:', topk_proposals.shape)
+            # gather boxes
+            refpoint_embed_undetach = torch.gather(
+                enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            )  # unsigmoid
+            refpoint_embed_ = refpoint_embed_undetach.detach()
+            init_box_proposal = torch.gather(
+                output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            ).sigmoid()  # sigmoid
+
+            # gather tgt
+            tgt_undetach = torch.gather(
+                output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model)
+            )
+            if self.embed_init_tgt:
+                tgt_ = (
+                    self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
+                )  # nq, bs, d_model
+            else:
+                tgt_ = tgt_undetach.detach()
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+        elif self.two_stage_type == "no":
+            tgt_ = (
+                self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
+            )  # nq, bs, d_model
+            refpoint_embed_ = (
+                self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
+            )  # nq, bs, 4
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+            if self.num_patterns > 0:
+                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
+                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
+                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(
+                    self.num_queries, 1
+                )  # 1, n_q*n_pat, d_model
+                tgt = tgt_embed + tgt_pat
+
+            init_box_proposal = refpoint_embed_.sigmoid()
+
+        else:
+            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
+
+        #########################################################
+        # End preparing tgt
+        # - tgt: bs, NQ, d_model
+        # - refpoint_embed(unsigmoid): bs, NQ, d_model
+        
######################################################### + + ######################################################### + # Begin Decoder + ######################################################### + hs, references = self.decoder( + tgt=tgt.transpose(0, 1), + memory=memory.transpose(0, 1), + memory_key_padding_mask=mask_flatten, + pos=lvl_pos_embed_flatten.transpose(0, 1), + refpoints_unsigmoid=refpoint_embed.transpose(0, 1), + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=attn_mask, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . False means use the token; True means pad the token + ) + + ######################################################### + # End Decoder + # hs: n_dec, bs, nq, d_model + # references: n_dec+1, bs, nq, query_dim + ######################################################### + + ######################################################### + # Begin postprocess + ######################################################### + if self.two_stage_type == "standard": + hs_enc = tgt_undetach.unsqueeze(0) + ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0) + else: + hs_enc = ref_enc = None + ######################################################### + # End postprocess + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None + # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None + ######################################################### + + return hs, references, hs_enc, ref_enc, init_box_proposal + # hs: (n_dec, bs, nq, d_model) + # references: sigmoid coordinates. (n_dec+1, bs, bq, 4) + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None + # ref_enc: sigmoid coordinates. \ + # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None + + +class TransformerEncoder(nn.Module): + def __init__( + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): + """_summary_ + + Args: + encoder_layer (_type_): _description_ + num_layers (_type_): _description_ + norm (_type_, optional): _description_. Defaults to None. + d_model (int, optional): _description_. Defaults to 256. + num_queries (int, optional): _description_. Defaults to 300. + enc_layer_share (bool, optional): _description_. Defaults to False. 
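+            text_enhance_layer (nn.Module, optional): self-attention layer applied to the text tokens at every encoder step (a TransformerEncoderLayer in this file). Defaults to None.
+            feature_fusion_layer (nn.Module, optional): image-text fusion layer (a BiAttentionBlock) run before each deformable image layer. Defaults to None.
+            use_checkpoint (bool, optional): gradient-checkpoint the fusion layers. Defaults to False.
+            use_transformer_ckpt (bool, optional): gradient-checkpoint the deformable image layers. Defaults to False.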
+ + """ + super().__init__() + # prepare layers + self.layers = [] + self.text_layers = [] + self.fusion_layers = [] + if num_layers > 0: + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + + self.text_layers = _get_clones( + text_enhance_layer, num_layers, layer_share=enc_layer_share + ) + + self.fusion_layers = _get_clones( + feature_fusion_layer, num_layers, layer_share=enc_layer_share + ) + else: + self.layers = [] + del encoder_layer + + if text_enhance_layer is not None: + self.text_layers = [] + del text_enhance_layer + if feature_fusion_layer is not None: + self.fusion_layers = [] + del feature_fusion_layer + + self.query_scale = None + self.num_queries = num_queries + self.num_layers = num_layers + self.d_model = d_model + + self.use_checkpoint = use_checkpoint + self.use_transformer_ckpt = use_transformer_ckpt + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + # for images + src: Tensor, + pos: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + key_padding_mask: Tensor, + # for texts + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + pos_text: Tensor = None, + text_self_attention_masks: Tensor = None, + position_ids: Tensor = None, + ): + """ + Input: + - src: [bs, sum(hi*wi), 256] + - pos: pos embed for src. [bs, sum(hi*wi), 256] + - spatial_shapes: h,w of each level [num_level, 2] + - level_start_index: [num_level] start point of level in sum(hi*wi). 
+ - valid_ratios: [bs, num_level, 2] + - key_padding_mask: [bs, sum(hi*wi)] + + - memory_text: bs, n_text, 256 + - text_attention_mask: bs, n_text + False for no padding; True for padding + - pos_text: bs, n_text, 256 + + - position_ids: bs, n_text + Intermedia: + - reference_points: [bs, sum(hi*wi), num_level, 2] + Outpus: + - output: [bs, sum(hi*wi), 256] + """ + + output = src + # import time + # start_time = time.time() + # preparation and reshape + if self.num_layers > 0: + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=src.device + ) + # end_time = time.time() + # print('encoder: ',end_time-start_time) + if self.text_layers: + # generate pos_text + bs, n_text, text_dim = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + torch.arange(n_text, device=memory_text.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_sine_pos_embed( + position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + # end_time = time.time() + # print('encoder1: ',end_time-start_time) + # main process + for layer_id, layer in enumerate(self.layers): + # if output.isnan().any() or memory_text.isnan().any(): + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + if self.fusion_layers: + if self.use_checkpoint: + output, memory_text = checkpoint.checkpoint( + self.fusion_layers[layer_id], + output, + memory_text, + key_padding_mask, + text_attention_mask, + ) + else: + output, memory_text = self.fusion_layers[layer_id]( + v=output, + l=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + + if self.text_layers: + memory_text = self.text_layers[layer_id]( + src=memory_text.transpose(0, 1), + src_mask=~text_self_attention_masks, # note we use ~ for mask here + src_key_padding_mask=text_attention_mask, + pos=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + # main process + if self.use_transformer_ckpt: + output = checkpoint.checkpoint( + layer, + output, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask, + ) + else: + output = layer( + src=output, + pos=pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + # end_time = time.time() + # print('encoder_recussive: ',end_time-start_time) + return output, memory_text + + +class TransformerDecoder(nn.Module): + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + self.query_pos_sine_scale = None + + self.query_scale = None + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + + self.ref_anchor_head = None + + def forward( + self, + tgt, + memory, + tgt_mask: 
Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2 + # for memory + level_start_index: Optional[Tensor] = None, # num_levels + spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[Tensor] = None, + # for text + memory_text: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = refpoints_unsigmoid.sigmoid() + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :] + ) # nq, bs, 256*2 + + # conditional query + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if query_pos.isnan().any() | query_pos.isinf().any(): + # import ipdb; ipdb.set_trace() + + # main process + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory_text=memory_text, + text_attention_mask=text_attention_mask, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask, + ) + if output.isnan().any() | output.isinf().any(): + print(f"output layer_id {layer_id} is nan") + try: + num_nan = output.isnan().sum().item() + num_inf = output.isinf().sum().item() + print(f"num_nan {num_nan}, num_inf {num_inf}") + except Exception as e: + print(e) + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # import ipdb; ipdb.set_trace() + + # iter update + if self.bbox_embed is not None: + # box_holder = self.bbox_embed(output) + # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) + # new_reference_points = box_holder[..., :self.query_dim].sigmoid() + + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = outputs_unsig.sigmoid() + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out.transpose(0, 1) for itm_out in intermediate], + [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points], + ] + + +class DeformableTransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): + super().__init__() + + # 
self attention + self.self_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + ): + # self attention + # import ipdb; ipdb.set_trace() + src2 = self.self_attn( + query=self.with_pos_embed(src, pos), + reference_points=reference_points, + value=src, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): + super().__init__() + + # cross attention + self.cross_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm1 = nn.LayerNorm(d_model) + + # cross attention text + if use_text_cross_attention: + self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.catext_norm = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) + self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_proj = None + self.use_text_feat_guide = use_text_feat_guide + assert not use_text_feat_guide + self.use_text_cross_attention = use_text_cross_attention + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + with torch.cuda.amp.autocast(enabled=False): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + # for tgt + tgt: Optional[Tensor], # nq, bs, d_model + tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[Tensor] = None, + tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 + memory_text: Optional[Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[Tensor] = None, # bs, num_token + # for memory + memory: Optional[Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[Tensor] = None, + memory_level_start_index: Optional[Tensor] = None, # num_levels + memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + assert cross_attn_mask is None + + # self attention + if self.self_attn is not None: + # import ipdb; ipdb.set_trace() + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.use_text_cross_attention: + tgt2 = self.ca_text( + self.with_pos_embed(tgt, tgt_query_pos), + memory_text.transpose(0, 1), + memory_text.transpose(0, 1), + key_padding_mask=text_attention_mask, + )[0] + tgt = tgt + self.catext_dropout(tgt2) + tgt = self.catext_norm(tgt) + + tgt2 = self.cross_attn( + query=self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1), + reference_points=tgt_reference_points.transpose(0, 1).contiguous(), + value=memory.transpose(0, 1), + spatial_shapes=memory_spatial_shapes, + level_start_index=memory_level_start_index, + key_padding_mask=memory_key_padding_mask, + ).transpose(0, 1) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt diff --git a/fengshen/models/Lyrics/groundingdino/transformer_vanilla.py b/fengshen/models/Lyrics/groundingdino/transformer_vanilla.py new file mode 100644 index 0000000..4b33313 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/transformer_vanilla.py @@ -0,0 +1,126 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. 
+
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from fengshen.models.groundedblip.groundingdino.utils import (
+    MLP,
+    _get_activation_fn,
+    _get_clones,
+)
+
+
+class TextTransformer(nn.Module):
+    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
+        super().__init__()
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.nheads = nheads
+        self.dim_feedforward = dim_feedforward
+        self.norm = None
+
+        single_encoder_layer = TransformerEncoderLayer(
+            d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout
+        )
+        self.layers = _get_clones(single_encoder_layer, num_layers)
+
+    def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor):
+        """
+
+        Args:
+            text_attention_mask: bs, num_token
+            memory_text: bs, num_token, d_model
+
+        Raises:
+            RuntimeError: _description_
+
+        Returns:
+            output: bs, num_token, d_model
+        """
+
+        output = memory_text.transpose(0, 1)
+
+        for layer in self.layers:
+            output = layer(output, src_key_padding_mask=text_attention_mask)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output.transpose(0, 1)
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+        self.nhead = nhead
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        # repeat attn mask (guard against src_mask=None, e.g. when called from TextTransformer)
+        if src_mask is not None and src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
+            # bs, num_q, num_k
+            src_mask = src_mask.repeat(self.nhead, 1, 1)
+
+        if src.dtype == torch.half:
+            q = k = self.with_pos_embed(src, pos).half()
+            # q = k = self.with_pos_embed(src, pos)
+        elif src.dtype == torch.bfloat16:
+            # .to() keeps the autograd graph; torch.tensor() would copy and detach
+            q = k = self.with_pos_embed(src, pos).to(torch.bfloat16)
+        else:
+            q = k = self.with_pos_embed(src, pos)
+
+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]
+
+        # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
diff --git a/fengshen/models/Lyrics/groundingdino/transforms.py b/fengshen/models/Lyrics/groundingdino/transforms.py
new file mode 100644
index 0000000..6b72b32
--- /dev/null
+++ b/fengshen/models/Lyrics/groundingdino/transforms.py
@@ -0,0 +1,311 @@
+# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import os +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from fengshen.models.groundedblip.groundingdino.utils import box_xyxy_to_cxcywh +from fengshen.models.groundedblip.groundingdino.utils import interpolate + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd", "positive_map"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? + target["masks"] = target["masks"][:, i : i + h, j : j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target["boxes"].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target["masks"].flatten(1).any(1) + + for field in fields: + if field in target: + target[field] = target[field][keep] + + if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO": + # for debug and visualization only. 
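+        # keep only the caption spans whose boxes/masks survived the crop (same keep mask as the fields above)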
+ if "strings_positive" in target: + target["strings_positive"] = [ + _i for _i, _j in zip(target["strings_positive"], keep) if _j + ] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor( + [w, 0, w, 0] + ) + target["boxes"] = boxes + + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height] + ) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target["masks"] = ( + interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + ) + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
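+    # record the padded (h, w); boxes are left unchanged because padding is only added on the bottom/right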
+ target["size"] = torch.tensor(padded_image.size[::-1]) + if "masks" in target: + target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class ResizeDebug(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + return resize(img, target, self.size) + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False): + # respect_boxes: True to keep all boxes + # False to tolerence box filter + self.min_size = min_size + self.max_size = max_size + self.respect_boxes = respect_boxes + + def __call__(self, img: PIL.Image.Image, target: dict): + init_boxes = len(target["boxes"]) + max_patience = 10 + for i in range(max_patience): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + result_img, result_target = crop(img, target, region) + if ( + not self.respect_boxes + or len(result_target["boxes"]) == init_boxes + or i == max_patience - 1 + ): + return result_img, result_target + return result_img, result_target + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, 
w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/fengshen/models/Lyrics/groundingdino/utils.py b/fengshen/models/Lyrics/groundingdino/utils.py new file mode 100644 index 0000000..adaf263 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/utils.py @@ -0,0 +1,471 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import copy +import math + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from typing import List, Optional +from collections import OrderedDict +import torch.distributed as dist +import torchvision + + +def _get_clones(module, N, layer_share=False): + # import ipdb; ipdb.set_trace() + if layer_share: + return nn.ModuleList([module for i in range(N)]) + else: + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. 
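        Example (an illustrative sketch, not from the original source; assumes a 3-D tensor of
        [x, y] reference points, with sizes made up for demonstration):
            >>> import torch
            >>> pos = torch.rand(2, 5, 2)                        # [bs, n_query, 2]
            >>> get_sine_pos_embed(pos, num_pos_feats=128).shape
            torch.Size([2, 5, 256])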
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +def gen_encoder_output_proposals( + memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None +): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + - learnedwh: 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].view(N_, H_, W_, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + # import ipdb; ipdb.set_trace() + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), + torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + + if learnedwh is not None: + # import ipdb; ipdb.set_trace() + wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) + else: + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + + # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) + # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + # wh = torch.ones_like(grid) / scale + proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) + proposals.append(proposal) + _cur += H_ * W_ + # import ipdb; ipdb.set_trace() + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + + # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) + + return output_memory, output_proposals + + +class RandomBoxPerturber: + def __init__( + self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2 + ) -> None: + self.noise_scale = torch.Tensor( + [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale] + ) + + def __call__(self, refanchors: Tensor) -> Tensor: + nq, bs, query_dim = refanchors.shape + device = refanchors.device + + noise_raw = torch.rand_like(refanchors) + noise_scale = 
self.noise_scale.to(device)[:query_dim] + + new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) + return new_refanchors.clamp_(0, 1) + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if no_reduction: + return loss + + return loss.mean(1).sum() / num_boxes + + +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + + def forward(self, x): + if self.layers[0].weight.dtype == torch.half: + x = x.half() + elif self.layers[0].weight.dtype == torch.bfloat16: + x = torch.tensor(x, dtype=torch.bfloat16) + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_activation_fn(activation, d_model=256, batch_dim=0): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown pos_tensor 
shape(-1):{}".format(pos_tensor.size(-1))) + return pos + + +class ContrastiveEmbed(nn.Module): + def __init__(self, max_text_len=256): + """ + Args: + max_text_len: max length of text. + """ + super().__init__() + self.max_text_len = max_text_len + + def forward(self, x, text_dict): + """_summary_ + + Args: + x (_type_): _description_ + text_dict (_type_): _description_ + { + 'encoded_text': encoded_text, # bs, 195, d_model + 'text_token_mask': text_token_mask, # bs, 195 + # True for used tokens. False for padding tokens + } + Returns: + _type_: _description_ + """ + assert isinstance(text_dict, dict) + + y = text_dict["encoded_text"] + text_token_mask = text_dict["text_token_mask"] + # print('y:', y.shape) + # print('text_token_mask:', text_token_mask.shape) + # print('layer_hs:', x.shape) + res = x @ y.transpose(-1, -2) + res.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device) + new_res[..., : res.shape[-1]] = res + + return new_res + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # 不包含的地方是0 + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + if mask == "auto": + self.mask = torch.zeros_like(tensors).to(tensors.device) + if self.mask.dim() == 3: + self.mask = self.mask.sum(0).to(bool) + elif self.mask.dim() == 4: + self.mask = self.mask.sum(1).to(bool) + else: + raise ValueError( + "tensors dim must be 3 or 4 but {}({})".format( + self.tensors.dim(), self.tensors.shape + ) + ) + + def imgsize(self): + res = [] + for i in range(self.tensors.shape[0]): + mask = self.mask[i] + maxH = (~mask).sum(0).max() + maxW = (~mask).sum(1).max() + res.append(torch.Tensor([maxH, maxW])) + return res + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def to_img_list_single(self, tensor, mask): + assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format(tensor.dim()) + maxH = (~mask).sum(0).max() + maxW = (~mask).sum(1).max() + img = tensor[:, :maxH, :maxW] + return img + + def to_img_list(self): + """remove the padding and 
convert to img list + + Returns: + [type]: [description] + """ + if self.tensors.dim() == 3: + return self.to_img_list_single(self.tensors, self.mask) + else: + res = [] + for i in range(self.tensors.shape[0]): + tensor_i = self.tensors[i] + mask_i = self.mask[i] + res.append(self.to_img_list_single(tensor_i, mask_i)) + return res + + @property + def device(self): + return self.tensors.device + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + @property + def shape(self): + return {"tensors.shape": self.tensors.shape, "mask.shape": self.mask.shape} + +@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + max_size = [] + for i in range(tensor_list[0].dim()): + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) + max_size.append(max_size_i) + max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + +def clean_state_dict(state_dict): + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k[:7] == "module.": + k = k[7:] # remove `module.` + new_state_dict[k] = v + return new_state_dict + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + +def is_main_process(): + return get_rank() == 0 + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + +__torchvision_need_compat_flag = float(torchvision.__version__.split(".")[1]) < 7 +if __torchvision_need_compat_flag: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + +def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ + Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. 
+ """ + if __torchvision_need_compat_flag < 0.7: + if input.numel() > 0: + return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) \ No newline at end of file diff --git a/fengshen/models/Lyrics/modeling_lyrics.py b/fengshen/models/Lyrics/modeling_lyrics.py new file mode 100644 index 0000000..52593bf --- /dev/null +++ b/fengshen/models/Lyrics/modeling_lyrics.py @@ -0,0 +1,2235 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch BLIP-2 model.""" + +import math +import re +from dataclasses import dataclass +from typing import Callable, Optional, Tuple, Union, List +import warnings +import random +import torchvision +import torch +import torch.utils.checkpoint +import copy +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.nn.utils.rnn import pad_sequence +import torch.nn.functional as F +import torch.distributed as dist +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, +) +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + logging, +) +# from transformers.models.blip_2.configuration_blip_2 import Blip2Config, Blip2QFormerConfig +from transformers.models.blip_2.modeling_blip_2 import Blip2ForConditionalGenerationModelOutput +from transformers import ( + Blip2PreTrainedModel, + Blip2VisionModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + Blip2QFormerModel, + PreTrainedTokenizer, + LogitsProcessorList, + LogitsProcessor, + StoppingCriteriaList, + GenerationConfig, +) +from fengshen.models.Lyrics.groundingdino.modeling_groundingdino import GroundingDINO +from fengshen.models.Lyrics.ram.models.ram import RAM +from fengshen.models.Lyrics.configuration_lyrics import LyricsConfig, LyricsQFormerConfig + + +logger = logging.get_logger(__name__) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id + ) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, 
config.hidden_size + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + # transformer为layernorm, lavis为LayerNorm + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[ + :, past_key_values_length: seq_length + past_key_values_length + ].clone() + # print(position_ids) + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class LyricsQFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, is_detection=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention and is_detection: + self.key = nn.Linear(config.detection_encoder_hidden_size, self.all_head_size) # 260, 256 + 4 + self.value = nn.Linear(config.detection_encoder_hidden_size, self.all_head_size) + elif is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): 
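# A minimal standalone sketch (example sizes, not the model's real config) of what
# transpose_for_scores computes: [batch, seq, hidden] -> [batch, heads, seq, head_dim].
import torch
states = torch.randn(2, 10, 768)                          # hidden_size = 768
heads = states.view(2, 10, 12, 64).permute(0, 2, 1, 3)    # 12 heads x 64 dims per head
assert heads.shape == (2, 12, 10, 64)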
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + if self.key.weight.dtype == torch.half: + encoder_hidden_states = encoder_hidden_states.half() + # encoder_hidden_states = encoder_hidden_states + elif self.key.weight.dtype == torch.bfloat16: + encoder_hidden_states = torch.tensor(encoder_hidden_states, dtype=torch.bfloat16) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
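# Hedged aside on the additive attention mask applied just above: masked positions carry
# a large negative bias, so after the softmax their weight is effectively zero (toy values).
import torch
scores = torch.tensor([[1.0, 2.0, 3.0]])
additive_mask = torch.tensor([[0.0, 0.0, -10000.0]])     # last position is padding
probs = torch.softmax(scores + additive_mask, dim=-1)    # ~[0.27, 0.73, 0.00]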
+ attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Blip2QFormer +class LyricsQFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class LyricsQFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, is_detection = False): + super().__init__() + self.attention = LyricsQFormerMultiHeadAttention(config, is_cross_attention, is_detection) + self.output = LyricsQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied 
from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Blip2QFormer +class LyricsQFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Blip2QFormer +class LyricsQFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class LyricsQFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = LyricsQFormerAttention(config) + + self.layer_idx = layer_idx + self.num_vit_query_tokens = config.num_vit_query_tokens + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = LyricsQFormerAttention(config, is_cross_attention=True) + self.detection_crossattention = LyricsQFormerAttention(config, is_cross_attention=True, is_detection=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = LyricsQFormerIntermediate(config) + self.output = LyricsQFormerOutput(config) + + self.intermediate_query = LyricsQFormerIntermediate(config) + self.output_query = LyricsQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError( + "encoder_hidden_states must be given for cross-attention layers") + if detection_encoder_hidden_states is None: + raise ValueError( + "detection_encoder_hidden_states must be given for cross-attention layers") + if attention_mask is not None: + cross_attention_mask = attention_mask[:, :self.num_vit_query_tokens] + detection_cross_attention_mask = attention_mask[:, self.num_vit_query_tokens:] + else: + cross_attention_mask = None + 
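# Hedged sketch of the split this layer performs (query count and sizes are made-up values):
# the first num_vit_query_tokens query states cross-attend to the ViT features, the remaining
# ones to the Grounding-DINO features, and the two halves are concatenated again afterwards.
import torch
num_vit_query_tokens = 32
query_states = torch.randn(2, 48, 768)                    # 32 ViT queries + 16 detection queries
vit_queries = query_states[:, :num_vit_query_tokens, :]   # -> image cross-attention
det_queries = query_states[:, num_vit_query_tokens:, :]   # -> detection cross-attention
rejoined = torch.cat([vit_queries, det_queries], dim=1)   # back to [2, 48, 768]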
detection_cross_attention_mask = None + cross_attention_outputs = self.crossattention( + query_attention_output[:, :self.num_vit_query_tokens, :], + cross_attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + vit_query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + query_attention_probs = cross_attention_outputs[1:-1] + + detection_cross_attention_outputs = self.detection_crossattention( + query_attention_output[:, self.num_vit_query_tokens:, :], + detection_cross_attention_mask, + head_mask, + detection_encoder_hidden_states, + detection_encoder_attention_mask, + output_attentions=output_attentions, + ) + detection_query_attention_output = detection_cross_attention_outputs[0] + # add cross attentions if we output attention weights + detection_query_attention_probs = detection_cross_attention_outputs[1:-1] + + if output_attentions == True: + padding_attention = torch.zeros((query_attention_probs[0].size(0), + query_attention_probs[0].size(1), + detection_query_attention_probs[0].size(2) - query_attention_probs[0].size(2))) + query_attention_probs = torch.cat([query_attention_probs[0], padding_attention], dim = -1) + + outputs = outputs + (torch.cat([query_attention_probs[0], detection_query_attention_probs], dim=1),) + else: + outputs = outputs + cross_attention_outputs[1:-1] + + query_attention_output = torch.cat([vit_query_attention_output, detection_query_attention_output], dim=1) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + # present_key_value是self attention的key,value, 用于在decoder中以前的词的key,value不用再重复计算 + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class LyricsQFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [LyricsQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if 
output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + detection_encoder_hidden_states, + detection_encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + detection_encoder_hidden_states, + detection_encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + # 这里的cross attention是经过pad的 + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class LyricsQFormerModel(Blip2PreTrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: LyricsQFormerConfig): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + # self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = LyricsQFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + is_decoder: bool, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. 
+ input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + query_embeds=None, + attention_mask=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. 
This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + assert ( + query_embeds is not None + ), "You have to specify query_embeds when input_ids is None" + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - + self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
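# Hedged sketch of the decoder-style mask that get_extended_attention_mask (called just
# below) builds, for a toy seq_length of 4: entry [i, j] is 1 when position i may attend
# to position j (j <= i), and the mask is later turned into an additive bias of 0 / -10000.
import torch
seq_length = 4
seq_ids = torch.arange(seq_length)
causal = (seq_ids[None, None, :].repeat(1, seq_length, 1) <= seq_ids[None, :, None]).float()
additive = (1.0 - causal[:, None, :, :]) * -10000.0       # shape [1, 1, 4, 4]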
+ if is_decoder: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, + input_ids.shape, + device, + is_decoder, + has_query=(query_embeds is not None), + ) + else: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + if detection_encoder_hidden_states is not None: + if type(detection_encoder_hidden_states) == list: + detection_encoder_batch_size, detection_encoder_sequence_length, _ = detection_encoder_hidden_states[0].size() + else: + ( + detection_encoder_batch_size, + detection_encoder_sequence_length, + _, + ) = detection_encoder_hidden_states.size() + detection_encoder_hidden_shape = (detection_encoder_batch_size, detection_encoder_sequence_length) + + if type(detection_encoder_attention_mask) == list: + detection_encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in detection_encoder_attention_mask] + elif detection_encoder_attention_mask is None: + detection_encoder_attention_mask = torch.ones(detection_encoder_hidden_shape, device=device) + detection_encoder_extended_attention_mask = self.invert_attention_mask(detection_encoder_attention_mask) + else: + detection_encoder_extended_attention_mask = self.invert_attention_mask(detection_encoder_attention_mask) + else: + detection_encoder_extended_attention_mask = None + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + detection_encoder_hidden_states=detection_encoder_hidden_states, + detection_encoder_attention_mask=detection_encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + 
last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config: LyricsQFormerConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + +# 把注意力方式改一下,就可以做MLM了 +class LyricsQFormerOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class LyricsQFormerWithLMHead(Blip2PreTrainedModel): + base_model_prefix = "bert" + + def __init__(self, config: LyricsQFormerConfig): + super().__init__(config) + + self.bert = LyricsQFormerModel(config) + self.cls = LyricsQFormerOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. 
Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + detection_encoder_hidden_states=detection_encoder_hidden_states, + detection_encoder_attention_mask=detection_encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1]:, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + # 区分is_decoder + # 没mask掉的用-100代替 + lm_loss = None + if is_decoder == True: + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + else: + 
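# Hedged side-by-side of the two loss modes in this head (toy sizes, no label smoothing,
# CLS handling omitted): the causal branch above shifts logits/labels by one position, while
# the MLM branch below keeps positions aligned and relies on -100 labels to skip tokens.
import torch
from torch.nn import CrossEntropyLoss
vocab = 100
logits = torch.randn(2, 6, vocab)
labels = torch.randint(0, vocab, (2, 6))
lm_loss = CrossEntropyLoss()(logits[:, :-1].reshape(-1, vocab), labels[:, 1:].reshape(-1))
mlm_labels = labels.clone()
mlm_labels[:, ::2] = -100                                 # positions labeled -100 contribute no loss
mlm_loss = CrossEntropyLoss()(logits.reshape(-1, vocab), mlm_labels.reshape(-1))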
if labels is not None: + # we are doing mask prediction; no shift is needed, but do not compute the loss on the cls_token + # bs, seq, vocab + prediction_scores = prediction_scores[:, 1:, :].contiguous() + # bs, seq + labels = labels[:, 1:].contiguous() + # print('prediction_scores:', prediction_scores.size()) + # print('labels:', labels.size()) + # print('max_labels:', torch.max(labels, dim=1)) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + mlm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + mlm_loss = mlm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((mlm_loss,) + output) if mlm_loss is not None else output + + if is_decoder: + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + else: + return MaskedLMOutput( + loss=mlm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +@dataclass +class LyricsOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + + loss_itc: Optional[torch.FloatTensor] = None + + loss_itm: Optional[torch.FloatTensor] = None + + loss_lm: Optional[torch.FloatTensor] = None + + loss_mlm: Optional[torch.FloatTensor] = None + +# Used to wrap the output of BLIPQFormerWithLMHead for conditional generation +class LyricsQFormerForConditionalGeneration(Blip2PreTrainedModel): + config_class = LyricsConfig + main_input_name = "pixel_values" + + def __init__(self, config: LyricsConfig): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros( + 1, config.num_query_tokens, config.qformer_config.hidden_size)) + + self.qformer = LyricsQFormerWithLMHead(config.qformer_config) + + self.decoder_input_ids = config.qformer_config.bos_token_id + self.decoder_pad_token_id = config.qformer_config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + + return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(pixel_values.device) + + query_outputs = self.qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=True, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + lm_output = self.qformer( + input_ids, + attention_mask=attention_mask, + past_key_values=query_outputs.past_key_values, + return_dict=return_dict, + labels=labels, + ) + + if not return_dict: + # assemble a flat tuple from the LM-head outputs followed by the vision features + outputs = lm_output + (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return Blip2ForConditionalGenerationModelOutput( + loss=lm_output.loss, + decoder_logits=lm_output.logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=lm_output, + ) + +class LyricsLMForConditionalGeneration(Blip2PreTrainedModel): + config_class = LyricsConfig + main_input_name = "pixel_values" + + def __init__(self, config: LyricsConfig): + super().__init__(config) + + # token ids are passed in directly, so no tokenizer is needed here + self.vision_model = Blip2VisionModel(config.vision_config) + self.ram = RAM(config.ram_config) + self.grounding_dino = GroundingDINO(config.detection_config) + + self.query_tokens = nn.Parameter(torch.zeros( + 1, config.num_query_tokens, config.qformer_config.hidden_size)) + + # only the encoder part of the Q-Former is used here, not the variant with the cls/LM head + self.qformer = LyricsQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, config.text_config.hidden_size) + if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config(config.text_config) + else: + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def _preprocess_accelerate(self): + r""" + Some pre-processing hacks to make the model `accelerate` compatible. Check + https://github.com/huggingface/transformers/pull/21707 for more details. 
+ """ + hf_device_map = self.hf_device_map + + if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1: + # warn users about unexpected behavior when using multi-GPU + BLIP-2 + `accelerate`. + logger.warning( + "The `language_model` is not in the `hf_device_map` dictionary and you are running your script" + " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`." + " Please pass a `device_map` that contains `language_model` to remove this warning." + " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for", + " more details on creating a `device_map` for large models.", + ) + + if hasattr(self.language_model, "_hf_hook"): + self.language_model._hf_hook.io_same_device = True # For `generate` compatibility + + def forward( + self, + pixel_values: torch.FloatTensor, + ram_pixel_values: torch.FloatTensor, + grounding_pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: torch.FloatTensor, + labels: torch.FloatTensor = None, + # 因为label不会出现在image之前,所以这里不需要labels_before_image, 按照input_ids_before_image补-100就可以了 + qformer_input_ids: torch.FloatTensor = None, + qformer_attention_mask: torch.FloatTensor = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + tags_english, tags_chinese = self.ram.generate_tag(ram_pixel_values) + + input_tags = [tag.replace(' |', ',').lower().strip() + "." 
if not tag.endswith(".") else tag.replace(' |', ',').lower().strip() for tag in tags_english] + + # outputs = self.grounding_dino(grounding_image[None], captions=input_tags) + grounding_outputs = self.grounding_dino(grounding_pixel_values, captions=input_tags) + + detection_image_embeds = grounding_outputs["hidden_state"] # (bs, nq, 256) + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones( + image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + detection_image_attention_mask = torch.ones(detection_image_embeds.size()[:-1], dtype=torch.long).to(pixel_values.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image_embeds.device) + + if qformer_input_ids is None: + # print('no_hava_instruct') + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + else: + # print('hava_instruct') + text_qformer_atts = qformer_attention_mask + qformer_atts = torch.cat([query_atts, text_qformer_atts], dim=1) + query_outputs = self.qformer( + qformer_input_ids, + query_embeds=query_tokens, + attention_mask=qformer_atts, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + query_output = query_outputs[0][:, :query_tokens.size(1), :] + + # print(query_output.size()) + # step 2.5 generate the lm input by prompt and output + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + # make sure language_model_inputs and input_ids agree on the batch size + assert language_model_inputs.shape[0] == input_ids.shape[0] + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + + inputs_embeds = torch.cat( + [ + language_model_inputs, + inputs_embeds.to(language_model_inputs.device) + ], dim=1) + + attention_mask = torch.cat( + [ + language_model_attention_mask, + attention_mask.to(language_model_attention_mask.device) + ], dim=1 + ) + + # labels need the same treatment: prepend -100 for the query positions at the front + if labels is not None: + labels = torch.cat( + [ + torch.tensor([-100]).expand(query_tokens.shape[:-1] + ).to(language_model_inputs.device), + labels, + ], dim=1 + ) + + # step 3: use the language model + + if self.config.use_decoder_only_language_model: + # print('model is a use_decoder_only_language_model') + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + # labels=labels, + ) + + logits = outputs.logits if return_dict else outputs[0] + loss = None + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 
1:].contiguous().to(logits.device) + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + + + else: + raise Exception("not impl") + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + ram_pixel_values: torch.FloatTensor, + grounding_pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: torch.FloatTensor, + qformer_input_ids: torch.FloatTensor = None, + qformer_attention_mask: torch.FloatTensor = None, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + if hasattr(self, "hf_device_map"): + # preprocess for `accelerate` + self._preprocess_accelerate() + # print('data type: ', pixel_values.dtype) + batch_size = pixel_values.shape[0] + image_embeds = self.vision_model(pixel_values, return_dict=True).last_hidden_state + + tags_english, tags_chinese = self.ram.generate_tag(ram_pixel_values) + + input_tags = [tag.replace(' |', ',').lower().strip() + "." 
if not tag.endswith(".") else tag.replace(' |', ',').lower().strip() for tag in tags_english] + + # outputs = self.grounding_dino(grounding_image[None], captions=input_tags) + grounding_outputs = self.grounding_dino(grounding_pixel_values, captions=input_tags) + + detection_image_embeds = grounding_outputs["hidden_state"] # (bs, nq, 256) + + image_attention_mask = torch.ones( + image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + detection_image_attention_mask = torch.ones(detection_image_embeds.size()[:-1], dtype=torch.long).to(pixel_values.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image_embeds.device) + + if qformer_input_ids == None: + # print('no_hava_instruct') + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + ) + query_output = query_outputs[0] + else: + # print('hava_instruct') + if qformer_attention_mask == None: + qformer_attention_mask = torch.ones(qformer_input_ids.size(), dtype=torch.long).to(image_embeds.device) + qformer_atts = torch.cat([query_atts, qformer_attention_mask],dim=1) + query_outputs = self.qformer( + qformer_input_ids, + query_embeds=query_tokens, + attention_mask=qformer_atts, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + ) + query_output = query_outputs[0][:,:query_tokens.size(1),:] + # print('query_output:', query_output) + # print('query_output_size:', query_output.size()) + + language_model_inputs = self.language_projection(query_output) + language_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + # print('language_model_inputs:', language_model_inputs) + # print('language_model_inputs_size:', language_model_inputs.size()) + + + if attention_mask == None: + assert batch_size == 1 , print('If you do not pass in llm_instruct_atts, you can only be generated in a single sentence.') + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_attention_mask, attention_mask], dim=1) + + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds], dim=1) + # print('inputs_embeds:', inputs_embeds) + # print('attention_mask:', attention_mask) + language_outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + # outputs = [] + # for index, output in enumerate(language_outputs): + # output = output[inputs_embeds[index].size(0):] + # outputs.append(output) + + # return outputs + return language_outputs + + +class LyricsQFromerForPretrain(Blip2PreTrainedModel): + config_class = LyricsConfig + + def __init__(self, config: LyricsConfig): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + self.ram = RAM(config.ram_config) + self.grounding_dino = GroundingDINO(config.detection_config) + + self.query_tokens = nn.Parameter(torch.zeros( + 1, config.num_query_tokens, config.qformer_config.hidden_size)) + + # 同一个LMhead,不同的任务,加一个参数。或者拼起来。估计要加一个linear + # 
图片256,目标检测900,语义分割4096,怎么可以赋予不同的权重,时间不是问题,权重是问题 + self.qformer = LyricsQFormerWithLMHead(config.qformer_config) + + self.vision_proj = nn.Linear(self.qformer.config.hidden_size, config.image_text_hidden_size) + self.text_proj = nn.Linear(self.qformer.config.hidden_size, config.image_text_hidden_size) + + self.itm_head = nn.Linear(self.qformer.config.hidden_size, 2) + + self.temp = nn.Parameter(0.07 * torch.ones([])) + + self.max_txt_len = 512 # 512-96 = 416 + self.max_input_len = 600 + + # Initialize weights and apply final processing + self.post_init() + + def generate_bbox_caption(self, logits, boxes, english_tags, chinese_tags, language): + # filter output + # 0最大值, 1索引 + bbox_caption = [] + bbox_caption_tokens = [] + bbox_caption_tokens_with_mask = [] + bbox_caption_labels_with_mask = [] + english_tags_list = [[tag.strip() for tag in sentence.split(' |')] for sentence in english_tags] + chinese_tags_list = [[tag.strip() for tag in sentence.split(' |')] for sentence in chinese_tags] + + for ind in range(logits.size(0)): + single_filt_mask = logits[ind].max(dim=1)[0] > self.box_threshold + single_logits_filt = logits[ind][single_filt_mask] # num_filt, 256 + single_boxes_filt = boxes[ind][single_filt_mask] # num_filt, 4 + + if len(single_filt_mask) == 0: + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + + single_image_bbox_caption = '' + single_image_bbox_caption_tokens = [] + single_image_bbox_caption_tokens_with_mask = [] + single_image_bbox_caption_labels_with_mask = [] + # get phrase + tokenized = self.grounding_dino.tokenizer(english_tags[ind]) + # build pred + pred_phrases = [] + single_image_boxes = [] + single_image_scores = [] + for logit, box in zip(single_logits_filt, single_boxes_filt): + posmap = logit > self.text_threshold + assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor" + if posmap.dim() == 1: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist() + # max_idx = posmap.max[1] + token_ids = [tokenized["input_ids"][i] for i in non_zero_idx] + # token_ids = [tokenized["input_ids"][i] for i in non_zero_idx if i == max_idx] + pred_phrase = self.grounding_dino.tokenizer.decode(token_ids) + else: + raise NotImplementedError("posmap must be 1-dim") + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") + single_image_scores.append(logit.max().item()) + # box = box * torch.Tensor([self.config.image_size, self.config.image_size, self.config.image_size, self.config.image_size]) + box[:2] -= box[2:] / 2 + box[2:] += box[:2] + single_image_boxes.append(box) + # print("single_image_boxes:", len(single_image_boxes)) + if len(single_image_boxes) == 0: + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + + single_image_boxes = torch.stack(single_image_boxes) + single_image_scores = torch.Tensor(single_image_scores).to("cuda") + # nms_idx = torchvision.ops.nms(single_image_boxes, single_image_scores, self.iou_threshold).to('cpu').numpy().tolist() + nms_idx = torchvision.ops.nms(single_image_boxes, single_image_scores, self.iou_threshold) + single_image_boxes_filt = single_image_boxes[nms_idx] + pred_phrases = [pred_phrases[idx] for idx in nms_idx] + + # print("single_image_boxes_filt:", single_image_boxes_filt.size()) + # print("pred_phrases:", 
pred_phrases) + if single_image_boxes_filt.size(0) == 0: + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + # 处理一条数据的多个框 + for i in range(single_image_boxes_filt.size(0)): + # ori_box = single_image_boxes_filt[i] / torch.Tensor([self.config.image_size, self.config.image_size, self.config.image_size, self.config.image_size]) + ori_box = single_image_boxes_filt[i] + # ori_box = torch.Tensor([round(coordinate, 3) for coordinate in single_image_boxes_filt[i]]) + name, _ = pred_phrases[i].split('(') + name = name.replace('|', '').strip() + name = re.sub(r'\s-\s', '-', name) + input_name = None + # print('english_tags_list:', english_tags_list[ind]) + # print('name:', name) + if language == 'zh': + for tags_ind in range(len(english_tags_list[ind])): + if name == english_tags_list[ind][tags_ind]: + input_name = chinese_tags_list[ind][tags_ind] + break + if input_name == None: + flag = 0 + for tags_ind in range(len(english_tags_list[ind])): + for name_ind in range(len(name.split()),0,-1): + if ' '.join(name.split()[:name_ind]) == english_tags_list[ind][tags_ind]: + input_name = chinese_tags_list[ind][tags_ind] + flag = 1 + break + for name_ind in range(len(name.split())): + if ' '.join(name.split()[name_ind:]) == english_tags_list[ind][tags_ind]: + input_name = chinese_tags_list[ind][tags_ind] + flag = 1 + break + if flag == 1: + break + if input_name == None: + continue + else: + for tags_ind in range(len(english_tags_list[ind])): + if name == english_tags_list[ind][tags_ind]: + input_name = name + break + if input_name == None: + flag = 0 + for tags_ind in range(len(english_tags_list[ind])): + for name_ind in range(len(name.split()),0,-1): + if ' '.join(name.split()[:name_ind]) == english_tags_list[ind][tags_ind]: + input_name = english_tags_list[ind][tags_ind] + flag = 1 + break + for name_ind in range(len(name.split())): + if ' '.join(name.split()[name_ind:]) == english_tags_list[ind][tags_ind]: + input_name = english_tags_list[ind][tags_ind] + flag = 1 + break + if flag == 1: + break + if input_name == None: + # print('name:', name) + # print('input_name:', input_name) + # print('english_tags_list:', english_tags_list[ind]) + # print('input_name is none') + input_name = name + # if input_name == None: + # bbox_caption.append('') + # bbox_caption_tokens.append(torch.Tensor([])) + # bbox_caption_tokens_with_mask.append(torch.Tensor([])) + # bbox_caption_labels_with_mask.append(torch.Tensor([])) + # continue + + # print('input_name:', input_name) + single_bbox_caption = input_name + ': [' + ', '.join([str(round(coordinate.item(), 3)) for coordinate in ori_box]) + ']' + single_image_bbox_caption = single_image_bbox_caption + ' ' + single_bbox_caption + name_and_bbox_tokens = [] + name_and_bbox_tokens.append(torch.tensor(self.tokenizer(input_name, add_special_tokens=False).input_ids)) + name_and_bbox_tokens.append(torch.tensor(self.tokenizer('[', add_special_tokens=False).input_ids)) + for coordinate in ori_box: + name_and_bbox_tokens.append(torch.tensor(self.tokenizer(str(round(coordinate.item(), 3)), add_special_tokens=False).input_ids)) + name_and_bbox_tokens.append(torch.tensor(self.tokenizer(']', add_special_tokens=False).input_ids)) + + for name_and_bbox_tokens_ind in range(len(name_and_bbox_tokens)): + if name_and_bbox_tokens_ind == 1 or name_and_bbox_tokens_ind == 5: + 
single_image_bbox_caption_tokens_with_mask.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + single_image_bbox_caption_labels_with_mask.append(torch.full_like(name_and_bbox_tokens[name_and_bbox_tokens_ind], -100)) + else: + if random.random() <= 0.15: + single_image_bbox_caption_tokens_with_mask.append(torch.full_like(name_and_bbox_tokens[name_and_bbox_tokens_ind], self.tokenizer.mask_token_id)) + single_image_bbox_caption_labels_with_mask.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + else: + single_image_bbox_caption_tokens_with_mask.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + single_image_bbox_caption_labels_with_mask.append(torch.full_like(name_and_bbox_tokens[name_and_bbox_tokens_ind], -100)) + single_image_bbox_caption_tokens.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + + single_image_bbox_caption_tokens.append(torch.tensor([self.tokenizer.sep_token_id])) + single_image_bbox_caption_tokens_with_mask.append(torch.tensor([self.tokenizer.sep_token_id])) + single_image_bbox_caption_labels_with_mask.append(torch.tensor([-100])) + # try: + if single_image_bbox_caption_tokens == '': + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + single_image_bbox_caption_tokens = torch.cat(single_image_bbox_caption_tokens, dim = -1) + # except: + # print('single_bbox_caption:', single_bbox_caption) + # print('single_image_bbox_caption:', single_image_bbox_caption) + # print('name_and_bbox_tokens:', name_and_bbox_tokens) + # print('single_image_boxes:', single_image_boxes) + # print('single_image_boxes_filt:', single_image_boxes_filt) + # print('single_image_bbox_caption_tokens:', single_image_bbox_caption_tokens) + # print('single_image_boxes_filt:', single_image_boxes_filt.size(1)) + single_image_bbox_caption_tokens_with_mask = torch.cat(single_image_bbox_caption_tokens_with_mask, dim = -1) + single_image_bbox_caption_labels_with_mask = torch.cat(single_image_bbox_caption_labels_with_mask, dim = -1) + + bbox_caption.append(single_image_bbox_caption) + bbox_caption_tokens.append(single_image_bbox_caption_tokens) + bbox_caption_tokens_with_mask.append(single_image_bbox_caption_tokens_with_mask) + bbox_caption_labels_with_mask.append(single_image_bbox_caption_labels_with_mask) + # if torch.distributed.get_rank() == 0: + # print(bbox_caption[0]) + # print(bbox_caption_tokens[0]) + # print(self.tokenizer.decode(bbox_caption_tokens[0])) + # exit() + outputs = {'bbox_caption': bbox_caption, + 'bbox_caption_tokens': bbox_caption_tokens, + 'bbox_caption_tokens_with_mask': bbox_caption_tokens_with_mask, + 'bbox_caption_labels_with_mask': bbox_caption_labels_with_mask, + } + + return outputs + + def prepare_inputs_for_pretrain(self, captions, bbox_caption_tokens, bbox_caption_tokens_with_mask, bbox_caption_labels_with_mask): + text_input_tokens_ids = [] + text_input_tokens_ids_with_mask = [] + text_input_labels = [] + text_input_labels_with_mask = [] + text_input_attentions = [] + text_input_attentions_with_mask = [] + text_input_position_ids = [] + text_input_position_ids_with_mask = [] + batch_size = len(captions) + for ind, caption in enumerate(captions): + if len(caption)>500: + print(caption) + single_caption_tokens = torch.tensor(self.tokenizer(caption, add_special_tokens=False, truncation=True, max_length=87).input_ids) + + # 单纯caption的label + single_caption_labels = torch.tensor(single_caption_tokens) + 
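# Simplified standalone sketch of the random-masking scheme used in generate_bbox_caption above
# when building the *_with_mask token and label streams: each maskable chunk is replaced by the
# mask token id with probability 0.15, and the labels keep the original ids only at the masked
# positions (-100 everywhere else, so the MLM loss ignores them). In the loop above the chunks at
# indices 1 and 5 are never masked; mask_token_id=103 is just a placeholder, not the model's value.
import random
import torch

def mask_chunks(chunks, mask_token_id=103, p=0.15):
    tokens_with_mask, labels_with_mask = [], []
    for chunk in chunks:                      # each chunk is a 1-D LongTensor of token ids
        if random.random() <= p:
            tokens_with_mask.append(torch.full_like(chunk, mask_token_id))
            labels_with_mask.append(chunk)    # supervise the original ids at the masked slots
        else:
            tokens_with_mask.append(chunk)
            labels_with_mask.append(torch.full_like(chunk, -100))
    return torch.cat(tokens_with_mask), torch.cat(labels_with_mask)

toy_chunks = [torch.tensor([7, 8]), torch.tensor([9]), torch.tensor([10, 11, 12])]
masked_tokens, masked_labels = mask_chunks(toy_chunks)
print(masked_tokens.tolist(), masked_labels.tolist())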
single_caption_labels_with_mask = torch.full_like(single_caption_tokens, -100) + + # 无框 + if bbox_caption_tokens[ind].size(0) == 0: + single_text_input_tokens_ids = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), single_caption_tokens]) + single_text_input_tokens_ids_with_mask = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), single_caption_tokens]) + text_input_tokens_ids.append(single_text_input_tokens_ids) + text_input_tokens_ids_with_mask.append(single_text_input_tokens_ids_with_mask) + text_input_labels.append(torch.cat([torch.tensor([self.tokenizer.bos_token_id]), single_caption_labels])) + text_input_labels_with_mask.append(torch.cat([torch.tensor([-100]), single_caption_labels_with_mask])) + text_input_attentions.append(torch.ones_like(single_text_input_tokens_ids)) + text_input_attentions_with_mask.append(torch.ones_like(single_text_input_tokens_ids_with_mask)) + text_input_position_ids.append(torch.cat([torch.Tensor([0]), torch.arange(1, len(single_caption_tokens)+1)])) + text_input_position_ids_with_mask.append(torch.cat([torch.Tensor([0]), torch.arange(1, len(single_caption_tokens)+1)])) + continue + # 拼接bbox的token和text的token和label + if len(bbox_caption_tokens[ind]) > self.max_txt_len - 1: + bbox_caption_tokens[ind] = bbox_caption_tokens[ind][:self.max_txt_len-1] + bbox_caption_tokens_with_mask[ind] = bbox_caption_tokens_with_mask[ind][:self.max_txt_len-1] + bbox_caption_labels_with_mask[ind] = bbox_caption_labels_with_mask[ind][:self.max_txt_len-1] + + # LM任务的label + single_bbox_caption_labels = torch.full_like(bbox_caption_tokens[ind], -100) + + single_text_input_tokens_ids = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), bbox_caption_tokens[ind], single_caption_tokens]) + single_text_input_tokens_ids_with_mask = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), bbox_caption_tokens_with_mask[ind], single_caption_tokens]) + + #这里也要改 cls 0 其他初始位置1 + single_text_input_position_ids = torch.cat([torch.Tensor([0]), torch.arange(1, len(bbox_caption_tokens[ind])+1), torch.arange(1, len(single_caption_tokens)+1)]) + single_text_input_position_ids_with_mask = torch.cat([torch.Tensor([0]), torch.arange(1, len(bbox_caption_tokens_with_mask[ind])+1), torch.arange(1, len(single_caption_tokens)+1)]) + + single_text_input_labels = torch.cat([torch.tensor([self.tokenizer.bos_token_id]), single_bbox_caption_labels, single_caption_labels]) + single_text_input_labels_with_mask = torch.cat([torch.tensor([-100]), bbox_caption_labels_with_mask[ind], single_caption_labels_with_mask]) + + # mask与pad不一样,还是要做注意力 + single_text_input_attentions = torch.ones_like(single_text_input_tokens_ids) + single_text_input_attentions_with_mask = torch.ones_like(single_text_input_tokens_ids_with_mask) + # position_ids + # if single_text_input_tokens_ids.size(-1) > self.max_txt_len: + # single_text_input_tokens_ids = single_text_input_tokens_ids[:self.max_txt_len] + # single_text_input_tokens_ids_with_mask = single_text_input_tokens_ids_with_mask[:self.max_txt_len] + # single_text_input_labels = single_text_input_labels[:self.max_txt_len] + # single_text_input_labels_with_mask = single_text_input_labels_with_mask[:self.max_txt_len] + # single_text_input_attentions = single_text_input_attentions[:self.max_txt_len] + # single_text_input_attentions_with_mask = single_text_input_attentions_with_mask[:self.max_txt_len] + text_input_tokens_ids.append(single_text_input_tokens_ids) + text_input_tokens_ids_with_mask.append(single_text_input_tokens_ids_with_mask) + 
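# Standalone sketch of the padding trick applied after this loop: a dummy row of length
# max_input_len is appended before pad_sequence so that every real sequence is padded up to
# max_input_len, and the dummy row is then dropped again by slicing back to batch_size.
# All values here are toy numbers.
import torch
from torch.nn.utils.rnn import pad_sequence

max_input_len, pad_id = 8, 0
seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]    # real batch (batch_size = 2)
batch_size = len(seqs)

seqs.append(torch.ones(max_input_len, dtype=torch.long))  # dummy row forces the target width
padded = pad_sequence(seqs, batch_first=True, padding_value=pad_id)[:batch_size, :max_input_len]
print(padded.shape)     # torch.Size([2, 8])
print(padded.tolist())  # [[5, 6, 7, 0, 0, 0, 0, 0], [8, 9, 0, 0, 0, 0, 0, 0]]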
text_input_labels.append(single_text_input_labels) + text_input_labels_with_mask.append(single_text_input_labels_with_mask) + text_input_attentions.append(single_text_input_attentions) + text_input_attentions_with_mask.append(single_text_input_attentions_with_mask) + text_input_position_ids.append(single_text_input_position_ids) + text_input_position_ids_with_mask.append(single_text_input_position_ids_with_mask) + + # 添加一个长度为max_length的tensor + pad_tensor = torch.ones(self.max_input_len) + text_input_tokens_ids.append(pad_tensor) + text_input_tokens_ids_with_mask.append(pad_tensor) + text_input_labels.append(pad_tensor) + text_input_labels_with_mask.append(pad_tensor) + text_input_attentions.append(pad_tensor) + text_input_attentions_with_mask.append(pad_tensor) + text_input_position_ids.append(pad_tensor) + text_input_position_ids_with_mask.append(pad_tensor) + + text_input_tokens_ids = pad_sequence(text_input_tokens_ids, batch_first = True, padding_value = self.tokenizer.pad_token_id)[:batch_size, :self.max_input_len] + text_input_tokens_ids_with_mask = pad_sequence(text_input_tokens_ids_with_mask, batch_first = True, padding_value = self.tokenizer.pad_token_id)[:batch_size, :self.max_input_len] + text_input_labels = pad_sequence(text_input_labels, batch_first = True, padding_value = -100)[:batch_size, :self.max_input_len] + text_input_labels_with_mask = pad_sequence(text_input_labels_with_mask, batch_first = True, padding_value = -100)[:batch_size, :self.max_input_len] + text_input_attentions = pad_sequence(text_input_attentions, batch_first = True, padding_value = 0)[:batch_size, :self.max_input_len] + text_input_attentions_with_mask = pad_sequence(text_input_attentions_with_mask, batch_first = True, padding_value = 0)[:batch_size, :self.max_input_len] + text_input_position_ids = pad_sequence(text_input_position_ids, batch_first = True, padding_value = 0).long()[:batch_size, :self.max_input_len] + text_input_position_ids_with_mask = pad_sequence(text_input_position_ids_with_mask, batch_first = True, padding_value = 0).long()[:batch_size, :self.max_input_len] + + outputs = {"text_input_tokens_ids": text_input_tokens_ids, + "text_input_tokens_ids_with_mask": text_input_tokens_ids_with_mask, + "text_input_labels": text_input_labels, + "text_input_labels_with_mask": text_input_labels_with_mask, + "text_input_attentions": text_input_attentions, + "text_input_attentions_with_mask": text_input_attentions_with_mask, + "text_input_position_ids": text_input_position_ids, + "text_input_position_ids_with_mask": text_input_position_ids_with_mask, + } + return outputs + + def forward(self, image, grounding_image, ram_image, caption, language): + image = image + + image_embeds = self.vision_model(image)[0] + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + tags_english, tags_chinese = self.ram.generate_tag(ram_image) + + input_tags = [tag.replace(' |', ',').lower().strip() + "." 
if not tag.endswith(".") else tag.replace(' |', ',').lower().strip() for tag in tags_english] + + # outputs = self.grounding_dino(grounding_image[None], captions=input_tags) + outputs = self.grounding_dino(grounding_image, captions=input_tags) + logits = outputs["pred_logits"].sigmoid() # (bs, nq, 256) + boxes = outputs["pred_boxes"] # (bs, nq, 4) + detection_image_embeds = outputs["hidden_state"] # (bs, nq, 256) + detection_image_atts = torch.ones(detection_image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + bbox_outputs = self.generate_bbox_caption(logits, boxes, tags_english, tags_chinese, language) + + text_inputs_for_pretrain = self.prepare_inputs_for_pretrain(caption, bbox_outputs['bbox_caption_tokens'], bbox_outputs['bbox_caption_tokens_with_mask'], bbox_outputs['bbox_caption_labels_with_mask']) + + # if torch.distributed.get_rank() == 0: + # print(self.tokenizer.decode(text_inputs_for_pretrain["text_input_tokens_ids"][0])) + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_output = self.qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_atts, + use_cache=True, + return_dict=True, + ) + + image_feats = F.normalize( + self.vision_proj(query_output.last_hidden_state), dim=-1 + ) + + text_output = self.qformer.bert( + input_ids=text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda'), + position_ids=text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), + attention_mask=text_inputs_for_pretrain['text_input_attentions'].to('cuda'), + return_dict=True, + ) + text_feat = F.normalize( + self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1 + ) + + ###============== Image-text Contrastive ===================### + # print(image_feats.size()) + # print(text_feat.size()) + image_feats_all = concat_all_gather( + image_feats + ) # [batch_size*num_gpu, num_query_tokens, embed_dim] + text_feat_all = concat_all_gather(text_feat) # [batch_size*num_gpu, embed_dim] + + sim_q2t = torch.matmul( + image_feats.unsqueeze(1), text_feat_all.unsqueeze(-1) + ).squeeze() + # [batch_size, batch_size*num_gpu, num_query_tokens] + + # image-text similarity: aggregate across all query tokens + sim_i2t, _ = sim_q2t.max(-1) + sim_i2t = sim_i2t / self.temp + + # text-query similarity: [batch_size, batch_size*num_gpu, num_query_tokens] + sim_t2q = torch.matmul( + text_feat.unsqueeze(1).unsqueeze(1), image_feats_all.permute(0, 2, 1) + ).squeeze() + + # text-image similarity: aggregate across all query tokens + sim_t2i, _ = sim_t2q.max(-1) + sim_t2i = sim_t2i / self.temp # [batch_size, batch_size*num_gpu] + + rank = dist.get_rank() + bs = image.size(0) + targets = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to( + image.device + ) + + loss_itc = ( + F.cross_entropy(sim_i2t, targets, label_smoothing=0.1) + + F.cross_entropy(sim_t2i, targets, label_smoothing=0.1) + ) / 2 + + ###============== Image-text Matching ===================### + # print(text_inputs_for_pretrain['text_input_tokens_ids'].size()) + # print(text_inputs_for_pretrain['text_input_attentions'].size()) + # print(text_inputs_for_pretrain['text_input_position_ids'].size()) + # print(image_embeds) + # print(detection_image_embeds) + text_input_ids_world = concat_all_gather(text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda')) + text_attention_mask_world = 
concat_all_gather(text_inputs_for_pretrain['text_input_attentions'].to('cuda')) + text_position_ids_world = concat_all_gather(text_inputs_for_pretrain['text_input_position_ids'].to('cuda')) + image_embeds_world = all_gather_with_grad(image_embeds) + detection_image_embeds_world = all_gather_with_grad(detection_image_embeds) + with torch.no_grad(): + weights_t2i = F.softmax(sim_t2i, dim=1) + 1e-4 + weights_t2i[:, rank * bs: rank * bs + bs].fill_diagonal_(0) + weights_i2t = F.softmax(sim_i2t, dim=1) + 1e-4 + weights_i2t[:, rank * bs: rank * bs + bs].fill_diagonal_(0) + + # select a negative image for each text + image_embeds_neg = [] + detection_image_embeds_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_t2i[b], 1).item() + image_embeds_neg.append(image_embeds_world[neg_idx]) + detection_image_embeds_neg.append(detection_image_embeds_world[neg_idx]) + image_embeds_neg = torch.stack(image_embeds_neg, dim=0) + detection_image_embeds_neg = torch.stack(detection_image_embeds_neg, dim=0) + + # select a negative text for each image + text_ids_neg = [] + text_atts_neg = [] + text_position_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_i2t[b], 1).item() + text_ids_neg.append(text_input_ids_world[neg_idx]) + text_atts_neg.append(text_attention_mask_world[neg_idx]) + text_position_neg.append(text_position_ids_world[neg_idx]) + + text_ids_neg = torch.stack(text_ids_neg, dim=0) + text_atts_neg = torch.stack(text_atts_neg, dim=0) + text_position_neg = torch.stack(text_position_neg, dim=0) + + text_ids_all = torch.cat( + [text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda'), text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda'), text_ids_neg], dim=0 + ) # pos, pos, neg + text_atts_all = torch.cat( + [text_inputs_for_pretrain['text_input_attentions'].to('cuda'), text_inputs_for_pretrain['text_input_attentions'].to('cuda'), text_atts_neg], + dim=0, + ) + position_ids_all = torch.cat( + [text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), text_position_neg], + dim=0, + ) + + query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1) + query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long).to( + image.device + ) + attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1) + + image_embeds_all = torch.cat( + [image_embeds, image_embeds_neg, image_embeds], dim=0 + ) # pos, neg, pos + image_atts_all = torch.ones(image_embeds_all.size()[:-1], dtype=torch.long).to( + image.device + ) + + detection_image_embeds_all = torch.cat( + [detection_image_embeds, detection_image_embeds_neg, detection_image_embeds], dim=0 + ) # pos, neg, pos + detection_image_atts_all = torch.ones(detection_image_embeds_all.size()[:-1], dtype=torch.long).to( + image.device + ) + + output_itm = self.qformer.bert( + text_ids_all, + query_embeds=query_tokens_itm, + position_ids=position_ids_all, + attention_mask=attention_mask_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + detection_encoder_hidden_states=detection_image_embeds_all, + detection_encoder_attention_mask=detection_image_atts_all, + return_dict=True, + ) + + vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] + vl_output = self.itm_head(vl_embeddings) + logits = vl_output.mean(dim=1) + + itm_labels = torch.cat( + [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)], + dim=0, + ).to(image.device) + loss_itm = 
F.cross_entropy(logits, itm_labels) + + ##================= Image Captioning ========================## + decoder_input_ids = text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda').clone() + decoder_input_ids[:, 0] = self.tokenizer.bos_token_id + # labels = decoder_input_ids.masked_fill( + # decoder_input_ids == self.tokenizer.pad_token_id, -100 + # ) + print('text_input_tokens_ids:', text_inputs_for_pretrain['text_input_tokens_ids'][0]) + print('text_input_tokens:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_tokens_ids'][0])) + print('text_input_labels_ids:', text_inputs_for_pretrain['text_input_labels'][0]) + print('text_input_labels:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_labels'][0].masked_fill(text_inputs_for_pretrain['text_input_labels'][0]==-100, torch.tensor(0)))) + + decoder_labels = text_inputs_for_pretrain['text_input_labels'].to('cuda') + + decoder_query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( + image.device + ) + decoder_attention_mask = torch.cat([decoder_query_atts, text_inputs_for_pretrain['text_input_attentions'].to('cuda')], dim=1) + # print('decoder_input_ids:', decoder_input_ids.size()) + # print('decoder_labels:', decoder_labels.size()) + lm_output = self.qformer( + decoder_input_ids, + position_ids=text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), + attention_mask=decoder_attention_mask, + past_key_values=query_output.past_key_values, + return_dict=True, + labels=decoder_labels, + is_decoder=True, + ) + + loss_lm = lm_output.loss + + ##================= Mask Language Model ========================## + # encoder_input_ids = text_inputs_for_pretrain['text_input_tokens_ids_with_mask'].clone() + encoder_input_ids = text_inputs_for_pretrain['text_input_tokens_ids_with_mask'].to("cuda") + # encoder_input_ids[:, 0] = self.tokenizer.bos_token_id + print('text_input_tokens_ids_with_mask:', text_inputs_for_pretrain['text_input_tokens_ids_with_mask'][0]) + print('text_input_tokens_with_mask:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_tokens_ids_with_mask'][0])) + print('text_input_labels_ids_with_mask:', text_inputs_for_pretrain['text_input_labels_with_mask'][0]) + print('text_input_labels_with_mask:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_labels_with_mask'][0].masked_fill(text_inputs_for_pretrain['text_input_labels_with_mask'][0]==-100, torch.tensor(0)))) + + encoder_labels = text_inputs_for_pretrain['text_input_labels_with_mask'].to("cuda") + + encoder_query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( + image.device + ) + encoder_attention_mask = torch.cat([encoder_query_atts, text_inputs_for_pretrain['text_input_attentions_with_mask'].to("cuda")], dim=1) + mlm_output = self.qformer( + encoder_input_ids, + position_ids=text_inputs_for_pretrain["text_input_position_ids_with_mask"].to('cuda'), + attention_mask=encoder_attention_mask, + past_key_values=query_output.past_key_values, + return_dict=True, + labels=encoder_labels, + is_decoder=False, + ) + + loss_mlm = mlm_output.loss + # print(loss_itc) + # print(loss_itm) + # print(loss_lm) + # print(loss_mlm) + # print('mlm_label:', torch.sum(torch.where(text_inputs_for_pretrain['text_input_labels_with_mask']!=-100,1,0),dim=1)) + # print('lm_label:', torch.sum(torch.where(text_inputs_for_pretrain['text_input_labels']!=-100,1,0),dim=1)) + + return LyricsOutput( + loss=loss_itc + loss_itm + loss_lm + loss_mlm, + loss_itc=loss_itc, + loss_itm=loss_itm, + loss_lm=loss_lm, + loss_mlm=loss_mlm, 
+ ) + + +@torch.no_grad() +def concat_all_gather(tensor): + """ + Performs all_gather operation on the provided tensors. + *** Warning ***: torch.distributed.all_gather has no gradient. + """ + tensors_gather = [torch.ones_like(tensor) + for _ in range(torch.distributed.get_world_size())] + torch.distributed.all_gather(tensors_gather, tensor, async_op=False) + + output = torch.cat(tensors_gather, dim=0) + return output + + +class GatherLayer(torch.autograd.Function): + """ + Gather tensors from all workers with support for backward propagation: + This implementation does not cut the gradients as torch.distributed.all_gather does. + """ + + @staticmethod + def forward(ctx, x): + output = [ + torch.zeros_like(x) for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(output, x) + return tuple(output) + + @staticmethod + def backward(ctx, *grads): + all_gradients = torch.stack(grads) + torch.distributed.all_reduce(all_gradients) + return all_gradients[torch.distributed.get_rank()] + + +def all_gather_with_grad(tensors): + """ + Performs all_gather operation on the provided tensors. + Graph remains connected for backward grad computation. + """ + # Queue the gathered tensors + world_size = torch.distributed.get_world_size() + # There is no need for reduction in the single-proc case + if world_size == 1: + return tensors + + # tensor_all = GatherLayer.apply(tensors) + tensor_all = GatherLayer.apply(tensors) + + return torch.cat(tensor_all, dim=0) diff --git a/fengshen/models/Lyrics/ram/configs/med_config.json b/fengshen/models/Lyrics/ram/configs/med_config.json new file mode 100644 index 0000000..49d64f8 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/med_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30524, + "encoder_width": 768, + "add_cross_attention": true + } \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/configs/q2l_config.json b/fengshen/models/Lyrics/ram/configs/q2l_config.json new file mode 100644 index 0000000..a8eba56 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/q2l_config.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 4, + "num_hidden_layers": 2, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": true, + "add_tag_cross_attention": false + } \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/configs/swin/config_swinB_384.json b/fengshen/models/Lyrics/ram/configs/swin/config_swinB_384.json new file mode 100644 index 0000000..d2f3e07 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/swin/config_swinB_384.json @@ -0,0 +1,9 @@ +{ + "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", + "vision_width": 1024, + "image_res": 384, + "window_size": 12, + "embed_dim": 128, + "depths": [ 2, 2, 18, 2 ], + "num_heads": [ 4, 8, 16, 32 ] + } \ No 
newline at end of file diff --git a/fengshen/models/Lyrics/ram/configs/swin/config_swinL_384.json b/fengshen/models/Lyrics/ram/configs/swin/config_swinL_384.json new file mode 100644 index 0000000..e6443a2 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/swin/config_swinL_384.json @@ -0,0 +1,9 @@ +{ + "ckpt": "pretrain_model/swin_large_patch4_window12_384_22k.pth", + "vision_width": 1536, + "image_res": 384, + "window_size": 12, + "embed_dim": 192, + "depths": [ 2, 2, 18, 2 ], + "num_heads": [ 6, 12, 24, 48 ] + } \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/data/ram_tag_list.txt b/fengshen/models/Lyrics/ram/data/ram_tag_list.txt new file mode 100644 index 0000000..49c840b --- /dev/null +++ b/fengshen/models/Lyrics/ram/data/ram_tag_list.txt @@ -0,0 +1,4585 @@ +3D CG rendering +3D glasses +abacus +abalone +monastery +belly +academy +accessory +accident +accordion +acorn +acrylic paint +act +action +action film +activity +actor +adaptation +add +adhesive tape +adjust +adult +adventure +advertisement +antenna +aerobics +spray can +afro +agriculture +aid +air conditioner +air conditioning +air sock +aircraft cabin +aircraft model +air field +air line +airliner +airman +plane +airplane window +airport +airport runway +airport terminal +airship +airshow +aisle +alarm +alarm clock +mollymawk +album +album cover +alcohol +alcove +algae +alley +almond +aloe vera +alp +alpaca +alphabet +german shepherd +altar +amber +ambulance +bald eagle +American shorthair +amethyst +amphitheater +amplifier +amusement park +amusement ride +anchor +ancient +anemone +angel +angle +animal +animal sculpture +animal shelter +animation +animation film +animator +anime +ankle +anklet +anniversary +trench coat +ant +antelope +antique +antler +anvil +apartment +ape +app +app icon +appear +appearance +appetizer +applause +apple +apple juice +apple pie +apple tree +applesauce +appliance +appointment +approach +apricot +apron +aqua +aquarium +aquarium fish +aqueduct +arcade +arcade machine +arch +arch bridge +archaelogical excavation +archery +archipelago +architect +architecture +archive +archway +area +arena +argument +arm +armadillo +armband +armchair +armoire +armor +army +army base +army tank +array +arrest +arrow +art +art exhibition +art gallery +art print +art school +art studio +art vector illustration +artichoke +article +artifact +artist +artists loft +ash +ashtray +asia temple +asparagus +asphalt road +assemble +assembly +assembly line +association +astronaut +astronomer +athlete +athletic +atlas +atm +atmosphere +atrium +attach +fighter jet +attend +attraction +atv +eggplant +auction +audi +audio +auditorium +aurora +author +auto factory +auto mechanic +auto part +auto show +auto showroom +car battery +automobile make +automobile model +motor vehicle +autumn +autumn forest +autumn leave +autumn park +autumn tree +avatar +avenue +aviator sunglasses +avocado +award +award ceremony +award winner +shed +ax +azalea +baboon +baby +baby bottle +baby carriage +baby clothe +baby elephant +baby food +baby seat +baby shower +back +backdrop +backlight +backpack +backyard +bacon +badge +badger +badlands +badminton +badminton racket +bag +bagel +bagpipe +baguette +bait +baked goods +baker +bakery +baking +baking sheet +balance +balance car +balcony +ball +ball pit +ballerina +ballet +ballet dancer +ballet skirt +balloon +balloon arch +baseball player +ballroom +bamboo +bamboo forest +banana +banana bread +banana leaf +banana tree +band +band aid +bandage +headscarf +bandeau +bangs +bracelet 
+balustrade +banjo +bank +bank card +bank vault +banknote +banner +banquet +banquet hall +banyan tree +baozi +baptism +bar +bar code +bar stool +barbecue +barbecue grill +barbell +barber +barber shop +barbie +barge +barista +bark +barley +barn +barn owl +barn door +barrel +barricade +barrier +handcart +bartender +baseball +baseball base +baseball bat +baseball hat +baseball stadium +baseball game +baseball glove +baseball pitcher +baseball team +baseball uniform +basement +basil +basin +basket +basket container +basketball +basketball backboard +basketball coach +basketball court +basketball game +basketball hoop +basketball player +basketball stadium +basketball team +bass +bass guitar +bass horn +bassist +bat +bath +bath heater +bath mat +bath towel +swimwear +bathrobe +bathroom +bathroom accessory +bathroom cabinet +bathroom door +bathroom mirror +bathroom sink +toilet paper +bathroom window +batman +wand +batter +battery +battle +battle rope +battleship +bay +bay bridge +bay window +bayberry +bazaar +beach +beach ball +beach chair +beach house +beach hut +beach towel +beach volleyball +lighthouse +bead +beagle +beak +beaker +beam +bean +bean bag chair +beanbag +bear +bear cub +beard +beast +beat +beautiful +beauty +beauty salon +beaver +bed +bedcover +bed frame +bedroom +bedding +bedpan +bedroom window +bedside lamp +bee +beech tree +beef +beekeeper +beeper +beer +beer bottle +beer can +beer garden +beer glass +beer hall +beet +beetle +beige +clock +bell pepper +bell tower +belt +belt buckle +bench +bend +bengal tiger +bento +beret +berry +berth +beverage +bib +bibimbap +bible +bichon +bicycle +bicycle helmet +bicycle wheel +biker +bidet +big ben +bike lane +bike path +bike racing +bike ride +bikini +bikini top +bill +billard +billboard +billiard table +bin +binder +binocular +biology laboratory +biplane +birch +birch tree +bird +bird bath +bird feeder +bird house +bird nest +birdbath +bird cage +birth +birthday +birthday cake +birthday candle +birthday card +birthday party +biscuit +bishop +bison +bit +bite +black +black sheep +blackberry +blackbird +blackboard +blacksmith +blade +blanket +sports coat +bleacher +blender +blessing +blind +eye mask +flasher +snowstorm +block +blog +blood +bloom +blossom +blouse +blow +hair drier +blowfish +blue +blue artist +blue jay +blue sky +blueberry +bluebird +pig +board +board eraser +board game +boardwalk +boat +boat deck +boat house +paddle +boat ride +bobfloat +bobcat +body +bodyboard +bodybuilder +boiled egg +boiler +bolo tie +bolt +bomb +bomber +bonasa umbellu +bone +bonfire +bonnet +bonsai +book +book cover +bookcase +folder +bookmark +bookshelf +bookstore +boom microphone +boost +boot +border +Border collie +botanical garden +bottle +bottle cap +bottle opener +bottle screw +bougainvillea +boulder +bouquet +boutique +boutique hotel +bow +bow tie +bow window +bowl +bowling +bowling alley +bowling ball +bowling equipment +box +box girder bridge +box turtle +boxer +underdrawers +boxing +boxing glove +boxing ring +boy +brace +bracket +braid +brain +brake +brake light +branch +brand +brandy +brass +brass plaque +bread +breadbox +break +breakfast +seawall +chest +brewery +brick +brick building +wall +brickwork +wedding dress +bride +groom +bridesmaid +bridge +bridle +briefcase +bright +brim +broach +broadcasting +broccoli +bronze +bronze medal +bronze sculpture +bronze statue +brooch +creek +broom +broth +brown +brown bear +brownie +brunch +brunette +brush +coyote +brussels sprout +bubble +bubble gum +bubble tea +bucket cabinet +shield +bud 
+buddha +buffalo +buffet +bug +build +builder +building +building block +building facade +building material +lamp +bull +bulldog +bullet +bullet train +bulletin board +bulletproof vest +bullfighting +megaphone +bullring +bumblebee +bumper +roll +bundle +bungee +bunk bed +bunker +bunny +buoy +bureau +burial chamber +burn +burrito +bus +bus driver +bus interior +bus station +bus stop +bus window +bush +business +business card +business executive +business suit +business team +business woman +businessman +bust +butcher +butchers shop +butte +butter +cream +butterfly +butterfly house +button +buttonwood +buy +taxi +cabana +cabbage +cabin +cabin car +cabinet +cabinetry +cable +cable car +cactus +cafe +canteen +cage +cake +cake stand +calculator +caldron +calendar +calf +call +phone box +calligraphy +calm +camcorder +camel +camera +camera lens +camouflage +camp +camper +campfire +camping +campsite +campus +can +can opener +canal +canary +cancer +candle +candle holder +candy +candy bar +candy cane +candy store +cane +jar +cannon +canopy +canopy bed +cantaloupe +cantilever bridge +canvas +canyon +cap +cape +cape cod +cappuccino +capsule +captain +capture +car +car dealership +car door +car interior +car logo +car mirror +parking lot +car seat +car show +car wash +car window +caramel +card +card game +cardboard +cardboard box +cardigan +cardinal +cargo +cargo aircraft +cargo ship +caribbean +carnation +carnival +carnivore +carousel +carp +carpenter +carpet +slipper +house finch +coach +dalmatian +aircraft carrier +carrot +carrot cake +carry +cart +carton +cartoon +cartoon character +cartoon illustration +cartoon style +carve +case +cash +cashew +casino +casserole +cassette +cassette deck +plaster bandage +casting +castle +cat +cat bed +cat food +cat furniture +cat tree +catacomb +catamaran +catamount +catch +catcher +caterpillar +catfish +cathedral +cattle +catwalk +catwalk show +cauliflower +cave +caviar +CD +CD player +cedar +ceiling +ceiling fan +celebrate +celebration +celebrity +celery +cello +smartphone +cement +graveyard +centerpiece +centipede +ceramic +ceramic tile +cereal +ceremony +certificate +chain +chain saw +chair +chairlift +daybed +chalet +chalice +chalk +chamber +chameleon +champagne +champagne flute +champion +championship +chandelier +changing table +channel +chap +chapel +character sculpture +charcoal +charge +charger +chariot +charity +charity event +charm +graph +chase +chassis +check +checkbook +chessboard +checklist +cheer +cheerlead +cheese +cheeseburger +cheesecake +cheetah +chef +chemical compound +chemist +chemistry +chemistry lab +cheongsam +cherry +cherry blossom +cherry tomato +cherry tree +chess +chestnut +chicken +chicken breast +chicken coop +chicken salad +chicken wing +garbanzo +chiffonier +chihuahua +child +child actor +childs room +chile +chili dog +chimney +chimpanzee +chinaware +chinese cabbage +chinese garden +chinese knot +chinese rose +chinese tower +chip +chipmunk +chisel +chocolate +chocolate bar +chocolate cake +chocolate chip +chocolate chip cookie +chocolate milk +chocolate mousse +truffle +choir +kitchen knife +cutting board +chopstick +christmas +christmas ball +christmas card +christmas decoration +christmas dinner +christmas eve +christmas hat +christmas light +christmas market +christmas ornament +christmas tree +chrysanthemum +church +church tower +cider +cigar +cigar box +cigarette +cigarette case +waistband +cinema +photographer +cinnamon +circle +circuit +circuit board +circus +water tank +citrus fruit +city +city bus +city hall +city 
nightview +city park +city skyline +city square +city street +city wall +city view +clam +clarinet +clasp +class +classic +classroom +clavicle +claw +clay +pottery +clean +clean room +cleaner +cleaning product +clear +cleat +clementine +client +cliff +climb +climb mountain +climber +clinic +clip +clip art +clipboard +clipper +clivia +cloak +clogs +close-up +closet +cloth +clothe +clothing +clothespin +clothesline +clothing store +cloud +cloud forest +cloudy +clover +joker +clown fish +club +clutch +clutch bag +coal +coast +coat +coatrack +cob +cock +cockatoo +cocker +cockpit +roach +cocktail +cocktail dress +cocktail shaker +cocktail table +cocoa +coconut +coconut tree +coffee +coffee bean +coffee cup +coffee machine +coffee shop +coffeepot +coffin +cognac +spiral +coin +coke +colander +cold +slaw +collaboration +collage +collection +college student +sheepdog +crash +color +coloring book +coloring material +pony +pillar +comb +combination lock +comic +comedy +comedy film +comet +comfort +comfort food +comic book +comic book character +comic strip +commander +commentator +community +commuter +company +compass +compete +contest +competitor +composer +composition +compost +computer +computer box +computer chair +computer desk +keyboard +computer monitor +computer room +computer screen +computer tower +concept car +concert +concert hall +conch +concrete +condiment +condom +condominium +conductor +cone +meeting +conference center +conference hall +meeting room +confetti +conflict +confluence +connect +connector +conservatory +constellation +construction site +construction worker +contain +container +container ship +continent +profile +contract +control +control tower +convenience store +convention +conversation +converter +convertible +transporter +cook +cooking +cooking spray +cooker +cool +cooler +copper +copy +coral +coral reef +rope +corded phone +liquor +corgi +cork +corkboard +cormorant +corn +corn field +cornbread +corner +trumpet +cornice +cornmeal +corral +corridor +corset +cosmetic +cosmetics brush +cosmetics mirror +cosplay +costume +costumer film designer +infant bed +cottage +cotton +cotton candy +couch +countdown +counter +counter top +country artist +country house +country lane +country pop artist +countryside +coupe +couple +couple photo +courgette +course +court +courthouse +courtyard +cousin +coverall +cow +cowbell +cowboy +cowboy boot +cowboy hat +crab +crabmeat +crack +cradle +craft +craftsman +cranberry +crane +crape +crapper +crate +crater lake +lobster +crayon +cream cheese +cream pitcher +create +creature +credit card +crescent +croissant +crest +crew +cricket +cricket ball +cricket team +cricketer +crochet +crock pot +crocodile +crop +crop top +cross +crossbar +crossroad +crosstalk +crosswalk +crouton +crow +crowbar +crowd +crowded +crown +crt screen +crucifix +cruise +cruise ship +cruiser +crumb +crush +crutch +crystal +cub +cube +cucumber +cue +cuff +cufflink +cuisine +farmland +cup +cupcake +cupid +curb +curl +hair roller +currant +currency +curry +curtain +curve +pad +customer +cut +cutlery +cycle +cycling +cyclone +cylinder +cymbal +cypress +cypress tree +dachshund +daffodil +dagger +dahlia +daikon +dairy +daisy +dam +damage +damp +dance +dance floor +dance room +dancer +dandelion +dark +darkness +dart +dartboard +dashboard +date +daughter +dawn +day bed +daylight +deadbolt +death +debate +debris +decanter +deck +decker bus +decor +decorate +decorative picture +deer +defender +deity +delicatessen +deliver +demolition +monster +demonstration +den +denim jacket 
+dentist +department store +depression +derby +dermopathy +desert +desert road +design +designer +table +table lamp +desktop +desktop computer +dessert +destruction +detective +detergent +dew +dial +diamond +diaper +diaper bag +journal +die +diet +excavator +number +digital clock +dill +dinner +rowboat +dining room +dinner party +dinning table +dinosaur +dip +diploma +direct +director +dirt +dirt bike +dirt field +dirt road +dirt track +disaster +disciple +disco +disco ball +discotheque +disease +plate +dish antenna +dish washer +dishrag +dishes +dishsoap +Disneyland +dispenser +display +display window +trench +dive +diver +diving board +paper cup +dj +doberman +dock +doctor +document +documentary +dog +dog bed +dog breed +dog collar +dog food +dog house +doll +dollar +dollhouse +dolly +dolphin +dome +domicile +domino +donkey +donut +doodle +door +door handle +doormat +doorplate +doorway +dormitory +dough +downtown +dozer +drag +dragon +dragonfly +drain +drama +drama film +draw +drawer +drawing +drawing pin +pigtail +dress +dress hat +dress shirt +dress shoe +dress suit +dresser +dressing room +dribble +drift +driftwood +drill +drink +drinking water +drive +driver +driveway +drone +drop +droplight +dropper +drought +medicine +pharmacy +drum +drummer +drumstick +dry +duchess +duck +duckbill +duckling +duct tape +dude +duet +duffel +canoe +dumbbell +dumpling +dune +dunk +durian +dusk +dust +garbage truck +dustpan +duvet +DVD +dye +eagle +ear +earmuff +earphone +earplug +earring +earthquake +easel +easter +easter bunny +easter egg +eat +restaurant +eclair +eclipse +ecosystem +edit +education +educator +eel +egg +egg roll +egg tart +eggbeater +egret +Eiffel tower +elastic band +senior +electric chair +electric drill +electrician +electricity +electron +electronic +elephant +elevation map +elevator +elevator car +elevator door +elevator lobby +elevator shaft +embankment +embassy +embellishment +ember +emblem +embroidery +emerald +emergency +emergency service +emergency vehicle +emotion +Empire State Building +enamel +enclosure +side table +energy +engagement +engagement ring +engine +engine room +engineer +engineering +english shorthair +ensemble +enter +entertainer +entertainment +entertainment center +entrance +entrance hall +envelope +equestrian +equipment +eraser +erhu +erosion +escalator +escargot +espresso +estate +estuary +eucalyptus tree +evening +evening dress +evening light +evening sky +evening sun +event +evergreen +ewe +excavation +exercise +exhaust hood +exhibition +exit +explorer +explosion +extension cord +extinguisher +extractor +extrude +eye +eye shadow +eyebrow +eyeliner +fabric +fabric store +facade +face +face close-up +face powder +face towel +facial tissue holder +facility +factory +factory workshop +fair +fairground +fairy +falcon +fall +family +family car +family photo +family room +fan +fang +farm +farmer +farmer market +farmhouse +fashion +fashion accessory +fashion designer +fashion girl +fashion illustration +fashion look +fashion model +fashion show +fast food +fastfood restaurant +father +faucet +fault +fauna +fawn +fax +feast +feather +fedora +feed +feedbag +feeding +feeding chair +feline +mountain lion +fence +fender +fern +ferret +ferris wheel +ferry +fertilizer +festival +fiber +fiction +fiction book +field +field road +fig +fight +figure skater +figurine +file +file photo +file cabinet +fill +film camera +film director +film format +film premiere +film producer +filming +filter +fin +hand +finish line +fir +fir tree +fire +fire alarm +fire department +fire 
truck +fire escape +fire hose +fire pit +fire station +firecracker +fireman +fireplace +firework +firework display +first-aid kit +fish +fish boat +fish market +fish pond +fishbowl +fisherman +fishing +fishing boat +fishing net +fishing pole +fishing village +fitness +fitness course +five +fixture +fjord +flag +flag pole +flake +flame +flamingo +flannel +flap +flare +flash +flask +flat +flatfish +flavor +flea +flea market +fleet +flight +flight attendant +flip +flip-flop +flipchart +float +flock +flood +floor +floor fan +floor mat +floor plan +floor window +floral arrangement +florist +floss +flour +flow +flower +flower basket +flower bed +flower box +flower field +flower girl +flower market +fluid +flush +flute +fly +fly fishing +flyer +horse +foam +fog +foggy +foie gra +foil +folding chair +leaf +folk artist +folk dance +folk rock artist +fondant +hotpot +font +food +food coloring +food court +food processor +food stand +food truck +foosball +foot +foot bridge +football +football coach +football college game +football match +football field +football game +football helmet +football player +football stadium +football team +path +footprint +footrest +footstall +footwear +forbidden city +ford +forehead +forest +forest fire +forest floor +forest path +forest road +forge +fork +forklift +form +formal garden +formation +formula 1 +fort +fortification +forward +fossil +foundation +fountain +fountain pen +fox +frame +freckle +highway +lorry +French +French bulldog +French fries +French toast +freshener +fridge +fried chicken +fried egg +fried rice +friendship +frisbee +frog +frost +frosting +frosty +frozen +fruit +fruit cake +fruit dish +fruit market +fruit salad +fruit stand +fruit tree +fruits shop +fry +frying pan +fudge +fuel +fume hood +fun +funeral +fungi +funnel +fur +fur coat +furniture +futon +gadget +muzzle +galaxy +gallery +game +game board +game controller +ham +gang +garage +garage door +garage kit +garbage +garden +garden asparagus +garden hose +garden spider +gardener +gardening +garfield +gargoyle +wreath +garlic +garment +gas +gas station +gas stove +gasmask +collect +gathering +gauge +gazebo +gear +gecko +geisha +gel +general store +generator +geranium +ghost +gift +gift bag +gift basket +gift box +gift card +gift shop +gift wrap +gig +gin +ginger +gingerbread +gingerbread house +ginkgo tree +giraffe +girl +give +glacier +gladiator +glass bead +glass bottle +glass bowl +glass box +glass building +glass door +glass floor +glass house +glass jar +glass plate +glass table +glass vase +glass wall +glass window +glasses +glaze +glider +earth +glove +glow +glue pudding +go +go for +goal +goalkeeper +goat +goat cheese +gobi +goggles +gold +gold medal +Golden Gate Bridge +golden retriever +goldfish +golf +golf cap +golf cart +golf club +golf course +golfer +goose +gorilla +gothic +gourd +government +government agency +gown +graduate +graduation +grain +grampus +grand prix +grandfather +grandmother +grandparent +granite +granola +grape +grapefruit +wine +grass +grasshopper +grassland +grassy +grater +grave +gravel +gravestone +gravy +gravy boat +gray +graze +grazing +green +greenery +greet +greeting +greeting card +greyhound +grid +griddle +grill +grille +grilled eel +grind +grinder +grits +grocery bag +grotto +ground squirrel +group +group photo +grove +grow +guacamole +guard +guard dog +guest house +guest room +guide +guinea pig +guitar +guitarist +gulf +gull +gun +gundam +gurdwara +guzheng +gym +gymnast +habitat +hacker +hail +hair +hair color +hair spray +hairbrush +haircut +hairgrip 
+hairnet +hairpin +hairstyle +half +hall +halloween +halloween costume +halloween pumpkin +halter top +hamburg +hamburger +hami melon +hammer +hammock +hamper +hamster +hand dryer +hand glass +hand towel +handbag +handball +handcuff +handgun +handkerchief +handle +handsaw +handshake +handstand +handwriting +hanfu +hang +hangar +hanger +happiness +harbor +harbor seal +hard rock artist +hardback book +safety helmet +hardware +hardware store +hardwood +hardwood floor +mouth organ +pipe organ +harpsichord +harvest +harvester +hassock +hat +hatbox +hautboy +hawthorn +hay +hayfield +hazelnut +head +head coach +headlight +headboard +headdress +headland +headquarter +hearing +heart +heart shape +heat +heater +heather +hedge +hedgehog +heel +helicopter +heliport +helmet +help +hen +henna +herb +herd +hermit crab +hero +heron +hibiscus +hibiscus flower +hide +high bar +high heel +highland +highlight +hike +hiker +hiking boot +hiking equipment +hill +hill country +hill station +hillside +hindu temple +hinge +hip +hip hop artist +hippo +historian +historic +history +hockey +hockey arena +hockey game +hockey player +hockey stick +hoe +hole +vacation +holly +holothurian +home +home appliance +home base +home decor +home interior +home office +home theater +homework +hummus +honey +beehive +honeymoon +hood +hoodie +hook +jump +horizon +hornbill +horned cow +hornet +horror +horror film +horse blanket +horse cart +horse farm +horse ride +horseback +horseshoe +hose +hospital +hospital bed +hospital room +host +inn +hot +hot air balloon +hot dog +hot sauce +hot spring +hotel +hotel lobby +hotel room +hotplate +hourglass +house +house exterior +houseplant +hoverboard +howler +huddle +hug +hula hoop +person +humidifier +hummingbird +humpback whale +hunt +hunting lodge +hurdle +hurricane +husky +hut +hyaena +hybrid +hydrangea +hydrant +seaplane +ice +ice bag +polar bear +ice cave +icecream +ice cream cone +ice cream parlor +ice cube +ice floe +ice hockey player +ice hockey team +lollipop +ice maker +rink +ice sculpture +ice shelf +skate +ice skating +iceberg +icicle +icing +icon +id photo +identity card +igloo +light +iguana +illuminate +illustration +image +impala +incense +independence day +individual +indoor +indoor rower +induction cooker +industrial area +industry +infantry +inflatable boat +information desk +infrastructure +ingredient +inhalator +injection +injury +ink +inking pad +inlet +inscription +insect +install +instrument +insulated cup +interaction +interior design +website +intersection +interview +invertebrate +invitation +ipad +iphone +ipod +iris +iron +ironing board +irrigation system +island +islet +isopod +ivory +ivy +izakaya +jack +jackcrab +jacket +jacuzzi +jade +jaguar +jail cell +jam +japanese garden +jasmine +jaw +jay +jazz +jazz artist +jazz fusion artist +jeans +jeep +jelly +jelly bean +jellyfish +jet +motorboat +jewel +jewellery +jewelry shop +jigsaw puzzle +rickshaw +jockey +jockey cap +jog +joint +journalist +joystick +judge +jug +juggle +juice +juicer +jujube +jump rope +jumpsuit +jungle +junkyard +kale +kaleidoscope +kangaroo +karaoke +karate +karting +kasbah +kayak +kebab +key +keycard +khaki +kick +kilt +kimono +kindergarden classroom +kindergarten +king +king crab +kiss +kit +kitchen +kitchen cabinet +kitchen counter +kitchen floor +kitchen hood +kitchen island +kitchen sink +kitchen table +kitchen utensil +kitchen window +kitchenware +kite +kiwi +knee pad +kneel +knife +rider +knit +knitting needle +knob +knocker +knot +koala +koi +ktv +laboratory +lab coat +label +labrador 
+maze +lace +lace dress +ladder +ladle +ladybird +lagoon +lake +lake district +lake house +lakeshore +lamb +lamb chop +lamp post +lamp shade +spear +land +land vehicle +landfill +landing +landing deck +landmark +landscape +landslide +lanyard +lantern +lap +laptop +laptop keyboard +larva +lasagne +laser +lash +lasso +latch +latex +latte +laugh +launch +launch event +launch party +laundromat +laundry +laundry basket +laundry room +lava +lavender +lawn +lawn wedding +lawyer +lay +lead +lead singer +lead to +leader +leak +lean +learn +leash +leather +leather jacket +leather shoe +speech +lecture hall +lecture room +ledge +leftover +leg +legend +legging +legislative chamber +lego +legume +lemon +lemon juice +lemonade +lemur +lens +lens flare +lentil +leopard +leotard +tights +leprechaun +lesson +letter +mailbox +letter logo +lettering +lettuce +level +library +license +license plate +lichen +lick +lid +lie +life belt +life jacket +lifeboat +lifeguard +lift +light fixture +light show +light switch +lighting +lightning +lightning rod +lilac +lily +limb +lime +limestone +limo +line +line art +line up +linen +liner +lion +lip balm +lipstick +liquid +liquor store +list +litchi +live +livestock +living room +living space +lizard +load +loading dock +loafer +hallway +locate +lock +lock chamber +locker +loft +log +log cabin +logo +loki +long hair +longboard +loom +loop +lose +lottery +lotus +love +loveseat +luggage +lumber +lumberjack +lunch +lunch box +lush +luxury +luxury yacht +mac +macadamia +macaque +macaroni +macaw +machete +machine +machine gun +magazine +magic +magician +magnet +magnifying glass +magnolia +magpie +mahjong +mahout +maid +chain mail +mail slot +make +makeover +makeup artist +makeup tool +mallard +mallard duck +mallet +mammal +mammoth +man +management +manager +manatee +mandala +mandarin orange +mandarine +mane +manga +manger +mango +mangosteen +mangrove +manhattan +manhole +manhole cover +manicure +mannequin +manor house +mansion +mantid +mantle +manufactured home +manufacturing +manuscript +map +maple +maple leaf +maple syrup +maraca +marathon +marble +march +marching band +mare +marigold +marine +marine invertebrate +marine mammal +puppet +mark +market +market square +market stall +marriage +martial +martial artist +martial arts gym +martini +martini glass +mascara +mascot +mashed potato +masher +mask +massage +mast +mat +matador +match +matchbox +material +mattress +mausoleum +maxi dress +meal +measuring cup +measuring tape +meat +meatball +mechanic +mechanical fan +medal +media +medical equipment +medical image +medical staff +medicine cabinet +medieval +medina +meditation +meerkat +meet +melon +monument +menu +mermaid +net +mess +messenger bag +metal +metal artist +metal detector +meter +mezzanine +microphone +microscope +microwave +midnight +milestone +military uniform +milk +milk can +milk tea +milkshake +mill +mine +miner +mineral +mineral water +miniskirt +miniature +minibus +minister +minivan +mint +mint candy +mirror +miss +missile +mission +mistletoe +mix +mixer +mixing bowl +mixture +moat +mobility scooter +model +model car +modern +modern tower +moisture +mold +molding +mole +monarch +money +monitor +monk +monkey +monkey wrench +monochrome +monocycle +monster truck +moon +moon cake +moonlight +moor +moose +swab +moped +morning +morning fog +morning light +morning sun +mortar +mosaic +mosque +mosquito +moss +motel +moth +mother +motherboard +motif +sport +motor +motorbike +motorcycle +motorcycle helmet +motorcycle racer +motorcyclist +motorsport +mound +mountain 
+mountain bike +mountain biker +mountain biking +mountain gorilla +mountain lake +mountain landscape +mountain pass +mountain path +mountain range +mountain river +mountain snowy +mountain stream +mountain view +mountain village +mountaineer +mountaineering bag +mouse +mousepad +mousetrap +mouth +mouthwash +move +movie poster +movie ticket +mower +mp3 player +mr +mud +muffin +mug +mulberry +mulch +mule +municipality +mural +muscle +muscle car +museum +mushroom +music +music festival +music stool +music studio +music video performer +musical keyboard +musician +mussel +mustard +mythology +nacho +nail polish +nailfile +nanny +napkin +narrow +national flag +nativity scene +natural history museum +nature +nature reserve +navigation +navratri +navy +nebula +neck +neckband +necklace +neckline +nectar +nectarine +needle +neighbor +neighbourhood +neon +neon light +nerve +nest +new year +newborn +newfoundland +newlywed +news +news conference +newsstand +night +night market +night sky +night view +nightclub +nightstand +noodle +nose +noseband +note +notebook +notepad +notepaper +notice +number icon +nun +nurse +nursery +nursing home +nut +nutcracker +oak +oak tree +oar +oasis +oast house +oatmeal +oats +obelisk +observation tower +observatory +obstacle course +sea +octopus +offer +office +office building +office chair +office cubicle +office desk +office supply +office window +officer +official +oil +oil lamp +oil painting +oilrig +okra +old photo +olive +olive oil +olive tree +omelet +onion +onion ring +opal +open +opening +opening ceremony +opera +opera house +operate +operating room +operation +optical shop +orangutan +orange +orange juice +orange tree +orangery +orbit +orchard +orchestra pit +orchid +order +organization +origami +ornament +osprey +ostrich +otter +out +outcrop +outdoor +outhouse +electric outlet +outline +oval +oven +overall +overcoat +overpass +owl +oyster +teething ring +pack +package +paddock +police van +padlock +paella +pagoda +pain +paint brush +painter +paisley bandanna +palace +palette +paling +pall +palm tree +pan +pancake +panda +panel +panorama +pansy +pant +pantry +pants +pantyhose +papaya +paper +paper bag +paper cutter +paper lantern +paper plate +paper towel +paperback book +paperweight +parachute +parade +paradise +parrot +paramedic +paraquet +parasail +paratrooper +parchment +parish +park +park bench +parking +parking garage +parking meter +parking sign +parliament +parsley +participant +partner +partridge +party +party hat +pass +passage +passbook +passenger +passenger ship +passenger train +passion fruit +passport +pasta +paste +pastry +pasture +patch +patient +pattern +pavement +pavilion +paw +pay +payphone +pea +peace +peach +peacock +peak +peanut +peanut butter +pear +pearl +pebble +pecan +pedestrian +pedestrian bridge +pedestrian street +peel +peeler +pegboard +pegleg +pelican +pen +penalty kick +pencil +pencil case +pencil sharpener +pencil skirt +pendant +pendulum +penguin +peninsula +pennant +penny +piggy bank +peony +pepper +pepper grinder +peppercorn +pepperoni +perch +perform +performance +performance arena +perfume +pergola +persian cat +persimmon +personal care +personal flotation device +pest +pet +pet shop +pet store +petal +petunia +church bench +pheasant +phenomenon +philosopher +phone +phonebook +record player +photo +photo booth +photo frame +photography +physicist +physics laboratory +pianist +piano +plectrum +pick up +pickle +picnic +picnic area +picnic basket +picnic table +picture +picture frame +pie +pigeon +pilgrim +tablet +pillow +pilot 
+pilot boat +pin +pine +pine cone +pine forest +pine nut +pineapple +table tennis table +table tennis +pink +pint +pipa +pipe +pipe bowl +pirate +pirate flag +pirate ship +pistachio +ski slope +pocket bread +pitaya +pitbull +pitch +pitcher +pitcher plant +pitchfork +pizza +pizza cutter +pizza pan +pizzeria +placard +place +place mat +plaid +plain +plan +planet +planet earth +plank +plant +plantation +planting +plaque +plaster +plastic +plasticine +plateau +platform +platinum +platter +play +play badminton +play baseball +play basketball +play billiard +play football +play pong +play tennis +play volleyball +player +playground +playhouse +playing card +playing chess +playing golf +playing mahjong +playingfield +playpen +playroom +plaza +plier +plot +plow +plug +plug hat +plum +plumber +plumbing fixture +plume +plywood +pocket +pocket watch +pocketknife +pod +podium +poetry +poinsettia +point +pointer +poker card +poker chip +poker table +pole +polecat +police +police car +police dog +police station +politician +polka dot +pollen +pollution +polo +polo neck +polo shirt +pomegranate +pomeranian +poncho +pond +ponytail +poodle +pool +pop +pop artist +popcorn +pope +poppy +porcelain +porch +pork +porridge +portable battery +portal +portfolio +porthole +portrait +portrait session +pose +possum +post +post office +stamp +postcard +poster +poster page +pot +potato +potato chip +potato salad +potholder +potty +pouch +poultry +pound +pour +powder +power line +power plugs and sockets +power see +power station +practice +Prague Castle +prayer +preacher +premiere +prescription +show +presentation +president +press room +pressure cooker +pretzel +prince +princess +print +printed page +printer +printing +prison +produce +product +profession +professional +professor +project picture +projection screen +projector +prom +promenade +propeller +prophet +proposal +protective suit +protest +protester +publication +publicity portrait +ice hockey +pudding +puddle +puff +puffin +pug +pull +pulpit +pulse +pump +pumpkin +pumpkin pie +pumpkin seed +punch bag +punch +student +purple +push +putt +puzzle +tower +pyramid +python +qr code +quail +quarry +quarter +quartz +queen +quesadilla +queue +quiche +quilt +quilting +quote +rabbit +raccoon +race +race track +raceway +race car +racket +radar +radiator +radio +raft +rag doll +rail +railcar +railroad +railroad bridge +railway line +railway station +rain +rain boot +rainbow +rainbow trout +raincoat +rainforest +rainy +raisin +rake +ram +ramp +rapeseed +rapid +rapper +raspberry +rat +ratchet +raven +ravine +ray +razor +razor blade +read +reading +reamer +rear +rear light +rear view +rearview mirror +receipt +receive +reception +recipe +record +record producer +recorder +recording studio +recreation room +recreational vehicle +rectangle +recycling +recycling bin +red +red carpet +red flag +red panda +red wine +redwood +reed +reef +reel +referee +reflect +reflection +reflector +register +rein +reindeer +relax +release +relief +religion +religious +relish +remain +remodel +remote +remove +repair +repair shop +reptile +rescue +rescuer +research +researcher +reservoir +residence +residential neighborhood +resin +resort +resort town +restaurant kitchen +restaurant patio +restroom +retail +retriever +retro +reveal +rhinoceros +rhododendron +rib +ribbon +rice +rice cooker +rice field +ride +ridge +riding +rifle +rim +ring +riot +ripple +rise +rise building +river +river bank +river boat +river valley +riverbed +road +road sign +road trip +roadside +roast chicken +robe +robin 
+robot +stone +rock arch +rock artist +rock band +rock climber +rock climbing +rock concert +rock face +rock formation +rocker +rocket +rocking chair +rocky +rodent +rodeo +rodeo arena +roe +roe deer +roller +coaster +roller skate +roller skates +rolling pin +romance +romantic +roof +roof garden +room +room divider +root +root beer +rope bridge +rosary +rose +rosemary +rosy cloud +rottweiler +round table +router +row +rowan +royal +rubber stamp +rubble +rubik's cube +ruby +ruffle +rugby +rugby ball +rugby player +ruins +ruler +rum +run +runner +running shoe +rural +rust +rustic +rye +sack +saddle +saddlebag +safari +safe +safety vest +sage +sail +sailboat +sailing +sailor +squirrel monkey +sake +salad +salad bowl +salamander +salami +sale +salmon +salon +salsa +salt +salt and pepper shakers +salt lake +salt marsh +salt shaker +salute +samoyed +samurai +sand +sand bar +sand box +sand castle +sand sculpture +sandal +sandwich +sanitary napkin +santa claus +sapphire +sardine +sari +sashimi +satay +satchel +satellite +satin +sauce +saucer +sauna +sausage +savanna +saw +sawbuck +sax +saxophonist +scaffold +scale +scale model +scallop +scar +strawman +scarf +scene +scenery +schnauzer +school +school bus +school uniform +schoolhouse +schooner +science +science fiction film +science museum +scientist +scissors +wall lamp +scone +scoop +scooter +score +scoreboard +scorpion +scout +scrambled egg +scrap +scraper +scratch +screen +screen door +screenshot +screw +screwdriver +scroll +scrub +scrubbing brush +sculptor +sculpture +sea cave +sea ice +sea lion +sea turtle +sea urchin +seabass +seabed +seabird +seafood +seahorse +seal +sea view +seashell +seaside resort +season +seat +seat belt +seaweed +secretary +security +sedan +see +seed +seesaw +segway +selfie +sell +seminar +sense +sensor +server +server room +service +set +sewing machine +shadow +shake +shaker +shampoo +shape +share +shark +sharpener +sharpie +shaver +shaving cream +shawl +shear +shears +sheep +sheet +sheet music +shelf +shell +shellfish +shelter +shelve +shepherd +sherbert +shiba inu +shine +shipping +shipping container +shipwreck +shipyard +shirt +shirtless +shoal +shoe +shoe box +shoe shop +shoe tree +shoot +shooting basketball guard +shop window +shopfront +shopper +shopping +shopping bag +shopping basket +shopping cart +mall +shopping street +shore +shoreline +short +short hair +shorts +shot glass +shotgun +shoulder +shoulder bag +shovel +showcase +shower +shower cap +shower curtain +shower door +shower head +shredder +shrew +shrimp +shrine +shrub +shutter +siamese +siberia +sibling +side +side cabinet +side dish +sidecar +sideline +siding +sign +signage +signal +signature +silk +silk stocking +silo +silver +silver medal +silverware +sing +singe +singer +sink +sip +sit +sitting +skate park +skateboard +skateboarder +skater +skating rink +skeleton +sketch +skewer +ski +ski boot +ski equipment +ski jacket +ski lift +ski pole +ski resort +snowboard +skier +skiing shoes +skin +skull +skullcap +sky +sky tower +skylight +skyline +skyscraper +slalom +slate +sleigh +sleep +sleeping bag +sleepwear +sleeve +slice +slide +slider +sling +slope +slot +slot machine +sloth +slow cooker +slug +slum +smell +smile +smoke +snack +snail +snake +snapper +snapshot +snorkel +snout +snow +snow leopard +snow mountain +snowball +snowboarder +snowfield +snowflake +snowman +snowmobile +snowplow +snowshoe +snowy +soap +soap bubble +soap dispenser +soccer goalkeeper +socialite +sock +socket +soda +softball +software +solar battery +soldier +solo +solution 
+sombrero +song +sound +soup +soup bowl +soupspoon +sour cream +souvenir +soybean milk +spa +space +space shuttle +space station +spacecraft +spaghetti +span +wrench +spark +sparkle +sparkler +sparkling wine +sparrow +spatula +speaker +spectator +speech bubble +speed limit +speed limit sign +speedboat +speedometer +sphere +spice +spice rack +spider +spider web +spike +spin +spinach +spire +splash +sponge +spoon +sport association +sport equipment +sport team +sports ball +sports equipment +sports meet +sportswear +dot +spray +spread +spring +spring roll +sprinkle +sprinkler +sprout +spruce +spruce forest +squad +square +squash +squat +squeeze +squid +squirrel +water gun +stab +stable +stack +stadium +staff +stage +stage light +stagecoach +stain +stainless steel +stair +stairs +stairwell +stall +stallion +stand +standing +staple +stapler +star +stare +starfish +starfruit +starling +state park +state school +station +stationary bicycle +stationery +statue +steak +steak knife +steam +steam engine +steam locomotive +steam train +steamed bread +steel +steering wheel +stem +stencil +step stool +stereo +stethoscope +stew +stick +stick insect +sticker +still life +stilt +stingray +stir +stirrer +stirrup +sew +stock +stocking +stomach +stone building +stone carving +stone house +stone mill +stool +stop +stop at +stop light +stop sign +stop watch +traffic light +storage box +storage room +tank +store +storefront +stork +storm +storm cloud +stormy +stove +poker +straddle +strainer +strait +strap +straw +straw hat +strawberry +stream +street art +street artist +street corner +street dog +street food +street light +street market +street photography +street scene +street sign +street vendor +stretch +stretcher +strike +striker +string +string cheese +strip +stripe +stroll +structure +studio +studio shot +stuff +stuffed animal +stuffed toy +stuffing +stump +stunning +stunt +stupa +style +stylus +submarine +submarine sandwich +submarine water +suburb +subway +subway station +subwoofer +succulent +suede +sugar +sugar bowl +sugar cane +sugar cube +suit +suite +summer +summer evening +summit +sun +sun hat +sunbathe +sunday +sundial +sunflower +sunflower field +sunflower seed +sunglasses +sunny +sunrise +sunset +sunshade +sunshine +super bowl +sports car +superhero +supermarket +supermarket shelf +supermodel +supporter +surf +surface +surfboard +surfer +surgeon +surgery +surround +sushi +sushi bar +suspenders +suspension +suspension bridge +suv +swallow +swallowtail butterfly +swamp +swan +swan boat +sweat pant +sweatband +sweater +sweatshirt +sweet +sweet potato +swim +swim cap +swimmer +swimming hole +swimming pool +swing +swing bridge +swinge +swirl +switch +swivel chair +sword +swordfish +symbol +symmetry +synagogue +syringe +syrup +system +t shirt +t-shirt +tabasco sauce +tabby +table tennis racket +table top +tablecloth +tablet computer +tableware +tachometer +tackle +taco +tae kwon do +tai chi +tail +tailor +take +takeoff +talk +tambourine +tan +tangerine +tape +tapestry +tarmac +taro +tarp +tart +tassel +taste +tatami +tattoo +tattoo artist +tavern +tea +tea bag +tea party +tea plantation +tea pot +tea set +teach +teacher +teacup +teal +team photo +team presentation +tear +technician +technology +teddy +tee +teenager +telegraph pole +zoom lens +telescope +television +television camera +television room +television studio +temperature +temple +tempura +tennis +tennis court +tennis match +tennis net +tennis player +tennis racket +tent +tequila +terminal +terrace +terrain +terrarium +territory +test +test 
match +test tube +text +text message +textile +texture +thanksgiving +thanksgiving dinner +theater +theatre actor +therapy +thermometer +thermos +thermos bottle +thermostat +thicket +thimble +thing +thinking +thistle +throne +throne room +throw +throw pillow +thunder +thunderstorm +thyme +tiara +tick +ticket +ticket booth +tide pool +tie +tiger +tight +tile +tile flooring +tile roof +tile wall +tin +tinfoil +tinsel +tiramisu +tire +tissue +toast +toaster +tobacco +tobacco pipe +toddler +toe +tofu +toilet bowl +toilet seat +toiletry +tokyo tower +tomato +tomato sauce +tomato soup +tomb +tong +tongs +tool +toolbox +toothbrush +toothpaste +toothpick +topiary garden +topping +torch +tornado +tortilla +tortoise +tote bag +totem pole +totoro +toucan +touch +touchdown +tour +tour bus +tour guide +tourist +tourist attraction +tournament +tow truck +towel +towel bar +tower block +tower bridge +town +town square +toy +toy car +toy gun +toyshop +track +tractor +trade +tradition +traditional +traffic +traffic cone +traffic congestion +traffic jam +traffic sign +trail +trailer +trailer truck +train +train bridge +train car +train interior +train track +train window +trainer +training +training bench +training ground +trolley +trampoline +transformer +transparency +travel +tray +treadmill +treat +tree +tree branch +tree farm +tree frog +tree house +tree root +tree trunk +trial +triangle +triathlon +tribe +tributary +trick +tricycle +trim +trio +tripod +trombone +troop +trophy +trophy cup +tropic +trout +truck +truck driver +tub +tube +tugboat +tulip +tuna +tundra +tunnel +turbine +turkey +turn +turnip +turquoise +turret +turtle +tusk +tv actor +tv cabinet +tv drama +tv genre +tv personality +tv show +tv sitcom +tv tower +twig +twilight +twin +twine +twist +type +type on +typewriter +ukulele +ultraman +umbrella +underclothes +underwater +unicorn +uniform +universe +university +up +urban +urinal +urn +use +utensil +utility room +vacuum +valley +valve +vampire +van +vanilla +vanity +variety +vase +vault +vector cartoon illustration +vector icon +vegetable +vegetable garden +vegetable market +vegetation +vehicle +veil +vein +velvet +vending machine +vendor +vent +vespa +vessel +vest +vet +veteran +veterinarians office +viaduct +video +video camera +video game +videotape +view mirror +vigil +villa +village +vine +vinegar +vineyard +violence +violet +violin +violinist +violist +vision +visor +vodka +volcano +volleyball +volleyball court +volleyball player +volunteer +voyage +vulture +waffle +waffle iron +wagon +wagon wheel +waist +waiter +waiting hall +waiting room +walk +walking +walking cane +wall clock +wallpaper +walnut +walrus +war +warehouse +warm +warning sign +warrior +warship +warthog +wash +washer +washing +washing machine +wasp +waste +waste container +watch +water +water bird +water buffalo +water cooler +water drop +water feature +water heater +water level +water lily +water park +water pipe +water purifier +water ski +water sport +water surface +water tower +watercolor +watercolor illustration +watercolor painting +waterfall +watering can +watermark overlay stamp +watermelon +waterproof jacket +waterway +wave +wax +weapon +wear +weather +vane +web +webcam +wedding +wedding ring +wedding bouquet +wedding cake +wedding couple +wedding invitation +wedding party +wedding photo +wedding photographer +wedding photography +wedding reception +wedge +weed +weight +weight scale +welder +well +western food +western restaurant +wet +wet bar +wet suit +wetland +wetsuit +whale +whale shark +wheat +wheat field 
+wheel +wheelchair +wheelie +whipped cream +whisk +whisker +whiskey +whistle +white +white house +white wine +whiteboard +wicket +wide +wield +wig +Wii +Wii controller +wild +wildebeest +wildfire +wildflower +wildlife +willow +wind +wind chime +wind farm +wind turbine +windmill +window +window box +window display +window frame +window screen +window seat +window sill +wiper +windshield +windy +wine bottle +wine cooler +wine cabinet +wine cellar +wine glass +wine rack +wine tasting +winery +wing +winter +winter melon +winter morning +winter scene +winter sport +winter storm +wire +wisteria +witch +witch hat +wok +wolf +woman +wood +wood duck +wood floor +wood wall +wood-burning stove +wooden spoon +woodland +woodpecker +woodworking plane +wool +job +work card +workbench +worker +workplace +workshop +world +worm +worship +wound +wrap +wrap dress +wrapping paper +wrestle +wrestler +wrinkle +wristband +write +writer +writing +writing brush +writing desk +yacht +yak +yard +yellow +yoga +yoga mat +yoghurt +yoke +yolk +youth +youth hostel +yurt +zebra +zebra crossing +zen garden +zip +zipper +zombie +zongzi +zoo \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/data/ram_tag_list_chinese.txt b/fengshen/models/Lyrics/ram/data/ram_tag_list_chinese.txt new file mode 100644 index 0000000..b21cc5f --- /dev/null +++ b/fengshen/models/Lyrics/ram/data/ram_tag_list_chinese.txt @@ -0,0 +1,4585 @@ +三维CG渲染 +3d眼镜 +算盘 +鲍鱼 +修道院 +肚子 +学院 +附件 +事故 +手风琴 +橡子 +丙烯颜料 +表演 +行动 +动作电影 +活动 +演员 +改编本 +添加 +胶带 +调整 +成人 +冒险 +广告 +天线 +有氧运动 +喷雾罐 +爆炸头 +农业 +帮助 +空调 +空调系统 +风向标 +飞机客舱 +飞机模型 +机场 +航线 +客机 +飞行员 +飞机 +飞机窗口 +机场 +机场跑道 +航站楼 +飞艇 +航展 +过道 +警报 +闹钟 +信天翁 +唱片 +唱片封面 +酒精 +壁龛 +水藻 +胡同/球道 +杏仁 +芦荟 +高山 +羊驼 +字母表 +德国牧羊犬 +圣坛 +琥珀 +救护车 +秃鹰 +美国短毛猫 +紫水晶 +圆形剧场 +扩音器 +游乐园 +游乐设施 +锚 +古老的 +海葵 +天使 +角 +动物 +动物雕塑 +动物收容所 +动画片 +动画电影 +动画师 +动漫 +脚踝 +短袜 +周年庆 +风衣 +蚂蚁 +羚羊 +古董 +鹿角 +铁砧 +公寓 +猿 +应用程序 +应用图标 +出现 +外观 +开胃菜 +掌声 +苹果 +苹果汁 +苹果派 +苹果树 +苹果酱 +设备 +约定 +通道 +杏子 +围裙 +浅绿色 +水族馆 +观赏鱼 +渡槽 +游乐中心 +商场游戏机 +拱门 +拱桥 +考古现场 +射箭 +群岛 +建筑师 +建筑设计 +档案 +拱门 +地区 +竞技场 +争论 +手臂 +穿山甲 +臂章 +扶手椅 +衣柜 +盔甲 +军队 +军事基地 +坦克 +阵列 +逮捕 +箭头 +艺术 +艺术展 +美术馆 +艺术印刷品 +艺术学校 +艺术工作室 +艺术矢量插图 +洋蓟 +文章 +手工艺品 +艺术家 +艺术阁楼 +灰 +烟灰缸 +亚洲寺庙 +芦笋 +沥青道路 +组装 +集会 +生产流水线 +协会 +宇航员 +天文学家 +运动员 +运动 +地图集 +自助取款机 +大气层 +中庭 +连接 +战斗机 +参加 +吸引力 +全地形车 +茄子 +拍卖 +奥迪汽车 +音频 +礼堂 +极光 +作者 +汽车厂 +汽车修理工 +汽车零件 +车展 +汽车展厅 +汽车电池 +汽车制造 +汽车模型 +汽车 +秋天 +秋天的森林 +秋天的叶子 +秋天的公园 +秋天的树 +阿凡达 +林荫大道 +飞行员太阳镜 +牛油果 +奖品 +颁奖典礼 +获奖者 +棚 +斧头 +杜鹃花 +狒狒 +婴儿 +奶瓶 +婴儿车 +婴儿衣服 +小象 +婴儿食品 +婴儿座椅 +迎婴派对 +背后/后面 +背景 +背光 +背包 +后院 +培根 +徽章 +獾 +荒地 +羽毛球运动 +羽毛球拍 +袋子 +面包圈 +风笛 +法棍 +诱饵 +焙烤食品 +面包师 +面包店 +烘焙 +烤盘 +平衡 +平衡车 +阳台 +球 +球池 +芭蕾舞女演员 +芭蕾舞 +芭蕾舞演员 +芭蕾舞裙 +气球 +气球拱门 +棒球手 +舞厅 +竹子 +竹林 +香蕉 +香蕉面包 +香蕉叶子 +香蕉树 +乐队 +创可贴 +绷带 +头巾 +束发带 +刘海 +手镯 +栏杆 +五弦琴 +银行 +银行卡 +银行金库 +纸币 +横幅/旗帜 +宴会 +宴会厅 +榕树 +包子 +洗礼 +酒吧 +条形码 +高脚凳 +烧烤 +烧烤架 +杠铃 +理发师 +理发店 +芭比娃娃 +驳船 +咖啡师 +树皮 +大麦 +谷仓 +仓鸮 +挡光板 +桶 +路障 +屏障 +手推车 +酒保 +棒球 +棒球基地 +棒球棒 +棒球帽 +棒球场 +棒球比赛 +棒球手套 +棒球投手 +棒球队 +棒球制服 +地下室 +罗勒 +水盆 +篮子 +篮子 +篮球 +篮球篮板 +篮球教练 +篮球场 +篮球比赛 +篮球框 +篮球运动员 +篮球馆 +篮球队 +贝斯 +低音吉他 +低音喇叭 +贝斯手 +球棒/球拍 +浴室 +水浴加热器 +浴垫 +浴巾 +泳装 +浴袍 +浴室 +浴室配件 +浴室柜 +浴室门 +浴室镜子 +浴室水槽 +卫生纸 +浴室窗户 +蝙蝠侠 +棒子 +接连猛打/击球员 +电池 +战斗 +战绳 +战舰 +海湾 +海湾大桥 +凸窗 +杨梅 +集市 +海滩 +沙滩球 +沙滩椅 +海滨别墅 +海滩小屋 +沙滩毛巾 +沙滩排球 +灯塔 +珠子 +比格犬 +鸟嘴 +烧杯 +横梁 +豆子 +豆袋椅 +豆袋 +熊 +幼熊 +胡子 +野兽 +击打/击败 +美丽的 +美丽 +美容院 +海狸 +床 +床单 +床架 +卧室 +床上用品 +便盆 +卧室窗户 +床头灯 +蜜蜂 +山毛榉 +牛肉 +养蜂人 +蜂鸣器 +啤酒 +啤酒瓶 +啤酒罐 +啤酒花园 +啤酒杯 +啤酒馆 +甜菜 +甲虫 +米色 +时钟 +甜椒 +钟楼 +皮带 +皮带扣 +长凳 +弯曲 +孟加拉虎 +盒饭 +贝雷帽 +浆果 +停泊位 +饮料 +围嘴 +拌饭 +圣经 +比熊 +自行车 +自行车头盔 +自行车车轮 +自行车骑士 +坐浴盆 +大本钟 +自行车道 +自行车道 +自行车赛 +骑车 +比基尼 +比基尼上衣 +账单 +台球 +广告牌 +台球台 +垃圾箱 +活页夹 +双筒望远镜 +生物学实验室 +双翼飞机 +桦木 +桦树 +鸟 +鸟池 +喂鸟器 +鸟舍 +鸟巢 +鸟池 +鸟笼 +出生 +生日 
+生日蛋糕 +生日蜡烛 +生日贺卡 +生日聚会 +饼干 +主教 +野牛 +钻头 +咬 +黑色 +黑山羊 +黑莓 +乌鸦 +黑板 +铁匠 +叶片/刀片 +毯子/覆盖层 +运动外套 +看台 +搅拌机 +祝福 +窗帘 +眼罩 +闪光 +暴风雪 +块 +博客 +血 +开花 +花 +女装衬衫 +吹 +吹风机 +河豚 +蓝色 +蓝色艺术家 +蓝松鸦 +蓝天 +蓝莓 +蓝知更鸟 +猪 +板子 +板擦 +棋盘游戏 +木板路 +船 +船甲板 +船屋 +桨 +乘船 +浮标 +山猫 +躯干 +身体冲浪板 +健美运动员 +水煮鸡蛋 +锅炉 +饰扣式领带 +门闩 +炸弹 +轰炸机 +披肩榛鸡 +骨骼 +篝火 +阀盖 +盆景 +书 +书籍封面 +书柜 +文件夹 +书签 +书架 +书店 +远程拾音器 +推动 +靴子 +边界 +边境牧羊犬 +植物园 +瓶 +瓶盖 +开瓶器 +螺旋开瓶器 +三角梅 +巨石 +花束 +时装店 +精品酒店 +鞠躬/蝴蝶结 +领结 +弓形窗 +碗 +保龄球运动 +保龄球馆 +保龄球 +保龄球设备 +盒子 +箱形梁桥 +箱龟 +拳击手 +内裤 +拳击 +拳击手套 +拳击台 +男孩 +支撑物 +支架 +辫子 +大脑 +刹车 +刹车灯 +树枝 +商标 +白兰地 +黄铜 +黄铜牌匾 +面包 +面包箱 +休息 +早餐 +防浪堤 +胸部 +啤酒厂 +砖块 +砖建筑物 +墙 +砖块 +婚纱 +新娘 +新郎 +伴娘 +桥 +缰绳 +公文包 +明亮的 +边沿 +钻头 +广播 +西兰花 +青铜 +铜牌 +青铜雕塑 +青铜雕像 +胸针 +小溪 +扫帚 +肉汤 +棕色 +棕熊 +巧克力蛋糕 +早午餐 +浅黑肤色的女人 +刷子 +郊狼 +包菜 +气泡 +泡泡糖 +珍珠奶茶 +斗柜 +盾牌 +芽 +佛 +水牛 +自助餐 +昆虫 +建造 +建造者 +建筑 +积木 +建筑立面 +建筑材料 +灯 +牛 +斗牛犬 +子弹 +动车 +公告栏 +防弹背心 +斗牛 +扩音器 +斗牛场 +大黄蜂 +保险杠 +卷/地形起伏 +捆 +蹦极 +双层床 +地堡/击球 +兔子 +浮标 +书桌 +墓室 +燃烧 +玉米煎饼 +公交车 +公交车司机 +公交车内部 +公交车站 +公交车站 +公交车窗户 +灌木 +商业 +名片 +业务主管 +商务西装 +业务团队 +女商人 +商人 +半身像 +屠夫 +肉铺 +孤峰 +黄油 +奶油 +蝴蝶 +蝴蝶馆 +按钮 +梧桐树 +购买 +出租车 +小屋 +卷心菜 +小屋 +守车 +储藏柜 +橱柜 +电缆 +缆车 +仙人掌 +咖啡馆 +食堂 +笼子 +蛋糕 +蛋糕台 +计算器 +大锅 +日历 +小腿 +通话 +电话亭 +书法 +平静的 +摄像机 +骆驼 +相机 +相机镜头 +迷彩 +露营 +露营者 +篝火 +露营 +营地 +校园 +罐 +开罐器 +运河 +金丝雀 +癌症 +蜡烛 +烛台 +糖果 +块状糖 +柺杖糖 +糖果店 +拐杖 +罐子 +大炮 +树冠/顶棚 +四柱床 +香瓜 +悬臂桥 +帆布 +峡谷 +帽子 +斗篷 +科德角 +卡布奇诺 +胶囊 +队长 +捕获 +车 +汽车经销商 +车门 +汽车内饰 +车标 +后视镜 +停车场 +汽车座椅 +车展 +洗车 +车窗 +焦糖 +卡片 +纸牌游戏 +纸板 +纸板盒 +羊毛衫 +红衣凤头鸟 +货物 +货运飞机 +货船 +加勒比 +康乃馨 +狂欢节 +食肉动物 +旋转木马 +鲤鱼 +木匠 +地毯 +拖鞋 +红雀 +长途客车 +斑点狗 +航空母舰 +胡萝卜 +胡萝卜蛋糕 +携带 +手推车 +纸箱/纸盒 +卡通 +卡通人物 +卡通插图 +卡通风格 +雕刻 +容器 +现金 +腰果 +赌场 +砂锅 +磁带 +盒式录音机 +石膏绷带 +铸造 +城堡 +猫 +猫窝 +猫粮 +猫器具 +猫架 +地下墓穴 +双体船 +美洲狮 +握着/抓着 +捕手 +毛毛虫 +鲶鱼 +教堂 +牛 +猫步 +走秀 +菜花 +洞穴 +鱼子酱 +光盘 +CD播放器 +雪松 +天花板 +吊扇 +庆祝 +庆典 +名人 +芹菜 +大提琴 +手机 +水泥 +墓地 +中心装饰品 +蜈蚣 +陶瓷 +瓷砖 +麦片 +仪式 +证书 +链条 +链锯 +椅子 +升降椅 +躺椅 +木屋 +圣杯 +粉笔 +房间 +变色龙 +香槟酒 +香槟杯 +冠军 +锦标赛 +吊灯 +婴儿换尿布台 +通道 +皴裂处 +小教堂 +人物雕塑 +木炭 +充电 +充电器 +战车 +慈善机构 +慈善活动 +魅力 +图表 +追逐 +底盘 +检查/支票 +支票簿 +棋盘 +检查表 +欢呼声 +鼓励/啦啦队 +奶酪 +奶酪汉堡 +奶酪蛋糕 +猎豹 +厨师 +化合物 +化学家 +化学 +化学实验室 +旗袍 +樱桃 +樱花 +樱桃番茄 +樱桃树 +国际象棋 +栗子 +鸡 +鸡胸肉 +鸡笼 +鸡肉沙拉 +鸡翅 +鹰嘴豆 +小衣橱 +吉娃娃 +孩子 +童星 +孩子的房间 +红番椒 +辣热狗 +烟囱 +黑猩猩 +瓷器 +白菜 +中国园林 +中国结 +月季 +中国塔 +炸薯条/炸薯条 +花栗鼠 +凿子 +巧克力 +巧克力棒 +巧克力蛋糕 +巧克力碎片 +巧克力饼干 +巧克力牛奶 +巧克力慕斯 +松露 +唱诗班 +厨房刀 +砧板 +筷子 +圣诞节 +圣诞球 +圣诞贺卡 +圣诞装饰 +圣诞晚宴 +平安夜 +圣诞帽 +圣诞灯 +圣诞市场 +圣诞装饰 +圣诞树 +菊花 +教堂 +教堂塔 +苹果酒 +雪茄 +雪茄盒 +香烟 +烟盒 +腰带 +电影院 +摄影师 +肉桂 +圆 +电路 +电路板 +马戏团 +水箱 +柑橘类水果 +城市 +城市公交 +市政厅 +城市夜景 +城市公园 +城市天际线 +城市广场 +城市街道 +城墙 +城市景观 +蛤蜊 +单簧管 +扣子 +班级 +经典 +教室 +锁骨 +爪子 +黏土 +陶器 +清洁 +洁净室 +清洁工人 +清洁用品 +清晰的 +栓 +克莱门氏小柑橘 +客户端 +悬崖 +爬 +爬山 +登山者 +诊所 +夹子 +剪贴画 +剪贴板 +快速帆船 +君子兰 +斗篷 +木底鞋 +特写 +壁橱 +布 +穿衣 +衣服 +晒衣夹 +晒衣绳 +服装店 +云 +云雾森林 +多云 +三叶草 +小丑 +小丑鱼 +俱乐部 +离合器 +手拿包 +煤炭 +海岸 +外套 +衣帽架 +玉米 +公鸡 +凤头鹦鹉 +可卡犬 +驾驶 +蟑螂 +鸡尾酒 +小礼服 +鸡尾酒调制器 +鸡尾酒桌 +可可 +椰子 +椰子树 +咖啡 +咖啡豆 +咖啡杯 +咖啡机 +咖啡店 +咖啡壶 +棺材 +法国白兰地 +螺旋 +硬币 +可口可乐 +滤器 +冷的 +卷心菜沙拉 +合作 +拼贴画 +收藏品 +大学生 +牧羊犬 +碰撞 +颜色 +涂色书 +染色材料 +矮种马 +柱子 +梳子 +密码锁 +喜剧演员 +喜剧 +喜剧电影 +彗星 +舒服 +安慰食物 +漫画书 +漫画人物 +连环画 +指挥官 +评论员 +社区 +通勤 +公司 +指南针 +比赛 +比赛 +竞争者 +作曲家 +作文 +堆肥 +电脑 +电脑机箱 +电脑椅 +电脑桌 +键盘 +计算机显示器 +计算机房 +电脑屏幕 +机箱 +概念车 +音乐会 +音乐厅 +贝壳 +混凝土 +调味品 +避孕套 +独立产权的公寓 +指挥 +锥形物 +会议 +会议中心 +会议厅 +会议室 +五彩纸屑 +冲突 +合流 +连接 +连接器 +温室 +星座 +建筑工地 +建筑工人 +包含 +容器 +集装箱船 +大陆 +轮廓 +合同 +控制 +控制塔 +便利店 +集会 +交谈 +转换器 +可转换的 +输送机 +厨师/烹饪 +烹饪 +烹饪喷雾剂 +炊具 +凉的 +冷却器 +铜 +一本/一册 +珊瑚 +珊瑚礁 +粗绳 +有线电话 +酒 +威尔士矮脚狗 +瓶塞 +软木板 +鸬鹚 +玉米 +玉米田 +玉米面包 +角落 +小号 +飞檐 +燕麦片 +围栏 +走廊 +紧身衣 +化妆品 +化妆刷 +化妆镜 +角色扮演 +服装 +服装电影设计师 +婴儿床 +小屋 +棉花 +棉花糖 +沙发 +倒计时 +柜台 +台面 +最佳乡村歌手 +乡村别墅 +乡村公路 +乡村流行歌手 +农村 +双门小轿车 +夫妇/两人/几个 +情侣写真 +小胡瓜 +课程 +球场 +法院 +院子 +堂兄弟 +工作服 +奶牛 +母牛的颈铃 +牛仔 +牛仔靴 +牛仔帽 +螃蟹 +蟹肉 +裂纹 +摇篮 +工艺 +工匠 +蔓越莓 +起重机 +黑纱 +厕所 +板条箱 +火山口湖 +龙虾 +蜡笔 +奶油乳酪 +奶油罐 +创建 +生物 +信用卡 +新月形 +新月形面包 +山顶 +全体船员 +蟋蟀 +板球用球 +板球队 +板球队员 +钩边 
+克罗克电锅 +鳄鱼 +庄稼 +露脐上衣 +交叉 +横木 +十字路口 +相声 +人行横道 +油煎面包块 +乌鸦 +撬棍 +人群 +拥挤的 +皇冠 +阴极射线管屏幕 +耶稣受难像 +巡游 +游轮 +巡洋艇 +面包屑 +压坏 +拐杖 +水晶 +幼兽 +立方体 +黄瓜 +球杆 +袖口 +袖扣 +烹饪 +农田 +杯子 +纸杯蛋糕 +丘比特 +马路牙子 +旋度 +卷发器 +无籽葡萄干 +货币 +咖喱 +窗帘 +曲线 +软垫 +顾客 +切 +餐具 +自行车 +骑自行车 +龙卷风 +汽缸 +铙钹 +柏树 +柏树 +达克斯猎狗 +水仙花 +匕首 +大丽花 +萝卜 +乳制品 +雏菊 +大坝 +损害 +潮湿的 +跳舞 +舞池 +舞蹈室 +舞者 +蒲公英 +黑暗 +黑暗 +飞镖 +圆靶 +指示板 +日期 +女儿 +黎明 +天床上 +日光 +门栓 +死亡 +辩论 +碎片 +玻璃水瓶 +甲板 +双层巴士 +装饰 +装修/装饰 +装饰画 +鹿 +后卫 +神 +熟食 +投递 +拆迁 +怪兽 +演示 +兽窝/休闲室 +牛仔夹克 +牙医 +百货商店 +抑郁症 +德比 +皮肤病 +沙漠 +沙漠公路 +设计 +设计师 +桌子/表格 +台灯 +桌面 +台式电脑 +甜点 +破坏 +侦探 +洗涤剂 +露水 +仪表盘 +钻石 +尿布 +尿布包 +杂志 +死 +饮食 +挖掘机 +数字 +数字时钟 +莳萝 +晚餐 +小船 +餐厅 +晚宴 +餐桌 +恐龙 +浸 +文凭 +指引 +导演 +尘埃 +越野摩托车 +泥土地 +泥土路 +泥路/土路 +灾难 +信徒 +迪斯科舞厅 +迪斯科灯秋 +迪斯科舞厅 +疾病 +盘子 +碟形天线 +洗碗机 +抹布 +菜肴 +洗碗液 +迪斯尼乐园 +自动售货机 +展示 +陈列窗 +壕沟 +潜水 +潜水员 +跳水板 +纸杯 +流行音乐播音员 +杜宾犬 +码头 +医生 +文件 +纪录片 +狗 +狗窝 +犬种 +狗项圈 +狗粮 +狗窝 +洋娃娃 +美元 +玩偶之家 +洋娃娃 +海豚 +穹顶 +住宅 +多米诺骨牌 +驴 +甜甜圈 +涂鸦 +门 +门把手 +受气包 +门牌 +门口 +宿舍 +面团 +市中心 +推土机 +拖 +龙 +蜻蜓 +排水沟 +剧本 +戏剧电影 +画 +抽屉里 +图画/画画 +图钉 +辫子 +连衣裙/特定场合的服装 +礼帽 +正装衬衫 +皮鞋 +大礼服 +梳妆台 +更衣室 +运球 +漂移 +浮木 +钻 +饮品/喝 +饮用水 +开车 +司机 +车道 +无人机 +水滴/下降 +吊灯 +滴管 +干旱 +药物 +药店 +鼓 +鼓手 +鸡腿 +干的 +公爵夫人 +鸭子 +鸭嘴兽 +小鸭子 +布基胶带 +伙计 +二重唱 +粗呢 +独木舟 +哑铃 +饺子 +沙丘 +扣篮 +榴莲 +黄昏 +灰尘 +垃圾车 +簸箕 +羽绒被 +DVD +染料 +鹰 +耳朵 +御寒耳罩 +耳机 +耳塞 +耳环 +地震 +画架 +复活节 +复活节兔子 +复活节彩蛋 +吃 +餐厅 +泡芙 +日食 +生态系统 +编辑 +教育 +教育家 +鳗鱼 +蛋 +蛋卷 +蛋挞 +打蛋器 +白鹭 +埃菲尔铁塔 +橡皮筋 +上级 +电椅 +电钻 +电工 +电 +电子 +电子器件 +大象 +高度图 +电梯 +电梯轿厢 +电梯门 +电梯大堂 +电梯井 +路堤 +大使馆 +装饰 +灰烬 +会徽 +刺绣 +翡翠 +紧急 +紧急服务 +紧急车辆 +情感 +帝国大厦 +搪瓷 +外壳/围墙 +茶几 +能源 +订婚 +订婚戒指 +引擎 +机舱 +工程师 +工程 +英国短毛猫 +乐团 +回车键 +演艺人员 +娱乐 +娱乐中心 +入口 +入口大厅 +信封 +马术 +设备 +橡皮擦 +二胡 +侵蚀 +自动扶梯 +食用蜗牛 +浓缩咖啡 +房地产 +河口 +桉树 +晚上 +晚礼服 +夜光 +傍晚天空 +晚上的太阳 +事件 +常绿的 +母羊 +挖掘 +运动 +排气罩 +展览 +出口 +探险者 +爆炸 +延长线 +灭火器 +排气扇 +挤压 +眼睛 +眼影 +眉 +眼线笔 +布料 +纺织品商店 +外观 +脸 +脸部特写 +蜜粉 +毛巾 +面巾纸架 +设施 +工厂 +工厂车间 +集市 +露天市场 +仙女 +猎鹰 +秋天 +家庭 +家庭轿车 +全家福 +家庭房 +风扇/扇子 +尖牙 +农场 +农民 +农民市场 +农舍 +时尚 +时尚配饰 +时装设计师 +时尚的女孩 +时装插图 +时装大片 +时装模特 +时装表演 +快餐 +西式快餐 +父亲 +水龙头 +故障 +动物 +小鹿 +传真 +宴会 +羽毛 +软呢帽 +饲料 +一餐 +饲养 +喂养的椅子 +猫科 +美洲狮 +栅栏 +芬达 +蕨类植物 +雪貂 +摩天轮 +渡船 +肥料 +节日 +纤维 +小说 +小说书 +田野/场地/野外 +田间道路 +无花果 +打架 +花样滑冰运动员 +小雕像 +文件 +档案照片 +文件柜 +填满 +胶片相机 +电影导演 +电影格式 +电影首映礼 +电影制片人 +拍摄 +过滤器 +鳍 +手 +终点线 +冷杉 +冷杉树 +火 +火灾报警 +消防部门 +消防车 +消防通道 +消防水带 +火坑 +消防站 +爆竹 +消防队员 +壁炉 +烟花 +烟花表演 +急救箱 +鱼 +鱼船 +海鲜市场 +鱼塘 +鱼缸 +渔夫 +钓鱼 +渔船 +渔网 +钓鱼 +渔村 +健身 +健身课程 +五个 +固定装置 +峡湾 +国旗 +旗杆 +小薄片 +火焰 +火烈鸟 +法兰绒 +拍打 +耀斑 +闪光 +烧瓶 +平 +比目鱼 +风味 +跳蚤 +跳蚤市场 +舰队 +飞行 +空中乘务员 +翻转 +触发器 +翻转图 +浮动 +群 +洪水 +地板/地面 +落地扇 +脚垫 +楼层平面图 +落地窗 +插花艺术 +花店 +牙线 +面粉 +流动 +花 +花篮 +花坛 +花箱 +花田 +花童 +花卉市场 +流体 +冲洗 +长笛 +飞 +飞行钓鱼 +传单 +马 +泡沫 +雾 +多雾的 +鹅肝酱 +箔纸 +折椅 +树叶 +民间艺术家 +民间舞蹈 +民间摇滚艺术家 +方旦糖 +火锅 +圣洗池 +食物 +食用色素 +美食广场 +食品加工机 +小吃摊 +快餐车 +桌上足球 +脚 +人行桥 +足球 +足球教练 +大学橄榄球赛 +足球比赛 +足球场 +足球比赛 +橄榄球头盔 +足球运动员 +足球场 +足球队 +小路 +脚印 +脚踏板 +台座 +鞋子 +故宫 +浅滩 +额头 +森林 +森林大火 +森林地面 +森林小路 +森林公路 +锻造 +餐叉 +叉车 +表格 +园林 +队列/形成物 +F1方程式赛车 +堡垒 +碉堡 +追逐 +化石 +粉底 +喷泉 +钢笔 +狐狸 +框架 +雀斑 +高速公路 +卡车 +法国 +法国斗牛犬 +薯条 +法式吐司 +化妆水 +冰箱 +炸鸡 +煎蛋 +炒饭 +友谊 +飞盘 +青蛙 +霜 +结霜 +严寒 +结冰 +水果 +水果蛋糕 +水果盘 +水果市场 +水果沙拉 +水果摊 +果树 +水果商店 +油炸食品 +煎锅 +软糖 +燃料 +吸烟罩 +有趣的 +葬礼 +真菌 +漏斗 +毛皮衣服 +毛皮大衣 +家具 +蒲团 +小工具 +枪口 +星云/星系 +美术馆 +游戏 +游戏棋盘 +游戏手柄 +火腿 +团伙 +车库 +车库门 +手工模型 +垃圾 +花园 +花园芦笋 +橡胶软管 +花园蜘蛛 +园丁 +园艺 +加菲猫 +滴水嘴 +花环 +大蒜 +衣服 +气体 +加油站 +煤气炉 +防毒面具 +收集 +聚集 +测量仪器 +露台 +齿轮 +壁虎 +艺妓 +凝胶 +百货商店 +发电机 +天竺葵 +幽灵 +礼物 +礼品袋 +礼品篮 +礼物盒 +礼品卡 +礼品商店 +礼物包装 +演唱会 +杜松子酒 +姜 +姜饼 +姜饼屋 +银杏树 +长颈鹿 +女孩 +给 +冰川 +角斗士 +玻璃珠 +玻璃瓶 +玻璃碗 +玻璃箱 +玻璃建筑 +玻璃门 +玻璃地板 +玻璃屋 +玻璃罐 +玻璃板 +玻璃桌子 +玻璃花瓶 +玻璃墙 +玻璃窗 +眼镜 +光滑面 +滑翔机 +地球 +手套 +发光 +汤圆 +去 +袭击 +球门 +守门员 +山羊 +羊奶酪 +戈壁 +护目镜/墨镜 +黄金 +金牌 +金门大桥 +金毛猎犬 +金鱼 +高尔夫运动 +高尔夫球帽 +高尔夫球车 +高尔夫球杆 +高尔夫球场 +高尔夫球手 +鹅 +大猩猩 +哥特式 +葫芦 +政府 +政府机构 +礼服 +毕业生 +毕业典礼 +谷物 +逆戟鲸 +大奖赛 +祖父 +祖母 +祖父母 +花岗岩 +格兰诺拉麦片 +葡萄 +西柚 +葡萄酒 +草 +蚱蜢 +草原 +长满草的 +擦菜器 +坟墓 +碎石 +墓碑 +肉汁 +调味汁瓶 +灰色 +吃草 +放牧 
+绿色 +绿色植物 +欢迎 +问候 +贺卡 +灰狗 +网格 +筛子 +烧烤架 +格栅 +烤鳗鱼 +磨 +研磨机 +粗燕麦粉 +杂货袋 +洞穴 +地松鼠 +群体 +合影 +小树林 +生长 +牛油果酱 +警卫 +看门狗 +宾馆 +客房 +指南 +豚鼠 +吉他 +吉他手 +海湾 +海鸥 +枪 +高达 +谒师所 +古筝 +健身房 +体操运动员 +栖息地 +黑客 +冰雹 +头发 +头发颜色 +发胶 +毛刷 +发型 +发夹 +发网 +发夹 +发型 +一半 +礼堂 +万圣节 +万圣节服装 +万圣节南瓜 +露背装 +汉堡 +汉堡包 +哈密瓜 +锤子 +吊床 +阻碍 +仓鼠 +烘手机 +放大镜 +擦手巾 +手提包 +手球 +手铐 +手枪 +手帕 +把手 +手锯 +握手 +倒立 +手写 +汉服 +悬挂 +飞机库 +衣架 +幸福 +海港 +斑海豹 +硬摇滚艺术家 +精装书 +建筑工人 +硬件 +五金店 +硬木 +硬木地板 +口琴 +管风琴 +羽管键琴 +收获 +收割机 +坐垫/搁脚凳/草丛 +帽子 +帽盒 +双簧管 +山楂 +干草 +干草地 +榛子 +头 +主教练 +大灯 +床头板 +头饰 +海岬 +总部 +听力 +心脏 +心形 +热能 +加热器 +帚石楠 +树篱 +刺猬 +脚后跟 +直升机 +直升机机场 +头盔 +帮助 +母鸡 +指甲花 +药草 +兽群 +寄居蟹 +英雄 +苍鹭 +芙蓉花 +芙蓉花 +隐藏/隐蔽处 +高杠 +高跟鞋 +高地 +突出 +徒步旅行 +徒步旅行者 +徒步靴 +登山设备 +山丘 +丘陵地 +别墅 +山坡 +印度教寺庙 +铰链 +臀部 +嘻哈艺人 +河马 +历史学家 +历史遗迹 +历史 +曲棍球 +冰球馆 +曲棍球比赛 +曲棍球运动员 +曲棍球棒 +锄头 +洞 +假日 +冬青树 +海参 +家/住宅 +家用电器 +基地 +家居装饰 +室内设计 +内政部 +家庭影院 +家庭作业 +鹰嘴豆泥 +蜂蜜 +蜂窝 +蜜月 +风帽 +连帽衫 +挂钩/勾住 +跳 +地平线 +犀鸟 +长角牛 +大黄蜂 +震惊 +恐怖电影 +马鞍褥 +马车 +马场 +骑马 +马背 +马蹄铁 +软管 +医院 +医院病床 +病房 +主持人 +小旅馆 +热 +热气球 +热狗 +辣椒酱 +温泉 +旅馆 +酒店大堂 +酒店房间 +电炉 +沙漏 +房子 +房子外部 +室内植物 +悬滑板 +吼 +蜷缩 +拥抱 +呼啦圈 +人 +增湿器 +蜂鸟 +座头鲸 +打猎 +狩猎小屋 +障碍 +飓风 +哈士奇 +小屋 +鬣狗 +混合物 +绣球花 +消火栓 +水上飞机 +冰 +冰袋 +北极熊 +冰洞 +冰淇淋 +冰淇淋蛋卷 +冰淇淋商店 +冰块 +浮冰 +冰球运动员 +冰球队 +棒棒糖 +制冰机 +溜冰场 +冰雕 +冰架 +溜冰鞋 +滑冰 +冰山 +冰柱 +结冰 +图标 +身份证照片 +身份证 +冰屋 +光/灯光/光线 +鬣蜥蜴 +照亮 +插图 +形象 +黑斑羚 +熏香 +独立日 +个人 +室内 +划船器 +电磁炉 +工业区 +工业 +步兵 +充气艇 +服务台 +基础设施 +成分 +吸入器 +注射 +受伤 +墨水 +印泥 +小湖湾 +题词 +昆虫 +安装 +乐器/器械 +绝缘杯 +互动 +室内设计 +网站 +十字路口 +面试 +无脊椎动物 +邀请 +平板电脑 +苹果手机 +苹果音乐播放器 +虹膜 +铁 +熨衣板 +灌溉系统 +岛 +小岛 +等足类动物 +象牙 +常青藤 +居酒屋 +千斤顶 +帝王蟹/蟹 +夹克衫 +按摩浴缸 +玉 +美洲虎 +监狱牢房 +果酱 +日式花园 +茉莉花 +下巴 +松鸦 +爵士乐 +爵士乐艺术家 +爵士融合艺术家 +牛仔裤 +吉普车 +果冻 +果冻豆 +水母 +喷气式飞机 +摩托艇 +珠宝 +珠宝 +珠宝店 +拼图游戏 +人力车 +赛马骑师 +赛马帽 +慢跑 +联合的 +记者 +操纵杆 +法官 +水壶 +玩杂耍 +果汁 +榨汁器 +枣子 +跳绳 +连身裤 +丛林 +废品堆放场 +羽衣甘蓝 +万花筒 +袋鼠 +卡拉ok +空手道 +卡丁车运动 +旧城区 +皮船 +烤肉串 +按键/钥匙 +门卡 +卡其色 +踢 +苏格兰裙 +和服 +幼儿园教室 +幼儿园 +国王 +帝王蟹 +亲吻 +工具包 +厨房 +厨房橱柜 +厨房台面 +厨房地板 +厨房抽油烟机 +厨房岛 +厨房水槽 +厨房桌子 +厨房用具 +厨房窗户 +厨房用具 +风筝 +猕猴桃 +护膝 +跪下 +餐刀 +骑手 +编织 +编织针 +球形把手 +门环 +结 +考拉 +锦鲤 +ktv +实验室 +实验室外套 +标签 +拉布拉多 +迷宫 +网眼织物 +蕾丝连衣裙 +梯子 +长柄杓 +瓢虫 +环礁湖 +湖泊 +湖区 +湖边小屋 +湖岸 +羊肉 +羊排 +灯柱 +灯罩 +矛 +土地 +陆地车辆 +废物填埋 +着陆 +降落甲板 +地标 +风景 +山崩 +挂带 +灯笼 +腿/大腿 +笔记本电脑 +笔记本键盘 +幼体 +烤宽面条 +激光 +睫毛 +套索 +门闩 +乳胶 +拿铁咖啡 +笑 +发射 +发布会 +举办会议 +自助洗衣店 +洗衣房 +洗衣篮 +洗衣房 +熔岩 +薰衣草 +草坪 +草坪婚礼 +律师 +躺 +引领 +主唱 +通向 +领袖 +泄漏 +倾斜/倚靠 +学习 +皮带 +皮革 +皮夹克 +皮鞋 +演讲 +演讲厅 +教学室 +窗台 +剩饭 +腿 +传说 +紧身裤/秋裤 +立法院 +乐高 +豆类 +柠檬 +柠檬汁 +柠檬水 +狐猴 +镜头 +眩光 +扁豆 +豹 +紧身连衣裤 +紧身裤袜 +小妖精 +功课 +信函 +信箱 +信的标志 +刻字 +生菜 +水平 +图书馆 +许可证 +车牌 +地衣 +舔 +盖子 +躺着 +安全带 +救生衣 +救生艇 +救生员 +提起 +灯具 +灯光秀 +电灯开关 +照明/照明设备 +闪电 +避雷针 +淡紫色 +百合 +肢体 +石灰 +石灰石 +豪华轿车 +线条 +艺术线条 +排队 +亚麻 +邮轮 +狮子 +润唇膏 +口红 +液体 +酒类商店 +列表 +荔枝 +生活 +家畜 +客厅 +生活空间 +蜥蜴 +负载 +装卸码头 +游手好闲的人 +走廊 +定位 +锁 +闸室 +储物柜 +阁楼 +原木 +小木屋 +标志 +洛基 +长头发 +冲浪板 +隐约显现/织布机 +环状 +遗失 +彩票 +莲花 +爱 +双人沙发 +行李 +木材 +伐木工人 +午餐 +午餐盒 +郁郁葱葱的 +奢侈品 +豪华游艇 +雨衣 +澳洲胡桃 +短尾猿 +通心粉 +金刚鹦鹉 +弯刀 +机器 +机枪 +杂志 +魔法 +魔术师 +磁铁 +放大镜 +木兰花 +喜鹊 +麻将 +象夫 +女仆 +邮件 +邮件槽 +制作 +改造 +化妆师 +化妆工具 +野鸭 +野鸭 +槌棒 +哺乳动物 +猛犸象 +男人 +管理 +经理 +海牛 +曼荼罗 +橘子 +普通话 +鬃毛 +漫画 +食槽 +芒果 +山竹果 +红树林 +曼哈顿 +检修孔 +井盖 +修指甲 +人体模型 +庄园主宅 +大厦 +螳螂 +地幔 +活动房层 +制造业 +手稿 +地图 +枫木 +枫叶 +枫糖浆 +沙球 +马拉松 +大理石 +行进 +行进乐队 +母马 +金盏花 +水兵 +海洋无脊椎动物 +海洋哺乳动物 +木偶 +标志 +集市 +市场广场 +市场摊位 +结婚 +武术 +武术家 +武术馆 +马提尼 +马丁尼酒杯 +睫毛膏 +吉祥物 +土豆泥 +搅碎机 +面具/口罩 +按摩 +桅杆 +地垫 +斗牛士 +比赛 +火柴盒 +衣料 +床垫 +陵墓 +长裙 +一餐 +量杯 +卷尺 +肉类 +肉丸 +机械师 +机械风扇 +奖牌 +媒体 +医疗设备 +医学图像 +医务人员 +医药箱 +中世纪的 +麦地那市 +冥想 +猫鼬 +赛事 +香瓜 +纪念碑 +菜单 +美人鱼 +网 +肮脏 +信使袋 +金属 +金属艺术家 +金属探测器 +计量器 +中层楼 +麦克风 +显微镜 +微波炉 +午夜 +里程碑 +军装 +牛奶 +牛奶罐 +奶茶 +奶昔 +磨坊 +矿井 +矿工 +矿物质 +矿泉水 +迷你 +微缩模型 +面包车 +部长 +小型货车 +薄荷 +薄荷糖 +镜子 +小姐 +投掷物 +任务 +槲寄生 +混合 +搅拌机 +搅拌碗 +混合物 +护城河 +电动踏板车 +模型/模特 +汽车模型 +现代 +现代大厦 +潮湿 +模具 +模具 +鼹鼠 +君主 +钱 +监控器 +和尚 +猴子 +活动扳手 +黑白照片 +独轮脚踏车 +怪物卡车 +月亮 +月饼 +月光 +沼泽 +驼鹿 +拭子 +助力车 +早晨 +晨雾 +晨光 +朝阳 +砂浆 +马赛克 +清真寺 +蚊子 +藓类植物 +汽车旅馆 
+蛾 +母亲 +主板 +主题 +动作 +电动机 +摩托车 +摩托车 +摩托车头盔 +摩托车赛车手 +骑摩托车的人 +赛车运动 +土堆 +山 +山地自行车 +山地自行车员 +山地自行车运动 +山地大猩猩 +山湖 +山景观 +山口 +山路 +山脉 +山区河流 +山雪 +山间溪流 +山景城 +山村 +登山者 +登山包 +鼠标/鼠 +鼠标垫 +捕鼠器 +嘴 +漱口水 +移动 +电影海报 +电影票 +割草机 +mp3播放器 +先生 +泥 +松饼 +马克杯 +桑树 +覆盖物 +骡子 +直辖市 +壁画 +肌肉 +肌肉车 +博物馆 +蘑菇 +音乐 +音乐节 +音乐凳子 +音乐工作室 +音乐录影带表演者 +音乐键盘 +音乐家 +贻贝 +芥末 +神话 +烤干酪辣味玉米片 +指甲油 +指甲锉 +保姆 +餐巾 +狭窄的 +国旗 +基督诞生的场景 +自然历史博物馆 +自然 +自然保护区 +导航 +九夜节 +海军 +星云 +脖子 +围颈带/领口 +项链 +领口 +花蜜 +油桃 +针状物 +邻居 +与某处邻近的地区 +霓虹灯 +霓虹灯 +神经 +巢 +新年 +新生的 +纽芬兰 +新婚 +新闻 +记者招待会 +报摊 +晚上 +夜市 +夜空 +夜景 +夜总会 +床头柜 +面条 +鼻子 +鼻羁 +注解 +笔记本 +记事本 +信纸 +公告 +数字图标 +修女 +护士 +托儿所 +养老院 +螺母 +胡桃夹子 +橡木 +橡树 +桨 +绿洲 +烘干室 +燕麦片 +燕麦 +方尖塔 +观察塔 +天文台 +超越障碍训练场 +海洋 +章鱼 +提供 +办公室 +办公大楼 +办公椅 +办公室隔间 +办公桌 +办公用品 +办公室的窗户 +军官 +行政官员 +石油 +油灯 +油画 +石油钻台 +秋葵 +老照片 +橄榄 +橄榄油 +橄榄树 +煎蛋卷 +洋葱 +洋葱圈 +蛋白石 +开阔的/张开 +开始 +开幕式 +歌剧 +歌剧院 +操作 +手术室 +操作 +眼镜店 +猩猩 +橙子/橙色 +橙汁 +橙树 +橘园 +轨道 +果园 +乐池 +兰花 +订单 +组织 +折纸 +点缀 +鱼鹰 +鸵鸟 +水獭 +外面的 +露头 +户外 +厕所 +电源插头 +大纲 +椭圆形 +烤箱 +整体 +大衣 +天桥 +猫头鹰 +牡蛎 +橡皮环 +包裹 +包/包装/包裹 +围场 +警车 +挂锁 +肉菜饭 +宝塔 +疼痛 +油漆刷 +画家 +佩斯利印花大手帕 +宫殿 +调色板 +打桩 +棺罩 +棕榈树 +平底锅 +煎饼 +熊猫 +面板 +全景 +三色堇 +喘息 +储藏室 +裤子 +连裤袜 +木瓜 +纸 +纸袋 +切纸机 +纸灯笼 +纸盘子 +纸巾 +平装书 +压纸器 +降落伞 +游行 +天堂 +鹦鹉 +护理人员 +长尾小鹦鹉 +滑翔伞 +伞兵 +羊皮纸 +教区 +公园 +公园长椅 +停车 +停车场 +停车费 +停车标志 +议会 +欧芹/香菜 +参与者 +合作伙伴 +帕特里奇 +聚会 +派对帽 +通过 +通道 +存折 +乘客 +客船 +旅客列车 +百香果 +护照 +面食 +粘贴 +糕点 +牧场 +补丁 +病人 +图案/款式 +人行道/硬路面 +大帐篷 +爪子 +支付 +付费电话 +豌豆 +和平 +桃子 +孔雀 +山峰/尖顶 +花生 +花生酱 +梨 +珍珠 +卵石 +山核桃 +行人 +人行天桥 +步行街 +果皮 +削皮器 +小钉板 +木质腿 +鹈鹕 +笔/围栏 +点球 +铅笔 +铅笔盒 +卷笔刀 +铅笔裙 +吊坠 +钟摆 +企鹅 +半岛 +锦标旗 +便士 +储蓄罐 +牡丹 +胡椒/辣椒 +胡椒研磨机 +胡椒子 +意大利辣香肠 +栖息/鲈鱼 +表演 +性能 +表演舞台 +香水 +绿廊 +波斯猫 +柿子 +个人护理 +个人漂浮装置 +害虫 +宠物 +宠物店 +宠物店 +花瓣 +佩妮 +教堂的长椅 +野鸡 +现象 +哲学家 +电话 +电话簿 +留声机 +照片 +照相亭 +相框 +摄影 +物理学家 +物理实验室 +钢琴家 +钢琴 +选择 +捡起 +泡菜 +野餐 +野餐区 +野餐篮 +野餐桌 +图片 +相框 +馅饼 +鸽子 +朝圣者 +药片 +枕头 +飞行员 +领航艇 +别针 +松树 +松果 +松林 +松子 +菠萝 +乒乓球桌 +乒乓球 +粉色 +一品脱的量 +琵琶 +管子 +管碗 +海盗 +海盗旗 +海盗船 +阿月浑子 +滑雪场 +口袋里的面包 +火龙果 +斗牛犬 +球场 +投手 +猪笼草 +干草叉 +披萨 +披萨刀 +比萨锅 +披萨店 +招牌 +地方 +餐具垫 +格子 +平原 +示意图 +行星 +行星地球 +厚木板 +植物 +种植园 +种植 +匾额 +石膏 +塑料 +橡皮泥 +高原 +平台 +白金 +大浅盘 +玩/演奏/运动 +打羽毛球 +打棒球 +打篮球 +玩台球 +踢足球 +玩乒乓球 +打网球 +打排球 +选手/运动员 +操场 +剧场 +扑克牌 +下棋 +打高尔夫球 +打麻将 +运动场 +护栏 +游戏室 +广场 +钳子 +故事情节 +犁 +插头 +插头帽 +李子 +水管工 +卫生洁具 +羽毛 +夹板 +口袋 +怀表 +随身小折刀 +圆荚体 +乐队指挥台 +诗歌 +一品红 +指/朝向 +指针 +扑克卡 +筹码 +扑克表 +杆/柱 +臭猫 +警察 +警车 +警犬 +警察局 +政治家 +圆点 +花粉 +污染 +马球 +马球领 +马球衬衫 +石榴 +波美拉尼亚的 +雨披 +池塘 +马尾辫 +贵宾犬 +池 +流行 +流行艺术家 +爆米花 +教皇 +罂粟 +瓷 +玄关 +猪肉 +粥 +便携式电池 +门户网站 +投资组合 +汽门 +肖像 +肖像会话 +摆姿势拍照 +负鼠 +帖子 +邮局 +邮票 +明信片 +海报 +海报页 +锅/罐/陶盆 +土豆 +土豆片 +土豆沙拉 +布垫子 +便壶 +袋 +家禽 +英镑 +倾泻 +粉末 +电源线 +电源插头及插座 +权力看 +电站 +练习 +布拉格城堡 +祈祷 +牧师 +首映 +处方 +显示 +演讲 +总统 +新闻发布室 +高压锅 +椒盐卷饼 +王子 +公主 +打印 +打印页面 +打印机 +印刷 +监狱 +农产品/生产 +产品 +职业 +专业的 +教授 +项目图片 +投影屏幕 +投影仪 +毕业舞会 +散步 +螺旋桨 +先知 +建议 +防护服 +抗议 +抗议者 +出版 +宣传画像 +冰上曲棍球 +布丁 +水坑 +泡芙 +角嘴海雀 +哈巴狗 +拉 +讲坛 +脉冲 +泵 +南瓜 +南瓜饼 +南瓜种子 +拳击吊袋 +拳头猛击/穿孔 +学生 +紫色 +推 +轻轻一击 +谜题 +塔 +金字塔 +大蟒 +二维码 +鹌鹑 +采石场 +季度 +石英 +女王 +油炸玉米粉饼 +队列 +乳蛋饼 +被子 +绗缝 +引用 +兔子 +浣熊 +比赛 +赛道 +水沟/跑道 +赛车 +球拍 +雷达 +散热器 +广播 +木筏/橡皮艇 +布娃娃 +栏杆/铁轨 +轨道车 +铁道 +铁路桥梁 +轨道线 +火车站 +雨 +雨靴 +彩虹 +虹鳟鱼 +雨衣 +热带雨林 +多雨的 +葡萄干 +耙子 +公羊 +斜坡 +油菜籽 +快速 +说唱歌手 +树莓 +老鼠 +棘轮 +乌鸦 +峡谷 +雷 +剃须刀 +锋利的 +阅读 +阅读材料 +钻孔器 +后面 +尾灯 +后视图 +后视镜 +收据 +收到 +接待 +配方 +记录 +唱片制作人 +录音机 +录音室 +娱乐室 +休闲车 +矩形 +回收 +回收站 +红色 +红地毯 +红旗 +红熊猫 +红酒 +红木 +芦苇 +礁石 +卷轴 +裁判 +倒影 +倒影 +反射器 +注册 +控制 +驯鹿 +放松 +释放 +救援 +宗教 +宗教的 +享受 +保持 +改造 +遥控器 +移除 +修复 +维修店 +爬行动物 +救援 +救助者 +研究 +研究员 +储层 +住宅 +居民区 +树脂 +度假胜地 +度假小镇 +餐厅的厨房 +餐厅的露台 +厕所 +零售 +寻回犬 +制动火箭 +揭示 +犀牛 +杜鹃 +肋骨 +丝带 +大米 +电饭煲 +稻田 +骑/搭乘 +脊 +骑马 +步枪 +边缘 +环/戒指 +暴乱 +涟漪 +上升 +高层建筑 +河 +河岸 +河船 +河谷 +河床 +路 +路标 +公路旅行 +路边 +烤鸡 +长袍 +罗宾 +机器人 +石头 +岩石拱 +摇滚艺术家 +摇滚乐队 +攀岩者 +攀岩 +摇滚音乐会 +岩石表面 +岩层 +摇滚歌手 +火箭 +摇椅 +岩石 +啮齿动物 +牛仔竞技表演 +竞技舞台 +罗伊 +狍子 +辊 +过山车 +轮式溜冰鞋 +溜冰鞋 +擀面杖 +浪漫 +浪漫的 +屋顶 +屋顶花园 +房间 +房间分频器 +根 +根啤酒 +绳索桥 +念珠 +玫瑰 +迷迭香 
+玫瑰色的云 +罗特韦尔犬 +圆桌 +路由器 +行 +罗文 +皇家 +橡皮图章 +废墟 +魔方 +红宝石 +莱夫 +橄榄球 +橄榄球 +橄榄球运动员 +毁坏 +尺 +朗姆酒 +跑 +跑步者 +跑步鞋 +农村的 +锈 +乡村的 +黑麦 +袋 +鞍 +鞍囊 +旅行 +安全 +安全背心 +圣人 +帆 +帆船 +航行 +水手 +松鼠猴 +缘故 +沙拉 +沙拉碗 +火蜥蜴 +意大利蒜味腊肠 +出售 +三文鱼 +沙龙 +萨尔萨舞 +盐 +盐和胡椒瓶 +盐湖 +盐沼 +盐瓶 +敬礼 +萨莫耶德人 +武士 +沙子 +沙洲 +砂箱 +沙堡 +沙雕 +凉鞋 +三明治 +卫生巾 +圣诞老人 +蓝宝石 +沙丁鱼 +莎丽 +生鱼片 +沙爹 +书包 +卫星 +缎 +酱汁 +碟子 +桑拿 +香肠 +稀树大草原 +锯 +锯木架 +萨克斯管 +萨克斯手 +脚手架 +秤/标尺 +比例模型 +扇贝 +疤痕 +稻草人 +围巾 +场景 +风景 +雪纳瑞犬 +学校 +校车 +校服 +校舍 +纵帆船 +科学 +科幻电影 +科学博物馆 +科学家 +剪刀 +壁灯 +司康饼 +勺子 +踏板车 +分数 +记分板 +蝎子 +童子军 +炒蛋 +废弃 +刮板 +刮伤 +屏幕 +纱门 +截图 +螺杆 +螺丝刀 +长卷纸/卷轴 +擦洗 +硬毛刷 +雕塑家 +雕塑 +海洞穴 +海冰 +海狮 +海龟 +海胆 +尖吻鲈 +海底 +海鸟 +海鲜 +海马 +海豹 +海景 +海贝 +海滨度假胜地 +季节 +座位 +安全带 +海藻 +秘书 +安全 +小轿车 +看到 +种子 +跷跷板 +赛格威 +自拍 +出售 +研讨会 +感觉 +传感器 +服务器 +服务器机房 +服务 +集 +缝纫机 +影子 +摇 +瓶 +洗发水 +形状 +分享 +鲨鱼 +卷笔刀 +记号笔 +剃须刀 +剃须膏 +披肩/围巾 +剪切 +剪刀 +羊 +床单 +乐谱 +架子 +贝壳 +贝类 +避难所 +搁置 +牧羊人 +果子露 +柴犬 +发光 +航运 +集装箱 +海难 +船厂 +衬衫 +赤膊的 +浅滩 +鞋 +鞋盒 +鞋店 +鞋楦 +射击 +得分篮球后卫 +商店橱窗 +门面 +购物者 +购物 +购物袋 +购物篮 +购物车 +购物中心 +购物街 +海岸 +海岸线 +短的 +短发 +短裤 +小酒杯 +散弹枪 +肩膀 +单肩包 +铲 +陈列柜 +淋浴 +浴帽 +浴帘 +淋浴门 +淋浴头 +碎纸机 +泼妇 +虾 +神社 +灌木 +快门 +暹罗猫 +西伯利亚 +兄弟姐妹 +侧面 +边柜 +配菜 +边车 +边线 +壁板 +标志 +指示牌 +信号 +签名 +丝绸 +丝袜 +筒仓 +银 +银牌 +银器 +唱歌 +烧焦 +歌手 +水槽 +啜 +坐/放置/坐落 +坐着 +滑板公园 +滑板 +滑板者 +溜冰者 +溜冰场 +骨架 +草图 +串串 +滑雪 +滑雪靴 +滑雪设备 +滑雪服 +滑雪缆车 +滑雪杖 +滑雪胜地 +滑雪板 +滑雪 +滑雪鞋 +皮肤 +头骨 +无边便帽 +天空 +天空塔 +天窗 +天际线 +摩天大楼 +激流回旋 +石板 +雪橇 +睡眠 +睡袋 +睡衣 +袖子 +片 +滑动 +滑块 +吊索 +坡 +投币口 +老虎机 +树懒 +慢炖锅 +鼻涕虫 +贫民窟 +气味 +微笑 +烟雾/抽烟 +零食 +蜗牛 +蛇 +鲷鱼 +快照 +通气管 +鼻子 +雪 +雪豹 +雪山 +雪球 +单板滑雪者 +雪原 +雪花 +雪人 +雪地摩托 +雪犁 +雪鞋 +雪 +肥皂 +肥皂泡 +给皂器 +足球守门员 +社会名流 +短袜 +插座 +苏打水 +垒球 +软件 +太阳能电池阵列 +士兵 +独奏 +解决方案 +宽边帽 +歌曲 +声音 +汤 +汤碗 +汤匙 +酸奶油 +纪念品 +豆浆 +水疗中心 +空间 +航天飞机 +空间站 +宇宙飞船 +意大利面 +横跨 +扳手 +火花 +闪耀 +烟火 +起泡葡萄酒 +麻雀 +抹刀 +扬声器 +观众 +会话框 +速度限制 +限速标志 +快艇 +车速表 +球 +香料 +调料架 +蜘蛛 +蜘蛛网 +扣球 +旋转 +菠菜 +尖塔 +飞溅 +海绵 +勺子 +体育协会 +运动器材 +运动团队 +体育球 +体育器材 +运动会 +运动服装 +点 +喷雾 +伸展 +春天 +春卷 +撒 +洒水器 +发芽 +云杉 +云杉森林 +队 +广场 +南瓜 +蹲 +挤 +鱿鱼 +松鼠 +水枪 +刺 +稳定的 +(码放整齐的)一叠 +体育场 +工作人员 +舞台 +舞台灯 +驿马车 +弄脏 +不锈钢 +楼梯 +楼梯 +楼梯间 +摊位/小隔间 +种马 +站/矗立/摊位 +站 +主食 +订书机 +星星 +盯着 +海星 +杨桃 +燕八哥 +州立公园 +公立学校 +车站 +固定自行车 +文具 +雕像 +牛排 +牛排刀 +蒸汽 +蒸汽机 +蒸汽机车 +蒸汽火车 +馒头 +钢 +方向盘 +(花草的)茎 +模版 +梯凳 +立体声 +听诊器 +炖 +戳/条状物 +竹节虫 +贴纸 +静物画 +高跷 +黄貂鱼 +搅拌 +搅拌器 +镫 +缝 +股票 +长筒袜 +腹部 +石头建筑 +石雕 +石屋 +石磨 +凳子 +停止 +停在 +红灯 +停车标志 +秒表 +红绿灯 +存储箱 +储藏室 +罐/蓄水池 +商店 +店面 +鹳 +风暴 +暴风云 +狂风暴雨的 +炉子 +扑克 +跨骑 +过滤器 +海峡 +带 +稻草/吸管 +草帽 +草莓 +溪流 +街头艺术 +街头艺术家 +街角 +流浪狗 +街头食品 +路灯 +街市场 +街头摄影 +街景 +路标 +街头小贩 +拉伸 +担架 +罢工 +前锋 +细绳 +芝士条 +带子 +条纹 +漫步 +结构 +工作室 +影棚拍摄 +材料 +填充玩具动物 +毛绒玩具 +馅 +树桩 +惊人的 +特技 +佛塔 +风格 +手写笔 +潜艇 +潜艇形大三明治 +海底水 +郊区 +地铁 +地铁站 +低音炮 +多肉 +绒面革 +糖 +糖碗 +甘蔗 +方糖 +西装 +套房 +夏天 +夏天傍晚 +峰顶 +太阳 +太阳帽 +日光浴 +周日 +日晷 +向日葵 +向日葵田 +葵花籽 +太阳镜 +晴天 +日出 +日落 +遮阳伞 +阳光 +超级碗 +跑车 +超级英雄 +超市 +超市货架 +超模 +支持者 +冲浪 +表面 +冲浪板 +冲浪者 +外科医生 +外科手术 +环绕 +寿司 +寿司吧 +背带裤 +悬架 +吊桥 +越野车 +燕子 +燕尾蝶 +沼泽 +天鹅 +天鹅游艇 +运动裤 +防汗带 +毛衣 +运动衫 +甜的 +红薯 +游泳 +泳帽 +游泳者 +游泳洞 +游泳池 +摆动 +平转桥 +秋千 +漩涡 +开关 +转椅 +剑 +旗鱼 +象征 +对称 +犹太教堂 +注射器 +糖浆 +系统 +t恤 +t恤 +塔巴斯科辣椒酱 +虎斑 +乒乓球拍 +桌面 +桌布 +平板电脑 +餐具 +转速表 +拦截 +墨西哥煎玉米卷 +跆拳道 +太极 +尾巴 +裁缝 +拍/拿 +起飞 +说话/交谈/演讲 +手鼓 +棕褐色 +橘子 +胶带/磁带/终点线 +挂毯 +沥青碎石路面 +芋头 +篷布 +果馅饼 +流苏 +味道 +榻榻米 +纹身 +纹身艺术家 +酒馆 +茶 +茶包 +茶话会 +茶园 +茶壶 +茶具 +教 +老师 +茶杯 +水鸭 +团队合影 +团队介绍 +眼泪 +技术员 +技术 +泰迪熊 +T字形物 +青少年 +电线杆 +变焦镜头 +望远镜 +电视 +电视摄像机 +电视室 +电视演播室 +温度 +寺庙 +天妇罗 +网球 +网球场 +网球比赛 +网球网 +网球运动员 +网球拍 +帐篷 +龙舌兰酒 +终端/航站楼 +阳台 +地形 +玻璃容器 +领土 +测试 +测试赛 +试管 +文本 +短信 +纺织 +纹理 +感恩节 +感恩节晚餐 +剧院 +戏剧演员 +治疗 +温度计 +热水瓶 +暖瓶 +恒温器 +灌木丛 +顶针 +东西 +思考 +蓟 +宝座 +金銮殿 +扔 +抱枕 +雷 +雷雨 +百里香 +皇冠 +记号 +票 +售票亭 +潮池 +领带 +老虎 +紧 +瓦 +瓷砖地板 +瓦屋顶 +瓷砖墙 +锡 +锡纸 +箔 +提拉米苏 +轮胎 +纸巾 +烤面包 +烤面包机 +烟草 +烟斗 +学步的小孩 +脚趾 +豆腐 +马桶 +马桶座圈 +化妆包 +东京铁塔 +番茄 +番茄酱 +番茄汤 +墓 +钳子 +钳子 +工具 +工具箱 +牙刷 +牙膏 +牙签 +修剪成形的花园 +配料 +火炬/光源 +龙卷风 +玉米粉圆饼 +乌龟 +大手提袋 +图腾柱 +龙猫 +巨嘴鸟 +触摸 +触地 +旅行 +旅游巴士 +导游 +游客 +旅游景点 +锦标赛 +拖车 +毛巾 +毛巾杆 +大厦 +塔桥 +小镇 +城镇广场 +玩具 +玩具车 
+玩具枪 +玩具店 +跑道 +拖拉机 +贸易 +传统 +传统的 +交通 +锥形交通路标 +交通拥堵 +交通堵塞 +交通标志 +小道 +预告片 +拖车 +火车 +火车桥 +火车车厢 +火车内部 +火车轨道 +火车窗口 +教练 +训练 +训练长椅 +训练场 +电车/手推车 +蹦床 +变形金刚 +透明度 +旅行 +托盘/碟子 +跑步机 +款待 +树 +树枝 +林场 +树蛙 +树屋 +树根 +树干 +试验 +三角形 +铁人三项 +部落 +支流 +戏法 +三轮车 +修剪 +三人组 +三脚架 +长号 +部队 +奖杯 +奖杯 +热带 +鳟鱼 +卡车 +卡车司机 +浴缸 +管子 +拖船 +郁金香 +金枪鱼 +苔原 +隧道 +涡轮 +火鸡 +转动 +芜菁 +绿松石 +炮塔 +乌龟 +獠牙 +电视演员 +电视柜 +电视剧 +电视节目类型 +电视名人 +电视节目 +情景喜剧 +电视塔 +枝条 +黄昏 +双胞胎 +麻线 +扭 +类型 +键入 +打字机 +尤克里里 +奥特曼 +伞 +内衣 +水下 +独角兽 +制服 +宇宙 +大学 +向上 +城市 +尿壶 +瓮 +使用 +用具 +杂物间 +吸尘器/真空 +谷 +阀门 +吸血鬼 +货车 +香草 +虚荣 +种类 +花瓶/瓶 +金库 +矢量卡通插图 +矢量图标 +蔬菜 +菜园 +蔬菜市场 +植被 +车辆 +面纱 +静脉 +天鹅绒 +自动售货机 +小贩 +通风孔 +胡蜂属 +船 +背心 +兽医 +经验丰富的 +兽医办公室 +高架桥 +视频 +摄像机 +电子游戏 +录像带 +视镜 +守夜 +别墅 +村庄 +藤蔓 +醋 +葡萄园 +暴力 +紫罗兰色 +小提琴 +小提琴家 +中提琴演奏者 +愿景 +遮阳板 +伏特加 +火山 +排球 +排球场 +排球运动员 +志愿者 +航行 +秃鹰 +华夫饼干 +华夫饼机 +货车 +马车车轮 +腰 +服务员 +候机室 +等候室 +走 +步行 +手杖 +挂钟 +壁纸 +核桃 +海象 +战争 +仓库 +温暖的 +警告标志 +战士 +军舰 +疣猪 +洗 +洗衣机/垫圈 +洗 +洗衣机 +黄蜂 +浪费 +废物容器 +手表 +水 +水鸟 +水牛 +水冷却器 +水滴 +水景 +热水器 +水位 +荷花 +水上乐园 +水管 +净水器 +滑水板 +水上运动 +水面 +水塔 +水彩 +水彩插图 +水彩画 +瀑布 +喷壶 +水印叠加图章 +西瓜 +防水外套 +水路 +波浪 +蜡 +武器 +穿着 +天气 +叶片 +网 +摄像头 +婚礼 +结婚戒指 +婚礼花束 +结婚蛋糕 +新婚夫妇 +婚礼请柬 +婚礼派对 +婚纱照 +婚礼摄影师 +婚纱摄影 +婚宴 +楔 +杂草 +重量 +体重秤 +焊接工 +井 +西餐 +西餐厅 +湿 +吧台 +潜水衣 +湿地 +潜水服 +鲸鱼 +鲸鲨 +小麦 +麦田 +车轮 +轮椅 +后轮支撑车技 +生奶油 +搅拌器 +胡须 +威士忌 +哨子 +白色 +白宫 +白葡萄酒 +白板 +便门 +宽的 +挥动 +假发 +Wii +Wii手柄 +荒野 +角马 +野火 +野花 +野生动物 +柳树 +风 +风铃 +风电场 +风力涡轮机 +风车 +窗户 +窗台花盆箱 +橱窗展示 +窗框 +纱窗 +靠窗的座位 +窗台 +雨刮器 +挡风玻璃 +有风的 +酒瓶 +冷酒器 +酒柜 +酒窖 +酒杯 +酒架 +品酒 +酒庄 +翅膀 +冬天 +冬瓜 +冬天的早晨 +冬季场景 +冬季运动 +冬季风暴 +电线 +紫藤 +巫婆 +女巫帽子 +炒锅 +狼 +女人 +木头 +林鸳鸯 +木地板 +木墙 +烧木炉 +木匙 +林地 +啄木鸟 +木工刨 +羊毛 +工作 +练习卡 +工作台 +工人 +工作场所 +车间 +世界 +蠕虫 +敬拜 +伤口 +包 +裹身裙 +包装纸 +搏斗 +摔跤手 +皱纹 +腕带 +写 +作家 +手写/字迹 +毛笔 +写字桌 +游艇 +牦牛 +院子 +黄色 +瑜伽 +瑜伽垫 +酸奶 +轭 +蛋黄 +青年 +青年旅馆 +蒙古包 +斑马 +斑马线 +禅意花园 +拉链 +拉链 +僵尸 +粽子 +动物园 diff --git a/fengshen/models/Lyrics/ram/data/ram_tag_list_threshold.py b/fengshen/models/Lyrics/ram/data/ram_tag_list_threshold.py new file mode 100644 index 0000000..1583295 --- /dev/null +++ b/fengshen/models/Lyrics/ram/data/ram_tag_list_threshold.py @@ -0,0 +1,4585 @@ +ram_class_threshold = [0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.71, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.61, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.82, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.85, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.89, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.78, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.9, +0.65, +0.83, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.79, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.79, +0.65, +0.63, +0.65, +0.87, +0.8, +0.46, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.84, +0.65, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.81, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.77, +0.87, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.68, +0.65, +0.8, +0.65, +0.65, +0.75, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.8, +0.8, +0.79, +0.65, +0.85, +0.65, +0.65, +0.65, +0.9, +0.65, +0.89, +0.8, +0.65, +0.65, +0.65, +0.76, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.89, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.8, +0.8, +0.9, +0.65, +0.85, +0.8, +0.8, +0.8, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.63, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.71, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.71, +0.65, +0.8, +0.76, +0.85, +0.8, +0.65, +0.65, +0.8, +0.65, +0.79, +0.65, +0.75, +0.65, +0.8, +0.65, +0.86, +0.65, +0.65, +0.9, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.73, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.8, +0.75, +0.65, +0.65, +0.65, +0.65, +0.8, +0.85, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.6, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.74, +0.65, 
+0.65, +0.67, +0.65, +0.65, +0.8, +0.65, +0.65, +0.85, +0.65, +0.8, +0.65, +0.65, +0.84, +0.8, +0.8, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.89, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.65, +0.65, +0.6, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.74, +0.65, +0.65, +0.66, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.8, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.9, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.75, +0.65, +0.7, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.82, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.87, +0.65, +0.66, +0.65, +0.84, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.5, +0.65, +0.64, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, 
+0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.85, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.73, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.82, +0.8, +0.65, +0.65, +0.65, +0.84, +0.9, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.64, +0.65, +0.65, +0.65, +0.8, +0.8, +0.87, +0.65, +0.65, +0.78, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.74, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.89, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.8, +0.84, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.81, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.87, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.73, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.89, +0.8, +0.65, +0.9, +0.65, +1, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.89, +0.89, +0.65, +0.65, +0.65, +0.8, +0.75, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.88, +0.65, +0.8, +0.65, +0.65, +0.8, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.9, +0.57, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.8, +0.79, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.89, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.81, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.84, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.8, +0.83, +0.65, +0.65, +0.8, +0.65, +0.65, +0.72, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.69, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.9, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.85, +0.65, +0.65, +0.8, +0.65, +0.89, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.86, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.75, +0.8, +0.65, +0.8, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.82, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.92, +0.89, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.87, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.8, +0.82, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.64, +0.65, +0.65, +0.63, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.76, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.87, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.89, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.73, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.9, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.69, +0.65, +0.65, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.72, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.9, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.45, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.51, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.66, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, 
+0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.66, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.85, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.79, +0.75, +0.65, +0.65, +0.8, +0.65, +0.67, +0.8, +0.8, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.8, +0.65, +0.65, +0.9, +0.65, +0.79, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.74, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.6, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.89, +0.8, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.9, +0.75, +0.65, +0.65, +0.65, +0.8, +0.6, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.85, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.63, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.8, +0.65, +0.81, +0.8, +0.8, +0.8, +0.82, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.88, +0.65, +0.8, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +1, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.74, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.9, +0.86, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.64, +0.65, +0.65, +0.8, +0.8, +0.65, +0.87, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.73, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.75, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.88, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.57, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.76, +1, +0.8, +0.65, +0.65, +0.58, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +1, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.8, +0.9, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.68, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.99, +0.8, +0.77, +0.65, +0.9, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.8, +0.65, +0.7, +0.65, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.52, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.86, +0.65, +0.65, +0.8, +0.56, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.72, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.8, +0.6, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.89, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.54, +1, +0.65, +0.65, +0.75, +0.65, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.9, +0.62, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.74, +0.8, +0.65, +0.8, +0.8, +0.7, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.8, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.8, +0.8, +0.84, +0.8, +0.65, +0.65, +0.8, +0.75, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.82, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.74, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.9, +0.9, +0.65, +0.65, +0.65, +0.63, +0.82, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.74, +0.9, +0.65, +0.8, +0.65, +0.65, +0.58, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.75, +0.65, +0.65, +0.8, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.8, +0.65, +0.64, +0.65, +0.65, +0.65, +0.8, +0.87, +0.65, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.78, +0.65, +0.8, +0.65, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.88, +0.8, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.85, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.68, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.9, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, 
+0.8, +0.85, +0.65, +0.77, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.9, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.88, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.82, +0.65, +0.8, +0.74, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.7, +0.7, +0.8, +0.65, +0.65, +0.65, +0.65, +0.87, +0.8, +0.65, +0.65, +0.65, +0.89, +0.85, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.8, +0.65, +0.66, +0.57, +0.65, +0.65, +0.65, +0.49, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.76] diff --git a/fengshen/models/Lyrics/ram/models/bert.py b/fengshen/models/Lyrics/ram/models/bert.py new file mode 100644 index 0000000..cb90b79 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/bert.py @@ -0,0 +1,1035 @@ +''' + * Copyright (c) 2022, salesforce.com, inc. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +''' + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + + +class BertEmbeddings_nopos(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + # self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and 
exported when serialized + # self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + # self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # if position_ids is None: + # position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + # if self.position_embedding_type == "absolute": + # position_embeddings = self.position_embeddings(position_ids) + # # print('add position_embeddings!!!!') + # embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + # print('add position_embeddings!!!!') + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + 
self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + # print(self.key.weight.shape) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # compatible with higher versions of transformers + if key_layer.shape[0] > query_layer.shape[0]: + key_layer = key_layer[:query_layer.shape[0], :, :, :] + attention_mask = attention_mask[:query_layer.shape[0], :, :] + value_layer = value_layer[:query_layer.shape[0], :, :, :] + + # Take the dot product between "query" and "key" to get the raw attention scores. 
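+        # (Shapes: query/key are [batch, heads, seq_len, head_dim], so the resulting scores are [batch, heads, query_len, key_len].)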
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = 
self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if self.config.add_cross_attention: + self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + mode=None, + ): + + if mode == 'tagging': + + assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" + + cross_attention_outputs = self.crossattention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + present_key_value = cross_attention_outputs[-1] + + else: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if mode=='multimodal': + assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + mode='multimodal', + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if 
use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + mode=mode, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + mode=mode, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + To be used in a Seq2Seq model, the model needs to be initialized with both the :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + + def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`. 
+ """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multimodal', + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
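+        # For a 2D padding mask this call produces a (batch_size, 1, 1, seq_length) additive mask
+        # (0. for visible tokens, -10000. for padding); when is_decoder=True it is further combined
+        # with a lower-triangular causal mask into (batch_size, 1, seq_length, seq_length).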
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, + device, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode=mode, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + 
return_logits=False,
+        is_decoder=True,
+        reduction='mean',
+        mode='multimodal',
+    ):
+        r"""
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
+            ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + # sequence_output.shape torch.Size([85, 30, 768]) + # prediction_scores.shape torch.Size([85, 30, 30524]) + # labels.shape torch.Size([85, 30]) + + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + if reduction=='none': + lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + diff --git a/fengshen/models/Lyrics/ram/models/ram.py b/fengshen/models/Lyrics/ram/models/ram.py new file mode 100644 index 0000000..dd33951 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/ram.py @@ -0,0 +1,212 @@ +''' + * The Recognize Anything Model (RAM) & Tag2Text Model + * Written by Xinyu Huang +''' +import numpy as np +import json +import torch +import 
warnings
+import sys
+from torch import nn
+from fengshen.models.groundedblip.ram.models.bert import BertConfig, BertModel, BertLMHeadModel
+from fengshen.models.groundedblip.ram.models.swin_transformer import SwinTransformer
+from fengshen.models.groundedblip.ram.data.ram_tag_list_threshold import ram_class_threshold
+
+from fengshen.models.groundedblip.ram.models.utils import *
+
+warnings.filterwarnings("ignore")
+
+
+class RAM(nn.Module):
+    def __init__(self,
+                 args):
+        r""" The Recognize Anything Model (RAM) inference module.
+        RAM is a strong image tagging model, which can recognize any common category with high accuracy.
+        Described in the paper "Recognize Anything: A Strong Image Tagging Model" https://recognize-anything.github.io/
+
+        Args:
+            med_config (str): path for the mixture of encoder-decoder model's configuration file
+            image_size (int): input image size
+            vit (str): model size of vision transformer
+            threshold (int): tagging threshold
+            delete_tag_index (list): delete some tags that may disturb captioning
+        """
+        super().__init__()
+
+        # create image encoder
+
+        if args.vit == 'swin_l':
+            assert args.image_size == args.image_res
+            # assert config['patch_size'] == 32
+            vision_width = args.vision_width
+
+            self.visual_encoder = SwinTransformer(
+                img_size=args.image_res,
+                patch_size=4,
+                in_chans=3,
+                embed_dim=args.embed_dim,
+                depths=args.depths,
+                num_heads=args.num_heads,
+                window_size=args.window_size,
+                mlp_ratio=4.,
+                qkv_bias=True,
+                drop_rate=0.0,
+                drop_path_rate=0.1,
+                ape=False,
+                patch_norm=True,
+                use_checkpoint=False)
+
+        # create tokenizer
+        self.tokenizer = init_tokenizer()
+
+        # Tag2Text employs an encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder
+        # create image-tag interaction encoder
+        encoder_config = BertConfig.from_json_file(args.med_config)
+        encoder_config.encoder_width = 512
+        self.tag_encoder = BertModel(config=encoder_config,
+                                     add_pooling_layer=False)
+
+        # create image-tag-text decoder
+        decoder_config = BertConfig.from_json_file(args.med_config)
+        self.text_decoder = BertLMHeadModel(config=decoder_config)
+
+        self.delete_tag_index = args.delete_tag_index
+        self.prompt = args.prompt
+        self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1
+
+        # load tag list
+        self.tag_list = self.load_tag_list(args.tag_list)
+        self.tag_list_chinese = self.load_tag_list(args.tag_list_chinese)
+
+        # create image-tag recognition decoder
+        self.threshold = args.threshold
+        self.num_class = len(self.tag_list)
+        q2l_config = BertConfig.from_json_file(args.q2l_config)
+        q2l_config.encoder_width = 512
+        self.tagging_head = BertModel(config=q2l_config,
+                                      add_pooling_layer=False)
+        self.tagging_head.resize_token_embeddings(len(self.tokenizer))
+        # note: this embedding is replaced by the nn.Parameter assignment further below
+        self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
+
+        if q2l_config.hidden_size != 512:
+            self.wordvec_proj = nn.Linear(512, q2l_config.hidden_size)
+        else:
+            self.wordvec_proj = nn.Identity()
+
+        self.fc = nn.Linear(q2l_config.hidden_size, 1)
+
+        self.del_selfattention()
+
+        # share weights of the lowest 2-layer of "image-tag interaction encoder" with the "image-tag recognition decoder"
+        tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '',
+                                    ' ')
+        self.image_proj = nn.Linear(vision_width, 512)
+        # self.label_embed = nn.Parameter(torch.load(args.textual_label_embedding_path,map_location='cpu').float())
+        self.label_embed = nn.Parameter(torch.zeros(4585, 512, dtype=torch.float))  # 4585 is the number of tag classes
+        # adjust thresholds for
some tags + self.class_threshold = torch.ones(self.num_class) * self.threshold + for key,value in enumerate(ram_class_threshold): + self.class_threshold[key] = value + + def load_tag_list(self, tag_list_file): + with open(tag_list_file, 'r', encoding="utf-8") as f: + tag_list = f.read().splitlines() + tag_list = np.array(tag_list) + return tag_list + + # delete self-attention layer of image-tag recognition decoder to reduce computation, follower Query2Label + def del_selfattention(self): + del self.tagging_head.embeddings + for layer in self.tagging_head.encoder.layer: + del layer.attention + + def generate_tag(self, + image, + threshold=0.68, + tag_input=None, + ): + + label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) + + image_embeds = self.image_proj(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], + dtype=torch.long).to(image.device) + + # recognized image tags using image-tag recogntiion decoder + image_cls_embeds = image_embeds[:, 0, :] + image_spatial_embeds = image_embeds[:, 1:, :] + + bs = image_spatial_embeds.shape[0] + label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) + tagging_embed = self.tagging_head( + encoder_embeds=label_embed, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False, + mode='tagging', + ) + + logits = self.fc(tagging_embed[0]).squeeze(-1) + + targets = torch.where( + torch.sigmoid(logits) > self.class_threshold.to(image.device), + torch.tensor(1.0).to(image.device), + torch.zeros(self.num_class).to(image.device)) + + tag = targets.cpu().numpy() + tag[:,self.delete_tag_index] = 0 + tag_output = [] + tag_output_chinese = [] + for b in range(bs): + index = np.argwhere(tag[b] == 1) + token = self.tag_list[index].squeeze(axis=1) + tag_output.append(' | '.join(token)) + token_chinese = self.tag_list_chinese[index].squeeze(axis=1) + tag_output_chinese.append(' | '.join(token_chinese)) + + + return tag_output, tag_output_chinese + + def generate_tag_zeroshot(self, + image, + threshold=0.68, + tag_input=None, + ): + + label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) + + image_embeds = self.image_proj(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], + dtype=torch.long).to(image.device) + + # recognized image tags using image-tag recogntiion decoder + image_cls_embeds = image_embeds[:, 0, :] + image_spatial_embeds = image_embeds[:, 1:, :] + + bs = image_spatial_embeds.shape[0] + label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) + tagging_embed = self.tagging_head( + encoder_embeds=label_embed, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False, + mode='tagging', + ) + + logits = self.fc(tagging_embed[0]).squeeze(-1) + + targets = torch.where( + torch.sigmoid(logits) > self.class_threshold.to(image.device), + torch.tensor(1.0).to(image.device), + torch.zeros(self.num_class).to(image.device)) + + tag = targets.cpu().numpy() + tag[:,self.delete_tag_index] = 0 + tag_output = [] + for b in range(bs): + index = np.argwhere(tag[b] == 1) + token = self.tag_list[index].squeeze(axis=1) + tag_output.append(' | '.join(token)) + + return tag_output + diff --git a/fengshen/models/Lyrics/ram/models/swin_transformer.py b/fengshen/models/Lyrics/ram/models/swin_transformer.py new file mode 100644 index 0000000..c1affc9 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/swin_transformer.py @@ -0,0 +1,654 @@ +# -------------------------------------------------------- +# Swin 
Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import numpy as np +from scipy import interpolate + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. 
+ window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = 
window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." + + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. 
Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + use_checkpoint=False, **kwargs): + super().__init__() + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) 
+ nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + + x_cls = self.avgpool(x.transpose(1, 2)) # B C 1 + + if idx_to_group_img is None: + return torch.cat([x_cls.transpose(1, 2), x], dim=1) + else: + x_bs = torch.gather(x, dim=0, index=idx_to_group_img.view(-1, 1, 1).expand(-1, x.shape[1], x.shape[2])) + weights = image_atts[:, 1:].unsqueeze(2) # B L 1 + x_bs_cls = torch.sum((weights * x_bs).transpose(1, 2), dim=-1, keepdim=True) # B C 1 + x_bs_cls = x_bs_cls / torch.sum(weights.transpose(1, 2), dim=-1, keepdim=True) # avgpool + + return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), \ + torch.cat([x_cls.transpose(1, 2), x], dim=1) + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=''): + # from: https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348 + + # rel_pos_bias: relative_position_bias_table + src_num_pos, num_attn_heads = rel_pos_bias.size() + + num_extra_tokens = 0 + src_size = int((src_num_pos - num_extra_tokens) ** 0.5) + dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5) + if src_size != dst_size: + print("Position interpolate %s from %dx%d to %dx%d" % (param_name, src_size, src_size, dst_size, dst_size)) + + # extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + # rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r ** n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + # if q > 1.090307: + # q = 1.090307 + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + # print("Original positions = %s" % str(x)) + # print("Target positions = %s" % str(dx)) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device)) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + + return rel_pos_bias \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/models/utils.py b/fengshen/models/Lyrics/ram/models/utils.py new file mode 100644 index 0000000..4b445d4 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/utils.py @@ -0,0 +1,282 @@ +import os +import json +import torch +import math + +from torch import nn +from typing import List +from transformers import BertTokenizer +from urllib.parse import urlparse +from 
timm.models.hub import download_cached_file +from fengshen.models.groundedblip.ram.models.vit import interpolate_pos_embed, VisionTransformer +from fengshen.models.groundedblip.ram.models.swin_transformer import interpolate_relative_pos_embed +from pathlib import Path +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +CONFIG_PATH=(Path(__file__).resolve().parents[1]) + +def read_json(rpath): + with open(rpath, 'r') as f: + return json.load(f) + + +def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, + base_model_prefix: str, skip_key: str): + uninitialized_encoder_weights: List[str] = [] + if decoder.__class__ != encoder.__class__: + logger.info( + f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized." + ) + + def tie_encoder_to_decoder_recursively( + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + uninitialized_encoder_weights: List[str], + skip_key: str, + depth=0, + ): + assert isinstance(decoder_pointer, nn.Module) and isinstance( + encoder_pointer, nn.Module + ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module" + if hasattr(decoder_pointer, "weight") and skip_key not in module_name: + assert hasattr(encoder_pointer, "weight") + encoder_pointer.weight = decoder_pointer.weight + if hasattr(decoder_pointer, "bias"): + assert hasattr(encoder_pointer, "bias") + encoder_pointer.bias = decoder_pointer.bias + print(module_name + ' is tied') + return + + encoder_modules = encoder_pointer._modules + decoder_modules = decoder_pointer._modules + if len(decoder_modules) > 0: + assert ( + len(encoder_modules) > 0 + ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" + + all_encoder_weights = set([ + module_name + "/" + sub_name + for sub_name in encoder_modules.keys() + ]) + encoder_layer_pos = 0 + for name, module in decoder_modules.items(): + if name.isdigit(): + encoder_name = str(int(name) + encoder_layer_pos) + decoder_name = name + if not isinstance( + decoder_modules[decoder_name], + type(encoder_modules[encoder_name])) and len( + encoder_modules) != len(decoder_modules): + # this can happen if the name corresponds to the position in a list module list of layers + # in this case the decoder has added a cross-attention that the encoder does not have + # thus skip this step and subtract one layer pos from encoder + encoder_layer_pos -= 1 + continue + elif name not in encoder_modules: + continue + elif depth > 500: + raise ValueError( + "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model." 
+ ) + else: + decoder_name = encoder_name = name + tie_encoder_to_decoder_recursively( + decoder_modules[decoder_name], + encoder_modules[encoder_name], + module_name + "/" + name, + uninitialized_encoder_weights, + skip_key, + depth=depth + 1, + ) + all_encoder_weights.remove(module_name + "/" + encoder_name) + + uninitialized_encoder_weights += list(all_encoder_weights) + + # tie weights recursively + tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, + uninitialized_encoder_weights, skip_key) + + +class GroupWiseLinear(nn.Module): + # could be changed to: + # output = torch.einsum('ijk,zjk->ij', x, self.W) + # or output = torch.einsum('ijk,jk->ij', x, self.W[0]) + def __init__(self, num_class, hidden_dim, bias=True): + super().__init__() + self.num_class = num_class + self.hidden_dim = hidden_dim + self.bias = bias + + self.W = nn.Parameter(torch.Tensor(1, num_class, hidden_dim)) + if bias: + self.b = nn.Parameter(torch.Tensor(1, num_class)) + self.reset_parameters() + + def reset_parameters(self): + stdv = 1. / math.sqrt(self.W.size(2)) + for i in range(self.num_class): + self.W[0][i].data.uniform_(-stdv, stdv) + if self.bias: + for i in range(self.num_class): + self.b[0][i].data.uniform_(-stdv, stdv) + + def forward(self, x): + # x: B,K,d + x = (self.W * x).sum(-1) + if self.bias: + x = x + self.b + return x + + +def init_tokenizer(): + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer.add_special_tokens({'bos_token': '[DEC]'}) + tokenizer.add_special_tokens({'additional_special_tokens': ['[ENC]']}) + tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0] + return tokenizer + + +def create_vit(vit, + image_size, + use_grad_checkpointing=False, + ckpt_layer=0, + drop_path_rate=0): + + assert vit in ['base', 'large'], "vit parameter must be base or large" + if vit == 'base': + vision_width = 768 + visual_encoder = VisionTransformer( + img_size=image_size, + patch_size=16, + embed_dim=vision_width, + depth=12, + num_heads=12, + use_grad_checkpointing=use_grad_checkpointing, + ckpt_layer=ckpt_layer, + drop_path_rate=0 or drop_path_rate) + elif vit == 'large': + vision_width = 1024 + visual_encoder = VisionTransformer( + img_size=image_size, + patch_size=16, + embed_dim=vision_width, + depth=24, + num_heads=16, + use_grad_checkpointing=use_grad_checkpointing, + ckpt_layer=ckpt_layer, + drop_path_rate=0.1 or drop_path_rate) + return visual_encoder, vision_width + + +def is_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https") + + +def load_checkpoint(model, url_or_filename): + if is_url(url_or_filename): + cached_file = download_cached_file(url_or_filename, + check_hash=False, + progress=True) + checkpoint = torch.load(cached_file, map_location='cpu') + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location='cpu') + else: + raise RuntimeError('checkpoint url or path is invalid') + + state_dict = checkpoint['model'] + + state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed( + state_dict['visual_encoder.pos_embed'], model.visual_encoder) + if 'visual_encoder_m.pos_embed' in model.state_dict().keys(): + state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed( + state_dict['visual_encoder_m.pos_embed'], model.visual_encoder_m) + for key in model.state_dict().keys(): + if key in state_dict.keys(): + if state_dict[key].shape != model.state_dict()[key].shape: + del state_dict[key] + + msg = model.load_state_dict(state_dict, 
strict=False) + print('load checkpoint from %s' % url_or_filename) + return model, msg + + +def load_checkpoint_swinbase(model, url_or_filename, kwargs): + if kwargs['image_size'] == 224: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json' + elif kwargs['image_size'] == 384: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json' + window_size = read_json(vision_config_path)['window_size'] + print('--------------') + print(url_or_filename) + print('--------------') + if is_url(url_or_filename): + cached_file = download_cached_file(url_or_filename, + check_hash=False, + progress=True) + checkpoint = torch.load(cached_file, map_location='cpu') + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location='cpu') + else: + raise RuntimeError('checkpoint url or path is invalid') + + state_dict = checkpoint['model'] + + for k in list(state_dict.keys()): + if 'relative_position_bias_table' in k: + dst_num_pos = (2 * window_size - 1)**2 + state_dict[k] = interpolate_relative_pos_embed(state_dict[k], + dst_num_pos, + param_name=k) + elif ('relative_position_index' in k) or ('attn_mask' in k): + del state_dict[k] + elif "vision_multi" in k: + state_dict[k.replace("vision_multi", + "tagging_head")] = state_dict.pop(k) + + msg = model.load_state_dict(state_dict, strict=False) + print('load checkpoint from %s' % url_or_filename) + return model, msg + + +def load_checkpoint_swinlarge(model, url_or_filename, kwargs): + if kwargs['image_size'] == 224: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_224.json' + elif kwargs['image_size'] == 384: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_384.json' + window_size = read_json(vision_config_path)['window_size'] + print('--------------') + print(url_or_filename) + print('--------------') + if is_url(url_or_filename): + cached_file = download_cached_file(url_or_filename, + check_hash=False, + progress=True) + checkpoint = torch.load(cached_file, map_location='cpu') + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location='cpu') + else: + raise RuntimeError('checkpoint url or path is invalid') + + state_dict = checkpoint['model'] + + for k in list(state_dict.keys()): + if 'relative_position_bias_table' in k: + dst_num_pos = (2 * window_size - 1)**2 + state_dict[k] = interpolate_relative_pos_embed(state_dict[k], + dst_num_pos, + param_name=k) + elif ('relative_position_index' in k) or ('attn_mask' in k): + del state_dict[k] + elif "vision_multi" in k: + state_dict[k.replace("vision_multi", + "tagging_head")] = state_dict.pop(k) + + msg = model.load_state_dict(state_dict, strict=False) + print('load checkpoint from %s' % url_or_filename) + return model, msg + + diff --git a/fengshen/models/Lyrics/ram/models/vit.py b/fengshen/models/Lyrics/ram/models/vit.py new file mode 100644 index 0000000..cec3d8e --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/vit.py @@ -0,0 +1,305 @@ +''' + * Copyright (c) 2022, salesforce.com, inc. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on timm code base + * https://github.com/rwightman/pytorch-image-models/tree/master/timm +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +from timm.models.vision_transformer import _cfg, PatchEmbed +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_, DropPath +from timm.models.helpers import named_apply, adapt_input_conv + +from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_gradients = None + self.attention_map = None + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def forward(self, x, register_hook=False): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + if register_hook: + self.save_attention_map(attn) + attn.register_hook(self.save_attn_gradients) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+        if use_grad_checkpointing:
+            self.attn = checkpoint_wrapper(self.attn)
+            self.mlp = checkpoint_wrapper(self.mlp)
+
+    def forward(self, x, register_hook=False):
+        x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class VisionTransformer(nn.Module):
+    """ Vision Transformer
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
+        https://arxiv.org/abs/2010.11929
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
+                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
+                 use_grad_checkpointing=False, ckpt_layer=0):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
+            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            norm_layer: (nn.Module): normalization layer
+        """
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                use_grad_checkpointing=(use_grad_checkpointing and i>=depth-ckpt_layer)
+            )
+            for i in range(depth)])
+        self.norm = norm_layer(embed_dim)
+
+        trunc_normal_(self.pos_embed, std=.02)
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+
+    def forward(self, x, register_blk=-1):
+        B = x.shape[0]
+        x = self.patch_embed(x)
+
+        cls_tokens = self.cls_token.expand(B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+
+        x = x + self.pos_embed[:, :x.size(1), :]
+        x = self.pos_drop(x)
+
+        for i, blk in enumerate(self.blocks):
+            x = blk(x, register_blk == i)
+        x = self.norm(x)
+
+        return x
+
+    @torch.jit.ignore()
+    def load_pretrained(self, checkpoint_path, prefix=''):
+        _load_weights(self, checkpoint_path, prefix)
+
+
+@torch.no_grad()
+def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
+    """ Load weights from .npz checkpoints for official Google Brain Flax implementation
+    """
+    import numpy as np
+
+    def _n2p(w, t=True):
+        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
+            w = w.flatten()
+        if t:
+            if w.ndim == 4:
+                w = w.transpose([3, 2, 0, 1])
+            elif w.ndim == 3:
+                w = w.transpose([2, 0, 1])
+            elif w.ndim == 2:
+                w = w.transpose([1, 0])
+        return torch.from_numpy(w)
+
+    w = np.load(checkpoint_path)
+    if not prefix and 'opt/target/embedding/kernel' in w:
+        prefix = 'opt/target/'
+
+    if hasattr(model.patch_embed, 'backbone'):
+        # hybrid
+        backbone = model.patch_embed.backbone
+        stem_only = not hasattr(backbone, 'stem')
+        stem = backbone if stem_only else backbone.stem
+        stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
+        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
+        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
+        if not stem_only:
+            for i, stage in enumerate(backbone.stages):
+                for j, block in enumerate(stage.blocks):
+                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
+                    for r in range(3):
+                        getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
+                        getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
+                        getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
+                    if block.downsample is not None:
+                        block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
+                        block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
+                        block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
+        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
+    else:
+        embed_conv_w = adapt_input_conv(
+            model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
+    model.patch_embed.proj.weight.copy_(embed_conv_w)
+    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
+    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
+    pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
+    if pos_embed_w.shape != model.pos_embed.shape:
+        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
+            pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
+    model.pos_embed.copy_(pos_embed_w)
+    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
+    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
+#     if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
+#         model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
+#         model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
+#     if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
+#         model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
+#         model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
+    for i, block in enumerate(model.blocks.children()):
+        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
+        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
+        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
+        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+        block.attn.qkv.weight.copy_(torch.cat([
+            _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
+        block.attn.qkv.bias.copy_(torch.cat([
+            _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
+        block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
+        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
+        for r in range(2):
+            getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
+            getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
+        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
+        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
+
+
+def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
+    # interpolate position embedding
+    embedding_size = pos_embed_checkpoint.shape[-1]
+    num_patches = visual_encoder.patch_embed.num_patches
+    num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
+    # height (== width) for the checkpoint position embedding
+    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+    # height (== width) for the new position embedding
+    new_size = int(num_patches ** 0.5)
+
+    if orig_size != new_size:
+        # class_token and dist_token are kept unchanged
+        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+        # only the position tokens are interpolated
+        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+        pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+        pos_tokens = torch.nn.functional.interpolate(
+            pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+        print('reshape position embedding from %d to %d' % (orig_size ** 2, new_size ** 2))
+
+        return new_pos_embed
+    else:
+        return pos_embed_checkpoint
\ No newline at end of file
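
Note on interpolate_pos_embed: it only returns a resized position-embedding tensor and does not modify the checkpoint, so the caller is expected to write the result back into the state dict before calling load_state_dict. The sketch below is a minimal illustration of that pattern and is not part of the patch; the model size, the checkpoint path, and the 'pos_embed' key are assumed placeholders.

# Usage sketch (assumed names; hypothetical checkpoint path, not part of this diff).
import torch

vit = VisionTransformer(img_size=384, patch_size=16)             # target model at the new resolution
state_dict = torch.load('vit_base_224.pth', map_location='cpu')  # checkpoint pretrained at 224x224

# Resize the 224x224 position embedding to the 384x384 patch grid,
# then load the remaining weights unchanged.
if 'pos_embed' in state_dict:
    state_dict['pos_embed'] = interpolate_pos_embed(state_dict['pos_embed'], vit)

missing, unexpected = vit.load_state_dict(state_dict, strict=False)
print('missing keys:', missing)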