diff --git a/fengshen/examples/Lyrics/pretrain_lyrics.py b/fengshen/examples/Lyrics/pretrain_lyrics.py new file mode 100644 index 0000000..04c34f6 --- /dev/null +++ b/fengshen/examples/Lyrics/pretrain_lyrics.py @@ -0,0 +1,217 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.loggers import ( + WandbLogger +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.lyrics.modeling_lyrics import LyricsQFromerForPretrain +import fengshen.models.lyrics.groundingdino.transforms as T +from fengshen.models.lyrics.configuration_lyrics import LyricsConfig +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +import argparse +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import numpy as np +from torchvision.transforms import Normalize, Compose, Resize, RandomResizedCrop, InterpolationMode, ToTensor, RandomHorizontalFlip +from PIL import Image +from transformers import BertTokenizer, Blip2Processor +from torch.utils.data._utils.collate import default_collate +import os +import torch + +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + +class TensorObject(object): + def __init__(self, tensor: torch.Tensor): + self.data = tensor + +class Collator(): + def __init__(self, args): + self.transforms = Blip2Processor.from_pretrained(args.model_path) + self.grounding_transforms = T.Compose( + [ + T.RandomResize([800], max_size=1333), + # T.RandomResize([800]), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + self.ram_transforms = Compose([ + Resize((384, 384)), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + + def __call__(self, inputs): + # samples = [] + image = [] + grounding_image = [] + ram_image = [] + input_captions = [] + input_languages = [] + + ran = None + for (cnt, i) in enumerate(inputs): + if 'npy_path' in i: + instance_image = Image.fromarray(np.load(i['npy_path'])) + elif 'img_path' in i: + try: + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + elif "image" in i and i["image"] is not None: + instance_image = i["image"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "img" in i and i["img"] is not None: + instance_image = i["img"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + else: + raise ValueError('no img path in samples') + + if 'blip_caption' in i: + try: + loc = torch.multinomial(torch.tensor(i['blip_scores']), 1) + caption = i['blip_caption'][loc] + language = 'zh' + except Exception: + caption = '' + print(i) + elif 'caption' in i: + caption = i['caption'] + language = 'en' + elif 'caption_zh' in i: + caption = i['caption_zh'] + language = 'zh' + image.append(self.transforms(instance_image, return_tensors="pt")['pixel_values'][0]) + grounding_image.append(self.grounding_transforms(instance_image, None)[0]) + ram_image.append(self.ram_transforms(instance_image)) + input_captions.append(caption) + input_languages.append(language) + model_inputs = { + "image": torch.stack(image), + "grounding_image": grounding_image, + "ram_image": 
torch.stack(ram_image), + "caption": input_captions, + "language": input_languages, + } + return model_inputs + + +class LyricsQFromer(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('LyricsQFromer') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + # self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path) + tokenizer = BertTokenizer.from_pretrained(os.path.join(args.model_path, 'tokenizer')) + self.model.tokenizer = tokenizer + self.model.box_threshold = 0.25 + self.model.text_threshold = 0.2 + self.model.iou_threshold = 0.6 + + if args.freeze_image_tower: + self.model.vision_model.eval() + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.ram.eval() + for param in self.model.ram.parameters(): + param.requires_grad = False + self.model.grounding_dino.eval() + for param in self.model.grounding_dino.parameters(): + param.requires_grad = False + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + self.steps_per_epoch = self.total_steps // self.trainer.max_epochs + print('Total steps: {}'.format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def training_step(self, batch): + output = self.model(**batch) + self.log('train/loss_itc', output.loss_itc) + self.log('train/loss_itm', output.loss_itm) + self.log('train/loss_lm', output.loss_lm) + self.log('train/loss_mlm', output.loss_mlm) + self.log('train/loss', output.loss) + if self.trainer.global_rank == 0: + if self.trainer.global_step % 1000 == 0: + print('loss_itc:', output.loss_itc) + print('loss_itm:', output.loss_itm) + print('loss_lm:', output.loss_lm) + print('loss_mlm:', output.loss_mlm) + return output.loss + + def validation_step(self, batch, batch_idx): + raise NotImplementedError("validation is not implemented") + + def on_load_checkpoint(self, checkpoint) -> None: + # Compatibility with older lightning versions, which reset the step count to 0 when resuming from a ckpt + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # Also export the weights in HuggingFace format when saving a checkpoint + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = LyricsQFromer.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # wandb_logger = WandbLogger(project="ditto_pretrain") # initialize a WandbLogger object + 
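A minimal sketch of how a collated batch feeds the pretraining model, assuming args points at a valid Lyrics checkpoint and that the sample dict and demo.jpg path (both hypothetical) resolve to real data:

samples = [{'img_path': 'demo.jpg', 'caption': 'a dog running on the grass'}]
batch = Collator(args)(samples)
# batch['image']:           (B, 3, H, W) pixel values for the ViT / Q-Former branch
# batch['grounding_image']: list of variable-size tensors for Grounding DINO
# batch['ram_image']:       (B, 3, 384, 384) tensors for the RAM tagger
module = LyricsQFromer(args)
out = module.model(**batch)
print(out.loss_itc, out.loss_itm, out.loss_lm, out.loss_mlm)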
trainer = Trainer.from_argparse_args(args, + # logger=wandb_logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = LyricsQFromer(args) + collate_fn = Collator(args) + datasets = load_data(args, global_rank=trainer.global_rank) + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + trainer.fit(model, datamoule) + # trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) \ No newline at end of file diff --git a/fengshen/examples/Lyrics/pretrain_lyrics_stage2.py b/fengshen/examples/Lyrics/pretrain_lyrics_stage2.py new file mode 100644 index 0000000..ccd7da8 --- /dev/null +++ b/fengshen/examples/Lyrics/pretrain_lyrics_stage2.py @@ -0,0 +1,311 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.loggers import ( + WandbLogger +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.lyrics.modeling_lyrics import LyricsLMForConditionalGeneration +import fengshen.models.lyrics.groundingdino.transforms as T +from fengshen.models.lyrics.configuration_lyrics import LyricsConfig +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +import argparse +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import numpy as np +from torchvision.transforms import Normalize, Compose, Resize, RandomResizedCrop, InterpolationMode, ToTensor, RandomHorizontalFlip +from PIL import Image +from transformers import BertTokenizer, Blip2Processor, InstructBlipProcessor, InstructBlipForConditionalGeneration, LlamaTokenizer +from torch.utils.data._utils.collate import default_collate +import os +import torch +import random + +# BlipImageProcessor +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + +class TensorObject(object): + def __init__(self, tensor: torch.Tensor): + self.data = tensor + +class Collator(): + def __init__(self, args): + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + self.eos_token = self.processor.tokenizer.eos_token + self.grounding_transforms = T.Compose( + [ + T.RandomResize([800], max_size=1333), + # T.RandomResize([800]), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + self.ram_transforms = Compose([ + Resize((384, 384)), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.max_txt_len = 24 #128 + self.max_output_txt_len = 40 #256 + self.prompts = { + "zh":[ + "在本任务中,您将获得一张图片,您的任务是生成该图片的描述。", + "在这项任务中,您将获得一篇图片。你的任务是用一句话概括这张图片。", + "为给定的图片生成一个适当的描述。", + "本任务中,您将获得一张图片。你的任务是描述它。", + "这张图片的内容是什么。", + "请简单描述一下这张图片。", + ], + "en": [ + 'A short image caption:', + 'A short image description:', + 'A photo of', + 'An image that shows', + 'Write a short description for the image.', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the 
picture.', + ] + } + self.stage = 'first' # 一阶段和二阶段代表有无instruct部分 + + # 只需要写qa拼接逻辑 + def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts): + input_part_targets_len = [] + llm_tokens = {"input_ids": [], "attention_mask": []} + for i in range(input_ids.size(0)): + this_input_ones = input_atts[i].sum() + input_part_targets_len.append(this_input_ones) + llm_tokens['input_ids'].append( + torch.cat([ + input_ids[i][:this_input_ones], + output_ids[i][1:], + input_ids[i][this_input_ones:] + ]) + ) + llm_tokens['attention_mask'].append( + torch.cat([ + input_atts[i][:this_input_ones], + output_atts[i][1:], + input_atts[i][this_input_ones:] + ]) + ) + llm_tokens['input_ids'] = torch.stack(llm_tokens['input_ids']) + llm_tokens['attention_mask'] = torch.stack(llm_tokens['attention_mask']) + return llm_tokens, input_part_targets_len + + def __call__(self, inputs): + # samples = [] + images = [] + grounding_pixel_values = [] + ram_pixel_values = [] + questions = [] + answers = [] + + for (cnt, i) in enumerate(inputs): + if 'npy_path' in i: + instance_image = Image.fromarray(np.load(i['npy_path'])) + elif 'img_path' in i: + try: + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + elif "image" in i and i["image"] is not None: + instance_image = i["image"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "img" in i and i["img"] is not None: + instance_image = i["img"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + else: + raise ValueError('no img path in samples') + + if 'caption' in i: + answer = i['caption'] + ' ' + self.eos_token + prompts = self.prompts['en'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'caption_zh' in i: + answer = i['caption_zh'] + ' ' + self.eos_token + prompts = self.prompts['zh'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'text' in i: + answer = i['text']['answer'] + ' ' + self.eos_token + prompt = i['text']['question'] + # elif 'caption_zh' in i: + # caption = i['caption_zh'] + + images.append(instance_image) + grounding_pixel_values.append(self.grounding_transforms(instance_image, None)[0]) + ram_pixel_values.append(self.ram_transforms(instance_image)) + questions.append(prompt) + answers.append(answer) + + self.processor.tokenizer.truncation_side = "left" + text_input_tokens = self.processor(text=questions, padding="longest", truncation=True, max_length=self.max_txt_len, return_tensors="pt") + # print('text_input_tokens:', text_input_tokens.input_ids) + + self.processor.tokenizer.truncation_side = 'right' + text_output_tokens = self.processor(text=answers, padding="longest", truncation=True, max_length=self.max_output_txt_len, return_tensors="pt") + # print('text_output_tokens:', text_output_tokens.input_ids) + llm_tokens, input_part_targets_len = self.concat_text_input_output( + text_input_tokens.input_ids, + text_input_tokens.attention_mask, + text_output_tokens.input_ids, + text_output_tokens.attention_mask, + ) + + labels = llm_tokens['input_ids'].masked_fill( + llm_tokens['input_ids'] == self.processor.tokenizer.pad_token_id, -100 + ) + for i, l in enumerate(input_part_targets_len): + labels[i][:l] = -100 + # labels = text_input_tokens.input_ids + + images_pixel_values = self.processor.image_processor(images=images, return_tensors="pt") + + model_inputs = { + "pixel_values":images_pixel_values['pixel_values'], + 
"grounding_pixel_values": grounding_pixel_values, + "ram_pixel_values": torch.stack(ram_pixel_values), + "input_ids": llm_tokens['input_ids'], + "attention_mask": llm_tokens['attention_mask'], + "labels": labels, + } + return model_inputs + + +class Lyrics(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Lyrics') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + parser.add_argument('--freeze_qformer', default=False, action='store_true') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + # self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.model = LyricsLMForConditionalGeneration.from_pretrained(args.model_path) + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + + self.model.box_threshold = 0.25 + self.model.text_threshold = 0.2 + self.model.iou_threshold = 0.6 + + if args.freeze_image_tower: + self.model.vision_model.eval() + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.ram.eval() + for param in self.model.ram.parameters(): + param.requires_grad = False + self.model.grounding_dino.eval() + for param in self.model.grounding_dino.parameters(): + param.requires_grad = False + if args.freeze_qformer: + # freeze qformer, minigpt4 + self.model.qformer.eval() + self.model.qformer.requires_grad_(False) + self.model.query_tokens.requires_grad_(False) + self.model.language_model.eval() + self.model.language_model.requires_grad_(False) + + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + self.steps_per_epoch = self.total_steps // self.trainer.max_epochs + print('Total steps: {}' .format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def detokenize(self, token_ids): + toks = self.processor.tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.tokenizer.convert_tokens_to_string(toks) + + def qformer_detokenize(self, token_ids): + toks = self.processor.qformer_tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.qformer_tokenizer.convert_tokens_to_string(toks) + + def training_step(self, batch): + if self.trainer.global_rank == 0: + global SHOW_DATA + if self.trainer.global_step % 1000 == 0: + SHOW_DATA = True + print(f"input_ids: {batch['input_ids'][0]}") + print(f"input: {self.detokenize(batch['input_ids'][0])}") + print(f"labels: {batch['labels'][0]}") + + output = self.model(**batch) + return output.loss + + def validation_step(self, batch, batch_idx): + raise Exception("not impl") + + def on_load_checkpoint(self, checkpoint) -> None: + # 兼容低版本lightning,低版本lightning从ckpt起来时steps数会被重置为0 + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # 保存的时候把权重按huggingface的形式保存出来 + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + + +if __name__ == '__main__': + 
args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = Lyrics.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = UniversalCheckpoint(args) + + # wandb_logger = WandbLogger(project="Lyrics") # 初始化个WandbLogger对象 + trainer = Trainer.from_argparse_args(args, + # logger=wandb_logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = Lyrics(args) + collate_fn = Collator(args) + datasets = load_data(args, global_rank=trainer.global_rank) + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + # trainer.fit(model, datamoule) + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) \ No newline at end of file diff --git a/fengshen/examples/Lyrics/pretrain_lyrics_stage2_instruct.py b/fengshen/examples/Lyrics/pretrain_lyrics_stage2_instruct.py new file mode 100644 index 0000000..0fa31c2 --- /dev/null +++ b/fengshen/examples/Lyrics/pretrain_lyrics_stage2_instruct.py @@ -0,0 +1,369 @@ +from pytorch_lightning import ( + LightningModule, + Trainer, +) +from pytorch_lightning.loggers import ( + WandbLogger +) +from pytorch_lightning.callbacks import ( + LearningRateMonitor, +) +from fengshen.models.lyrics.modeling_lyrics import LyricsLMForConditionalGeneration +import fengshen.models.lyrics.groundingdino.transforms as T +from fengshen.models.lyrics.configuration_lyrics import LyricsConfig +from fengshen.models.model_utils import ( + add_module_args, + configure_optimizers, + get_total_steps, +) +import argparse +from peft import LoraConfig, get_peft_config, get_peft_model +from fengshen.data.universal_datamodule import UniversalDataModule +from fengshen.data.taiyi_stable_diffusion_datasets.taiyi_datasets import add_data_args, load_data +from fengshen.utils.universal_checkpoint import UniversalCheckpoint +import numpy as np +from torchvision.transforms import Normalize, Compose, Resize, RandomResizedCrop, InterpolationMode, ToTensor, RandomHorizontalFlip +from PIL import Image +from transformers import BertTokenizer, Blip2Processor, InstructBlipProcessor, InstructBlipForConditionalGeneration, LlamaTokenizer +from torch.utils.data._utils.collate import default_collate +import os +import torch +import random +from io import BytesIO +from base64 import b64decode + +# BlipImageProcessor +OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + +class Collator(): + def __init__(self, args): + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + self.eos_token = self.processor.tokenizer.eos_token + self.grounding_transforms = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + self.ram_transforms = Compose([ + Resize((384, 384)), + ToTensor(), + Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + ]) + self.max_txt_len = 24 + self.max_output_txt_len = 40 + self.prompts = { + "zh":[ + "在本任务中,您将获得一张图片,您的任务是生成该图片的描述。", + "在这项任务中,您将获得一篇图片。你的任务是用一句话概括这张图片。", + "为给定的图片生成一个适当的描述。", + "本任务中,您将获得一张图片。你的任务是描述它。", + "这张图片的内容是什么。", + "请简单描述一下这张图片。", + ], + "en": 
[ + 'A short image caption:', + 'A short image description:', + 'A photo of', + 'An image that shows', + 'Write a short description for the image.', + 'Write a description for the photo.', + 'Provide a description of what is presented in the photo.', + 'Briefly describe the content of the image.', + 'Can you briefly explain what you see in the image?', + 'Could you use a few words to describe what you perceive in the photo?', + 'Please provide a short depiction of the picture.', + 'Using language, provide a short account of the image.', + 'Use a few words to illustrate what is happening in the picture.', + ] + } + self.stage = 'second' # 一阶段和二阶段代表有无instruct部分 + + def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts): + input_part_targets_len = [] + llm_tokens = {"input_ids": [], "attention_mask": []} + for i in range(input_ids.size(0)): + this_input_ones = input_atts[i].sum() + input_part_targets_len.append(this_input_ones) + llm_tokens['input_ids'].append( + torch.cat([ + input_ids[i][:this_input_ones], + output_ids[i][1:], + input_ids[i][this_input_ones:] + ]) + ) + llm_tokens['attention_mask'].append( + torch.cat([ + input_atts[i][:this_input_ones], + output_atts[i][1:], + input_atts[i][this_input_ones:] + ]) + ) + llm_tokens['input_ids'] = torch.stack(llm_tokens['input_ids']) + llm_tokens['attention_mask'] = torch.stack(llm_tokens['attention_mask']) + return llm_tokens, input_part_targets_len + + def __call__(self, inputs): + # samples = [] + images = [] + grounding_pixel_values = [] + ram_pixel_values = [] + questions = [] + answers = [] + + for (cnt, i) in enumerate(inputs): + if 'npy_path' in i: + instance_image = Image.fromarray(np.load(i['npy_path'])) + elif 'img_path' in i: + try: + instance_image = Image.open(i['img_path']) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + elif "image" in i and i["image"] is not None: + instance_image = i["image"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "img" in i and i["img"] is not None: + instance_image = i["img"] + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + elif "image_base64_str" in i: + try: + instance_image = Image.open(BytesIO(b64decode(i["image_base64_str"][0]))) + if not instance_image.mode == "RGB": + instance_image = instance_image.convert("RGB") + except: + continue + else: + raise ValueError('no img path in samples') + if 1 in instance_image.size: + continue + + if 'caption' in i: + answer = i['caption'] + ' ' + self.eos_token + prompts = self.prompts['en'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'caption_zh' in i: + answer = i['caption_zh'] + ' ' + self.eos_token + prompts = self.prompts['zh'] + prompt = prompts[random.randint(0, len(prompts) - 1)] + elif 'text' in i: + answer = i['text'][0]['answer'] + ' ' + self.eos_token + prompt = i['text'][0]['question'] + elif 'instruction' in i: + answer = i['outputs'] + ' ' + self.eos_token + # if random.random() <=0.15: + prompt = i['instruction'] + i['inputs'] + # elif 'caption_zh' in i: + # caption = i['caption_zh'] + elif 'question' in i: + answer = i['answer']+ ' ' + self.eos_token + prompt = i['question'] + + images.append(instance_image) + grounding_pixel_values.append(self.grounding_transforms(instance_image, None)[0]) + ram_pixel_values.append(self.ram_transforms(instance_image)) + questions.append(prompt) + answers.append(answer) + + 
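The splice-and-mask step below is easiest to see on toy tensors; this sketch uses made-up token ids (pad id 0, BOS 1, EOS 2) and mirrors what concat_text_input_output and the label masking produce:

import torch
# question [11, 12] padded to length 3; answer [BOS=1, 21, 22, EOS=2]
input_ids, input_atts = torch.tensor([11, 12, 0]), torch.tensor([1, 1, 0])
output_ids = torch.tensor([1, 21, 22, 2])
n = input_atts.sum()  # 2 real question tokens
spliced = torch.cat([input_ids[:n], output_ids[1:], input_ids[n:]])
# spliced -> tensor([11, 12, 21, 22, 2, 0]): answer (minus its leading BOS) spliced in before the question padding
labels = spliced.masked_fill(spliced == 0, -100)
labels[:n] = -100
# labels  -> tensor([-100, -100, 21, 22, 2, -100]): the LM loss is computed only on the answer tokens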
self.processor.tokenizer.truncation_side = "left" + # print(questions) + # + text_input_tokens = self.processor(text=questions, padding="longest", truncation=True, max_length=self.max_txt_len, return_tensors="pt") + # print('text_input_tokens:', text_input_tokens.input_ids) + + self.processor.tokenizer.truncation_side = 'right' + text_output_tokens = self.processor(text=answers, padding="longest", truncation=True, max_length=self.max_output_txt_len, return_tensors="pt") + # print('text_output_tokens:', text_output_tokens.input_ids) + + llm_tokens, input_part_targets_len = self.concat_text_input_output( + text_input_tokens.input_ids, + text_input_tokens.attention_mask, + text_output_tokens.input_ids, + text_output_tokens.attention_mask, + ) + + labels = llm_tokens['input_ids'].masked_fill( + llm_tokens['input_ids'] == self.processor.tokenizer.pad_token_id, -100 + ) + for i, l in enumerate(input_part_targets_len): + labels[i][:l] = -100 + + images_pixel_values = self.processor.image_processor(images=images, return_tensors="pt") + # images_pixel_values = torch.stack(images_pixel_values['pixel_values']) + model_inputs = { + "pixel_values":images_pixel_values['pixel_values'], + "grounding_pixel_values": grounding_pixel_values, + "ram_pixel_values": torch.stack(ram_pixel_values), + "input_ids": llm_tokens['input_ids'], + "attention_mask": llm_tokens['attention_mask'], + "qformer_input_ids": text_input_tokens.qformer_input_ids, + "qformer_attention_mask": text_input_tokens.qformer_attention_mask, + "labels": labels, + } + return model_inputs + + +class Lyrics(LightningModule): + @staticmethod + def add_module_specific_args(parent_parser): + parser = parent_parser.add_argument_group('Lyrics') + parser.add_argument('--freeze_image_tower', default=False, action='store_true') + parser.add_argument('--freeze_qformer', default=False, action='store_true') + parser.add_argument('--lora-r', type=int, default=8, + help='curvature.') + parser.add_argument('--inference_mode', type=bool, default=False, + help='The inference mode.') + parser.add_argument('--lora-alpha', type=int, default=32, + help='The initialization coefficient of lora-alpha.') + parser.add_argument('--lora-dropout', type=int, default=0.05, + help='The initialization coefficient of lora_dropout.') + parser.add_argument('--use-lora', action='store_true', help='LORA.') + return parent_parser + + def __init__(self, args, **kwargs) -> None: + super().__init__() + self.save_hyperparameters(args) + + # self.model = LyricsQFromerForPretrain.from_pretrained(args.model_path, ignore_mismatched_sizes=True) + self.model = LyricsLMForConditionalGeneration.from_pretrained(args.model_path) + self.processor = InstructBlipProcessor.from_pretrained(args.model_path, padding_side = "right") + + self.model.box_threshold = 0.25 + self.model.text_threshold = 0.2 + self.model.iou_threshold = 0.6 + + if args.freeze_image_tower: + self.model.vision_model.eval() + for param in self.model.vision_model.parameters(): + param.requires_grad = False + self.model.ram.eval() + for param in self.model.ram.parameters(): + param.requires_grad = False + self.model.grounding_dino.eval() + for param in self.model.grounding_dino.parameters(): + param.requires_grad = False + # if args.freeze_qformer: + # self.model.qformer.eval() + # self.model.qformer.requires_grad_(False) + # self.model.query_tokens.requires_grad_(False) + # freeze lm + if args.use_lora: + # for param in self.model.parameters(): + # # freeze base model's layers + # param.requires_grad = False + peft_config = 
LoraConfig( + target_modules=r'.*language_model.*\.(q_proj|k_proj|v_proj)', + inference_mode=args.inference_mode, + r=args.lora_r, + lora_alpha=args.lora_alpha, + lora_dropout=args.lora_dropout + ) + self.model = get_peft_model(self.model, peft_config) + # self.model.base_model.model.qformer.train() + # self.model.base_model.model.qformer.requires_grad_(True) + # self.model.base_model.model.query_tokens.requires_grad_(True) + # self.model.base_model.model.language_projection.train() + # self.model.base_model.model.language_projection.requires_grad_(True) + self.model.print_trainable_parameters() + elif args.freeze_qformer: + self.model.language_model.eval() + self.model.language_model.requires_grad_(False) + self.model.qformer.eval() + self.model.qformer.requires_grad_(False) + self.model.query_tokens.requires_grad_(False) + else: + self.model.language_model.eval() + self.model.language_model.requires_grad_(False) + # for name, param in self.model.named_parameters(): + # if param.requires_grad == True: + # print(name) + + + + def setup(self, stage) -> None: + if stage == 'fit': + self.total_steps = get_total_steps(self.trainer, self.hparams) + self.steps_per_epoch = self.total_steps // self.trainer.max_epochs + print('Total steps: {}'.format(self.total_steps)) + elif stage == 'validate': + self.total_steps = 100 + + def configure_optimizers(self): + return configure_optimizers(self) + + def detokenize(self, token_ids): + toks = self.processor.tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.tokenizer.convert_tokens_to_string(toks) + + def qformer_detokenize(self, token_ids): + toks = self.processor.qformer_tokenizer.convert_ids_to_tokens(token_ids) + return self.processor.qformer_tokenizer.convert_tokens_to_string(toks) + + def training_step(self, batch): + if self.trainer.global_rank == 0: + global SHOW_DATA + if self.trainer.global_step % 1000 == 0: + SHOW_DATA = True + print(f"input_ids: {batch['input_ids'][0]}") + print(f"input: {self.detokenize(batch['input_ids'][0])}") + print(f"labels_id: {batch['labels'][0]}") + print(f"qformer_input_ids: {batch['qformer_input_ids'][0]}") + print(f"qformer_input: {self.qformer_detokenize(batch['qformer_input_ids'][0])}") + + output = self.model(**batch) + return output.loss + + def validation_step(self, batch, batch_idx): + raise NotImplementedError("validation is not implemented") + + def on_load_checkpoint(self, checkpoint) -> None: + # Compatibility with older lightning versions, which reset the step count to 0 when resuming from a ckpt + global_step_offset = checkpoint["global_step"] + if 'global_samples' in checkpoint: + self.consumed_samples = checkpoint['global_samples'] + self.trainer.fit_loop.epoch_loop._batches_that_stepped = global_step_offset + + def on_save_checkpoint(self, checkpoint) -> None: + # Also export the weights in HuggingFace format when saving a checkpoint + if self.global_rank == 0: + dir_path = os.path.join( + self.hparams.default_root_dir, f'hf_out_{self.trainer.current_epoch}_{self.trainer.global_step}') + if not os.path.exists(dir_path): + os.mkdir(dir_path) + self.model.save_pretrained(dir_path) + + +if __name__ == '__main__': + args_parser = argparse.ArgumentParser() + args_parser = add_module_args(args_parser) + args_parser = add_data_args(args_parser) + args_parser = UniversalDataModule.add_data_specific_args(args_parser) + args_parser = Trainer.add_argparse_args(args_parser) + args_parser = Lyrics.add_module_specific_args(args_parser) + args_parser = UniversalCheckpoint.add_argparse_args(args_parser) + args = args_parser.parse_args() + + lr_monitor = LearningRateMonitor(logging_interval='step') + checkpoint_callback = 
UniversalCheckpoint(args) + + # wandb_logger = WandbLogger(project="Lyrics") # 初始化个WandbLogger对象 + trainer = Trainer.from_argparse_args(args, + # logger=wandb_logger, + callbacks=[ + lr_monitor, + checkpoint_callback]) + + model = Lyrics(args) + collate_fn = Collator(args) + datasets = load_data(args, global_rank=trainer.global_rank) + # print(datasets) + datamoule = UniversalDataModule( + tokenizer=None, collate_fn=collate_fn, args=args, datasets=datasets) + # trainer.fit(model, datamoule) + trainer.fit(model, datamoule, ckpt_path=args.load_ckpt_path) diff --git a/fengshen/models/Lyrics/backbone/__init__.py b/fengshen/models/Lyrics/backbone/__init__.py new file mode 100644 index 0000000..c0de392 --- /dev/null +++ b/fengshen/models/Lyrics/backbone/__init__.py @@ -0,0 +1,3 @@ +from fengshen.models.groundedblip.backbone.backbone import Joiner +from fengshen.models.groundedblip.backbone.swin_transformer import SwinTransformer +from fengshen.models.groundedblip.backbone.position_encoding import PositionEmbeddingSineHW \ No newline at end of file diff --git a/fengshen/models/Lyrics/backbone/backbone.py b/fengshen/models/Lyrics/backbone/backbone.py new file mode 100644 index 0000000..2e82c34 --- /dev/null +++ b/fengshen/models/Lyrics/backbone/backbone.py @@ -0,0 +1,53 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +""" +Backbone modules. 
+""" + +from typing import Dict, List + +import torch +import torch.nn.functional as F +import torchvision +from torch import nn +from torchvision.models._utils import IntermediateLayerGetter + +from fengshen.models.groundedblip.groundingdino.utils import NestedTensor, clean_state_dict, is_main_process + +from fengshen.models.groundedblip.backbone.swin_transformer import SwinTransformer +from fengshen.models.groundedblip.backbone.position_encoding import PositionEmbeddingSineHW + +class Joiner(nn.Module): + def __init__(self, args): + super().__init__() + self.swintransformer = SwinTransformer(args) + self.position_embedding = PositionEmbeddingSineHW(args.hidden_dim, + temperatureh=args.pe_temperatureh, + temperaturew=args.pe_temperaturew, + normalize=True, + ) + bb_num_channels = self.swintransformer.num_features[4 - len(tuple(args.return_interm_indices)) :] + self.num_channels = bb_num_channels + + def forward(self, tensor_list: NestedTensor): + xs = self.swintransformer(tensor_list) + out: List[NestedTensor] = [] + pos = [] + for name, x in xs.items(): + out.append(x) + # position encoding + pos.append(self.position_embedding(x).to(x.tensors.dtype)) + + return out, pos diff --git a/fengshen/models/Lyrics/backbone/position_encoding.py b/fengshen/models/Lyrics/backbone/position_encoding.py new file mode 100644 index 0000000..20407fa --- /dev/null +++ b/fengshen/models/Lyrics/backbone/position_encoding.py @@ -0,0 +1,119 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copied from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +""" +Various positional encodings for the transformer. +""" +import math + +import torch +from torch import nn + +from fengshen.models.groundedblip.groundingdino.utils import NestedTensor + +class PositionEmbeddingSineHW(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. 
+ """ + + def __init__( + self, num_pos_feats=64, temperatureh=10000, temperaturew=10000, normalize=False, scale=None + ): + super().__init__() + self.num_pos_feats = num_pos_feats // 2 + self.temperatureh = temperatureh + self.temperaturew = temperaturew + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + mask = tensor_list.mask + assert mask is not None + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + + # import ipdb; ipdb.set_trace() + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_tx = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_tx = self.temperaturew ** (2 * (torch.div(dim_tx, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_x = x_embed[:, :, :, None] / dim_tx + + dim_ty = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_ty = self.temperatureh ** (2 * (torch.div(dim_ty, 2, rounding_mode='floor')) / self.num_pos_feats) + pos_y = y_embed[:, :, :, None] / dim_ty + + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + + # import ipdb; ipdb.set_trace() + + return pos + + +class PositionEmbeddingLearned(nn.Module): + """ + Absolute pos embedding, learned. + """ + + def __init__(self, num_pos_feats=256): + super().__init__() + self.row_embed = nn.Embedding(50, num_pos_feats) + self.col_embed = nn.Embedding(50, num_pos_feats) + self.reset_parameters() + + def reset_parameters(self): + nn.init.uniform_(self.row_embed.weight) + nn.init.uniform_(self.col_embed.weight) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + h, w = x.shape[-2:] + i = torch.arange(w, device=x.device) + j = torch.arange(h, device=x.device) + x_emb = self.col_embed(i) + y_emb = self.row_embed(j) + pos = ( + torch.cat( + [ + x_emb.unsqueeze(0).repeat(h, 1, 1), + y_emb.unsqueeze(1).repeat(1, w, 1), + ], + dim=-1, + ) + .permute(2, 0, 1) + .unsqueeze(0) + .repeat(x.shape[0], 1, 1, 1) + ) + return pos diff --git a/fengshen/models/Lyrics/backbone/swin_transformer.py b/fengshen/models/Lyrics/backbone/swin_transformer.py new file mode 100644 index 0000000..bfa6316 --- /dev/null +++ b/fengshen/models/Lyrics/backbone/swin_transformer.py @@ -0,0 +1,788 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# -------------------------------------------------------- +# modified from https://github.com/SwinTransformer/Swin-Transformer-Object-Detection/blob/master/mmdet/models/backbones/swin_transformer.py +# -------------------------------------------------------- + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from fengshen.models.groundedblip.groundingdino.utils import NestedTensor + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + if v.dtype == torch.half: + attn = self.attn_drop(attn).half() + # attn = self.attn_drop(attn) + elif v.dtype == torch.bfloat16: + attn = torch.tensor(attn, dtype=torch.bfloat16) + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. 
Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). 
+ H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + dilation (bool): if True, the output size if 16x downsample, ow 32x downsample. + """ + + def __init__( + self, + args, + ): + super().__init__() + + self.pretrain_img_size = args.pretrain_img_size + self.num_layers = args.num_layers + self.embed_dim = args.embed_dim + self.ape = args.ape + self.patch_norm = args.patch_norm + self.out_indices = args.out_indices + self.frozen_stages = args.frozen_stages + self.dilation = args.dilation + + # if use_checkpoint: + # print("use_checkpoint!!!!!!!!!!!!!!!!!!!!!!!!") + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=args.patch_size, + in_chans=args.in_chans, + embed_dim=args.embed_dim, + norm_layer=nn.LayerNorm if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, args.embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=args.drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, args.drop_path_rate, sum(args.depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + # prepare downsample list + downsamplelist = [PatchMerging for i in range(self.num_layers)] + downsamplelist[-1] = None + num_features = [int(args.embed_dim * 2**i) for i in range(self.num_layers)] + if self.dilation: + downsamplelist[-2] = None + num_features[-1] = int(args.embed_dim * 2 ** (self.num_layers - 1)) // 2 + for i_layer in range(self.num_layers): + layer = BasicLayer( + # dim=int(embed_dim * 2 ** i_layer), + dim=num_features[i_layer], + depth=args.depths[i_layer], + num_heads=args.num_heads[i_layer], + window_size=args.window_size, + mlp_ratio=args.mlp_ratio, + qkv_bias=args.qkv_bias, + qk_scale=args.qk_scale, + drop=args.drop_rate, + attn_drop=args.attn_drop_rate, + drop_path=dpr[sum(args.depths[:i_layer]) : sum(args.depths[: i_layer + 1])], + norm_layer=nn.LayerNorm, + # downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + downsample=downsamplelist[i_layer], + use_checkpoint=args.swintransformer_use_checkpoint, + ) + self.layers.append(layer) + + # num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in tuple(args.out_indices): + layer = nn.LayerNorm(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if 
self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + # def init_weights(self, pretrained=None): + # """Initialize the weights in backbone. + # Args: + # pretrained (str, optional): Path to pre-trained weights. + # Defaults to None. + # """ + + # def _init_weights(m): + # if isinstance(m, nn.Linear): + # trunc_normal_(m.weight, std=.02) + # if isinstance(m, nn.Linear) and m.bias is not None: + # nn.init.constant_(m.bias, 0) + # elif isinstance(m, nn.LayerNorm): + # nn.init.constant_(m.bias, 0) + # nn.init.constant_(m.weight, 1.0) + + # if isinstance(pretrained, str): + # self.apply(_init_weights) + # logger = get_root_logger() + # load_checkpoint(self, pretrained, strict=False, logger=logger) + # elif pretrained is None: + # self.apply(_init_weights) + # else: + # raise TypeError('pretrained must be a str or None') + + def forward_raw(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + # import ipdb; ipdb.set_trace() + + if i in tuple(self.out_indices): + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # in: + # torch.Size([2, 3, 1024, 1024]) + # outs: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + return tuple(outs) + + def forward(self, tensor_list: NestedTensor): + x = tensor_list.tensors + + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = [] + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in tuple(self.out_indices): + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs.append(out) + # print(out.shape) + # in: + # torch.Size([2, 3, 1024, 1024]) + # out: + # [torch.Size([2, 192, 256, 256]), torch.Size([2, 384, 128, 128]), \ + # torch.Size([2, 768, 64, 64]), torch.Size([2, 1536, 32, 32])] + + # collect for nesttensors + outs_dict = {} + for idx, out_i in enumerate(outs): + m = tensor_list.mask + assert m is not None + mask = F.interpolate(m[None].float(), size=out_i.shape[-2:]).to(torch.bool)[0] + outs_dict[idx] = NestedTensor(out_i, mask) + # print(out_i.shape) + + return outs_dict + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + 
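A note on the refactor above: `SwinTransformer.__init__` now consumes a single `args`/config namespace, while `build_swin_transformer` below still calls the original keyword-style constructor, so the two are not currently interchangeable. The minimal sketch below (not part of the diff) shows one way the args-driven class could be driven from the `LyricsDetectionConfig` added later in this PR, which carries every field the constructor reads; the import path is an assumption and may not match the final package layout.

```python
# Sketch only, under the assumptions stated above.
import torch
from fengshen.models.lyrics.configuration_lyrics import LyricsDetectionConfig  # assumed path

cfg = LyricsDetectionConfig()      # swin-T style defaults: embed_dim=96, depths=[2, 2, 6, 2]
backbone = SwinTransformer(cfg)    # reads cfg.embed_dim, cfg.depths, cfg.out_indices, ...

feats = backbone.forward_raw(torch.rand(2, 3, 1024, 1024))
for f in feats:
    print(f.shape)                 # one (B, C, H, W) feature map per entry in cfg.out_indices
```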
+def build_swin_transformer(modelname, pretrain_img_size, **kw): + assert modelname in [ + "swin_T_224_1k", + "swin_B_224_22k", + "swin_B_384_22k", + "swin_L_224_22k", + "swin_L_384_22k", + ] + + model_para_dict = { + "swin_T_224_1k": dict( + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7 + ), + "swin_B_224_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=7 + ), + "swin_B_384_22k": dict( + embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12 + ), + "swin_L_224_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=7 + ), + "swin_L_384_22k": dict( + embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12 + ), + } + kw_cgf = model_para_dict[modelname] + kw_cgf.update(kw) + model = SwinTransformer(pretrain_img_size=pretrain_img_size, **kw_cgf) + return model + + +if __name__ == "__main__": + model = build_swin_transformer("swin_L_384_22k", 384, dilation=True) + x = torch.rand(2, 3, 1024, 1024) + y = model.forward_raw(x) + import ipdb + + ipdb.set_trace() + x = torch.rand(2, 3, 384, 384) + y = model.forward_raw(x) diff --git a/fengshen/models/Lyrics/configuration_lyrics.py b/fengshen/models/Lyrics/configuration_lyrics.py new file mode 100644 index 0000000..c4764de --- /dev/null +++ b/fengshen/models/Lyrics/configuration_lyrics.py @@ -0,0 +1,616 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" BLIP-2 model configuration""" + +import copy +import os +from typing import Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from transformers.utils import logging +from transformers.models.auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +# BLIP_2_PRETRAINED_CONFIG_ARCHIVE_MAP = { +# "salesforce/blip2-opt-2.7b": "https://huggingface.co/salesforce/blip2-opt-2.7b/resolve/main/config.json", +# } + + +class LyricsVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a + BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. 
+ num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + + Example: + + ```python + >>> from transformers import Blip2VisionConfig, Blip2VisionModel + + >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2VisionConfig() + + >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_vision_model" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=0.00001, + attention_dropout=0.0, + initializer_range=1e-10, + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + + return cls.from_dict(config_dict, **kwargs) + +class LyricsDetectionConfig(PretrainedConfig): + + model_type = "blip_2_detection_model" + + def __init__( + self, + backbone = "swin_T_224_1k", + position_embedding = "sine", + pe_temperatureh = 20, + pe_temperaturew = 20, + return_interm_indices = [1, 2, 3], + backbone_freeze_keywords = None, + enc_layers = 6, + num_unicoder_layers = 0, + dec_layers = 6, + pre_norm = False, + dim_feedforward = 2048, + hidden_dim = 256, + dropout = 0.0, + nheads = 8, + num_queries = 900, + aux_loss = True, + iter_update = True, + dn_number = 0, + query_dim = 4, + num_patterns = 0, + num_feature_levels = 4, + enc_n_points = 4, + dec_n_points = 4, + learnable_tgt_init = True, + two_stage_type = "standard", + two_stage_bbox_embed_share = False, + two_stage_class_embed_share = False, + transformer_activation = "relu", + return_intermediate_dec = True, + dec_pred_bbox_embed_share = True, + dn_box_noise_scale = 1.0, + dn_label_noise_ratio = 0.5, + dn_label_coef = 1.0, + dn_bbox_coef = 1.0, + embed_init_tgt = True, + dn_labelbook_size = 2000, + max_text_len = 256, + text_encoder_type = "bert-base-uncased", + use_checkpoint = True, + use_transformer_ckpt = True, + use_text_cross_attention = True, + text_dropout = 0.0, + fusion_dropout = 0.0, + fusion_droppath = 0.1, + sub_sentence_present = True, + pretrain_img_size = 224, + patch_size = 4, + in_chans = 3, + num_layers = 4, + embed_dim = 96, + depths = [2, 2, 6, 2], + num_heads = [3, 6, 12, 24], + window_size = 7, + mlp_ratio = 4.0, + qkv_bias = True, + qk_scale = None, + drop_rate = 0.0, + attn_drop_rate = 0.0, + drop_path_rate = 0.2, + swintransformer_use_checkpoint = False, + ape = False, + patch_norm = True, + out_indices = [1, 2, 3], + frozen_stages = -1, + dilation = True, + **kwargs, + ): + super().__init__(**kwargs) + + self.backbone = backbone + self.position_embedding = position_embedding + self.pe_temperatureh = pe_temperatureh + self.pe_temperaturew = pe_temperaturew + self.return_interm_indices = return_interm_indices + self.backbone_freeze_keywords = backbone_freeze_keywords + self.enc_layers = enc_layers + self.num_unicoder_layers = num_unicoder_layers + self.dec_layers = dec_layers + self.pre_norm = pre_norm + self.dim_feedforward = dim_feedforward + self.hidden_dim = hidden_dim + self.dropout = dropout + self.nheads = nheads + self.num_queries = num_queries + self.aux_loss = aux_loss + self.iter_update = iter_update + self.dn_number = dn_number + self.query_dim = query_dim + self.num_patterns = num_patterns + self.num_feature_levels = num_feature_levels + self.enc_n_points = enc_n_points + self.dec_n_points = dec_n_points + self.learnable_tgt_init = learnable_tgt_init + self.two_stage_type = two_stage_type + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + self.two_stage_class_embed_share = two_stage_class_embed_share + self.transformer_activation = transformer_activation + self.return_intermediate_dec = return_intermediate_dec + self.dec_pred_bbox_embed_share = dec_pred_bbox_embed_share + self.dn_box_noise_scale = dn_box_noise_scale + self.dn_label_noise_ratio = dn_label_noise_ratio + self.dn_label_coef = dn_label_coef + self.dn_bbox_coef = dn_bbox_coef + self.embed_init_tgt = embed_init_tgt + self.dn_labelbook_size = dn_labelbook_size + self.max_text_len = max_text_len + self.text_encoder_type = text_encoder_type + self.use_checkpoint = use_checkpoint + self.use_transformer_ckpt = use_transformer_ckpt + self.use_text_cross_attention = use_text_cross_attention + 
self.text_dropout = text_dropout + self.fusion_dropout = fusion_dropout + self.fusion_droppath = fusion_droppath + self.sub_sentence_present = sub_sentence_present + self.pretrain_img_size = pretrain_img_size + self.num_layers = num_layers + self.patch_size = patch_size + self.in_chans = in_chans + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.window_size = window_size + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.drop_rate = drop_rate + self.attn_drop_rate = attn_drop_rate + self.drop_path_rate = drop_path_rate + self.swintransformer_use_checkpoint = swintransformer_use_checkpoint + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.dilation = dilation + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["detection_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + +class LyricsRAMConfig(PretrainedConfig): + + model_type = "blip_2_ram_model" + + def __init__( + self, + med_config='/med_config.json', + image_size=384, + window_size=12, + vit='swin_l', + vit_grad_ckpt=False, + vit_ckpt_layer=0, + prompt='a picture of ', + threshold=0.68, + delete_tag_index=[], + tag_list='/ram_tag_list.txt', + tag_list_chinese='/ram_tag_list_chinese.txt', + vision_width=1536, + image_res=384, + embed_dim=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + **kwargs, + ): + super().__init__(**kwargs) + + self.med_config = med_config + self.image_size = image_size + self.window_size = window_size + self.vit = vit + self.vit_grad_ckpt = vit_grad_ckpt + self.vit_ckpt_layer = vit_ckpt_layer + self.prompt = prompt + self.threshold = threshold + self.delete_tag_index = delete_tag_index + self.tag_list = tag_list + self.tag_list_chinese = tag_list_chinese + self.vision_width = vision_width + self.image_res = image_res + self.embed_dim =embed_dim + self.depths = depths + self.num_heads = num_heads + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["ram_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + + return cls.from_dict(config_dict, **kwargs) + + +class LyricsQFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a + BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. 
+ + Examples: + + ```python + >>> from transformers import Blip2QFormerConfig, Blip2QFormerModel + + >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2QFormerConfig() + + >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + model_type = "blip_2_qformer" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + cross_attention_frequency=2, + encoder_hidden_size=1408, + detection_encoder_hidden_size=256, + query_length=96, + num_vit_query_tokens=32, + num_dino_query_tokens=64, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + self.detection_encoder_hidden_size = detection_encoder_hidden_size + self.query_length = query_length + self.num_vit_query_tokens = num_vit_query_tokens + self.num_dino_query_tokens = num_dino_query_tokens + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the qformer config dict if we are loading from Blip2Config + if config_dict.get("model_type") == "blip-2": + config_dict = config_dict["qformer_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class LyricsConfig(PretrainedConfig): + r""" + [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is + used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. 
+ + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import ( + ... Blip2VisionConfig, + ... Blip2QFormerConfig, + ... OPTConfig, + ... Blip2Config, + ... Blip2ForConditionalGeneration, + ... ) + + >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2Config() + + >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2ForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig + + >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations + >>> vision_config = Blip2VisionConfig() + >>> qformer_config = Blip2QFormerConfig() + >>> text_config = OPTConfig() + + >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "blip-2" + is_composition = True + + def __init__(self, vision_config=None, qformer_config=None, text_config=None, detection_config=None, ram_config=None, num_query_tokens=96, image_text_hidden_size=256,**kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the LyricsVisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the LyricsQFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + + if detection_config is None: + detection_config = {} + logger.info("detection_config is None. initializing the LyricsDetectionConfig with default values.") + + if ram_config is None: + ram_config = {} + logger.info("ram_config is None. 
Initializing the LyricsRAMConfig with default values.") + + self.vision_config = LyricsVisionConfig(**vision_config) + self.qformer_config = LyricsQFormerConfig(**qformer_config) + self.detection_config = LyricsDetectionConfig(**detection_config) + self.ram_config = LyricsRAMConfig(**ram_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" + self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + + self.tie_word_embeddings = self.text_config.tie_word_embeddings + self.is_encoder_decoder = self.text_config.is_encoder_decoder + + self.num_query_tokens = num_query_tokens + self.image_text_hidden_size = image_text_hidden_size + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: LyricsVisionConfig, + qformer_config: LyricsQFormerConfig, + text_config: PretrainedConfig, + detection_config: LyricsDetectionConfig, + ram_config: LyricsRAMConfig, + **kwargs, + ): + r""" + Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model + configurations. + + Returns: + [`Blip2Config`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict(), + detection_config=detection_config.to_dict(), + ram_config=ram_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output["vision_config"] = self.vision_config.to_dict() + output["qformer_config"] = self.qformer_config.to_dict() + output["text_config"] = self.text_config.to_dict() + output["detection_config"] = self.detection_config.to_dict() + output["ram_config"] = self.ram_config.to_dict() + output["model_type"] = self.__class__.model_type + return output diff --git a/fengshen/models/Lyrics/groundingdino/bertwarper.py b/fengshen/models/Lyrics/groundingdino/bertwarper.py new file mode 100644 index 0000000..f0cf977 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/bertwarper.py @@ -0,0 +1,273 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. 
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn +from torchvision.ops.boxes import nms +from transformers import BertConfig, BertModel, BertPreTrainedModel +from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions + + +class BertModelWarper(nn.Module): + def __init__(self, bert_model): + super().__init__() + # self.bert = bert_modelc + + self.config = bert_model.config + self.embeddings = bert_model.embeddings + self.encoder = bert_model.encoder + self.pooler = bert_model.pooler + + self.get_extended_attention_mask = bert_model.get_extended_attention_mask + self.invert_attention_mask = bert_model.invert_attention_mask + self.get_head_mask = bert_model.get_head_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). 
+ """ + output_attentions = ( + output_attentions if output_attentions is not None else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] if past_key_values is not None else 0 + ) + + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device + ) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + 
return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class TextEncoderShell(nn.Module): + def __init__(self, text_encoder): + super().__init__() + self.text_encoder = text_encoder + self.config = self.text_encoder.config + + def forward(self, **kw): + # feed into text encoder + return self.text_encoder(**kw) + + +def generate_masks_with_special_tokens(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long) + + +def generate_masks_with_special_tokens_and_transfer_map(tokenized, special_tokens_list, tokenizer): + """Generate attention mask between each pair of special tokens + Args: + input_ids (torch.Tensor): input ids. Shape: [bs, num_token] + special_tokens_mask (list): special tokens mask. + Returns: + torch.Tensor: attention mask between each special tokens. + """ + input_ids = tokenized["input_ids"] + bs, num_token = input_ids.shape + # special_tokens_mask: bs, num_token. 1 for special tokens. 
0 for normal tokens + special_tokens_mask = torch.zeros((bs, num_token), device=input_ids.device).bool() + for special_token in special_tokens_list: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = ( + torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(bs, 1, 1) + ) + position_ids = torch.zeros((bs, num_token), device=input_ids.device) + cate_to_token_mask_list = [[] for _ in range(bs)] + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + c2t_maski = torch.zeros((num_token), device=input_ids.device).bool() + c2t_maski[previous_col + 1 : col] = True + cate_to_token_mask_list[row].append(c2t_maski) + previous_col = col + + cate_to_token_mask_list = [ + torch.stack(cate_to_token_mask_listi, dim=0) + for cate_to_token_mask_listi in cate_to_token_mask_list + ] + + # # padding mask + # padding_mask = tokenized['attention_mask'] + # attention_mask = attention_mask & padding_mask.unsqueeze(1).bool() & padding_mask.unsqueeze(2).bool() + + return attention_mask, position_ids.to(torch.long), cate_to_token_mask_list diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn.h b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn.h new file mode 100644 index 0000000..c7408eb --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn.h @@ -0,0 +1,64 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once + +#include "ms_deform_attn_cpu.h" + +#ifdef WITH_CUDA +#include "ms_deform_attn_cuda.h" +#endif + +namespace groundingdino { + +at::Tensor +ms_deform_attn_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_forward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +ms_deform_attn_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_backward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp new file mode 100644 index 0000000..551243f --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.cpp @@ -0,0 +1,43 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include + +#include +#include + +namespace groundingdino { + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + AT_ERROR("Not implement on cpu"); +} + +} // namespace groundingdino diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.h b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.h new file mode 100644 index 0000000..b2b88e8 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cpu.h @@ -0,0 +1,35 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +namespace groundingdino { + +at::Tensor +ms_deform_attn_cpu_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector +ms_deform_attn_cpu_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + +} // namespace groundingdino diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.cu b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.cu new file mode 100644 index 0000000..d04fae8 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.cu @@ -0,0 +1,156 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#include +#include "ms_deform_im2col_cuda.cuh" + +#include +#include +#include +#include + +namespace groundingdino { + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + columns.data()); + + })); + } + + output = output.view({batch, num_query, num_heads*channels}); + + return output; +} + + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + 
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto grad_value = at::zeros_like(value); + auto grad_sampling_loc = at::zeros_like(sampling_loc); + auto grad_attn_weight = at::zeros_like(attn_weight); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), + grad_output_g.data(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + grad_value.data() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); + + })); + } + + return { + grad_value, grad_sampling_loc, grad_attn_weight + }; +} + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.h b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.h new file mode 100644 index 0000000..ad1311a --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_attn_cuda.h @@ -0,0 +1,33 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +#pragma once +#include + +namespace groundingdino { + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step); + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step); + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_im2col_cuda.cuh b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_im2col_cuda.cuh new file mode 100644 index 0000000..6bc2acb --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/MsDeformAttn/ms_deform_im2col_cuda.cuh @@ -0,0 +1,1327 @@ +/*! +************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************** +* Modified from DCN (https://github.com/msracver/Deformable-ConvNets) +* Copyright (c) 2018 Microsoft +************************************************************************** +*/ + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) +{ + return (N + num_threads - 1) / num_threads; +} + + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= 
width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; 
+ const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + + +template +__global__ void ms_deformable_im2col_gpu_kernel(const int n, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + col += 
ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockSize; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr 
+= 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockSize/2; s>0; s>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + 
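+        // Reduction strategy: the *_reduce_v1 kernels let thread 0 sum all blockSize
+        // per-thread partials serially, while this *_reduce_v2 kernel folds them with a
+        // pairwise (tree) reduction in log2(blockSize) steps. Both fixed-size variants
+        // assume blockDim.x == blockSize, i.e. one thread per channel of the same
+        // (batch, query, head) sample, which is why thread 0 can store the reduced
+        // gradients directly instead of using atomicAdd. Following the upstream
+        // Deformable-DETR kernels, the host launcher below presumably selects v1 for
+        // channel counts 1-32, v2 for 64-1024, the extern-shared-memory kernels for the
+        // remaining sizes below 1024, and the multi-block or global-memory atomic
+        // kernels once channels exceed 1024.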
data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + 
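+        // each sampling point owns one scalar attention weight and one (x, y) location,
+        // so the output-gradient pointers advance by grad_weight_stride (1) and
+        // grad_loc_stride (2) per point.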
grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 
+ 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + 
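+          // (x, y) partial sums are interleaved in shared memory, hence the doubled
+          // indices xid1 / xid2; the extra fold guarded by `tid + (s << 1) < spre`
+          // just below picks up the leftover element when blockDim.x is not a power of two.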
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, + const scalar_t* data_value, + const int64_t* data_spatial_shapes, + const int64_t* 
data_level_start_index, + const scalar_t* data_sampling_loc, + const scalar_t* data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* data_col) +{ + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +template +void ms_deformable_col2im_cuda(cudaStream_t stream, + const scalar_t* grad_col, + const scalar_t* data_value, + const int64_t * data_spatial_shapes, + const int64_t * data_level_start_index, + const scalar_t * data_sampling_loc, + const scalar_t * data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) + { + if ((channels & 1023) == 0) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_gm + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + else{ + switch(channels) + { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + 
grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + default: + if (channels < 64) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + 
data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/csrc/cuda_version.cu b/fengshen/models/Lyrics/groundingdino/csrc/cuda_version.cu new file mode 100644 index 0000000..64569e3 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/cuda_version.cu @@ -0,0 +1,7 @@ +#include + +namespace groundingdino { +int get_cudart_version() { + return CUDART_VERSION; +} +} // namespace groundingdino diff --git a/fengshen/models/Lyrics/groundingdino/csrc/vision.cpp b/fengshen/models/Lyrics/groundingdino/csrc/vision.cpp new file mode 100644 index 0000000..c1f2c50 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/csrc/vision.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved + +#include "MsDeformAttn/ms_deform_attn.h" + +namespace groundingdino { + +#ifdef WITH_CUDA +extern int get_cudart_version(); +#endif + +std::string get_cuda_version() { +#ifdef WITH_CUDA + std::ostringstream oss; + + // copied from + // https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/cuda/detail/CUDAHooks.cpp#L231 + auto printCudaStyleVersion = [&](int v) { + oss << (v / 1000) << "." << (v / 10 % 100); + if (v % 10 != 0) { + oss << "." << (v % 10); + } + }; + printCudaStyleVersion(get_cudart_version()); + return oss.str(); +#else + return std::string("not available"); +#endif +} + +// similar to +// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/Version.cpp +std::string get_compiler_version() { + std::ostringstream ss; +#if defined(__GNUC__) +#ifndef __clang__ + { ss << "GCC " << __GNUC__ << "." << __GNUC_MINOR__; } +#endif +#endif + +#if defined(__clang_major__) + { + ss << "clang " << __clang_major__ << "." << __clang_minor__ << "." + << __clang_patchlevel__; + } +#endif + +#if defined(_MSC_VER) + { ss << "MSVC " << _MSC_FULL_VER; } +#endif + return ss.str(); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); + m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); +} + +} // namespace groundingdino \ No newline at end of file diff --git a/fengshen/models/Lyrics/groundingdino/fuse_modules.py b/fengshen/models/Lyrics/groundingdino/fuse_modules.py new file mode 100644 index 0000000..cbafee1 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/fuse_modules.py @@ -0,0 +1,297 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import torch +import torch.nn as nn +import torch.nn.functional as F +from timm.models.layers import DropPath + + +class FeatureResizer(nn.Module): + """ + This class takes as input a set of embeddings of dimension C1 and outputs a set of + embedding of dimension C2, after a linear transformation, dropout and normalization (LN). 
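+
+    Illustrative usage (the shapes here are assumptions, not part of the original docs):
+
+        resizer = FeatureResizer(input_feat_size=768, output_feat_size=256, dropout=0.1)
+        text_feats = torch.randn(2, 195, 768)   # e.g. BERT last_hidden_state
+        resized = resizer(text_feats)           # -> (2, 195, 256)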
+ """ + + def __init__(self, input_feat_size, output_feat_size, dropout, do_ln=True): + super().__init__() + self.do_ln = do_ln + # Object feature encoding + self.fc = nn.Linear(input_feat_size, output_feat_size, bias=True) + self.layer_norm = nn.LayerNorm(output_feat_size, eps=1e-12) + self.dropout = nn.Dropout(dropout) + + def forward(self, encoder_features): + x = self.fc(encoder_features) + if self.do_ln: + x = self.layer_norm(x) + output = self.dropout(x) + return output + + +def l1norm(X, dim, eps=1e-8): + """L1-normalize columns of X""" + norm = torch.abs(X).sum(dim=dim, keepdim=True) + eps + X = torch.div(X, norm) + return X + + +def l2norm(X, dim, eps=1e-8): + """L2-normalize columns of X""" + norm = torch.pow(X, 2).sum(dim=dim, keepdim=True).sqrt() + eps + X = torch.div(X, norm) + return X + + +def func_attention(query, context, smooth=1, raw_feature_norm="softmax", eps=1e-8): + """ + query: (n_context, queryL, d) + context: (n_context, sourceL, d) + """ + batch_size_q, queryL = query.size(0), query.size(1) + batch_size, sourceL = context.size(0), context.size(1) + + # Get attention + # --> (batch, d, queryL) + queryT = torch.transpose(query, 1, 2) + + # (batch, sourceL, d)(batch, d, queryL) + # --> (batch, sourceL, queryL) + attn = torch.bmm(context, queryT) + if raw_feature_norm == "softmax": + # --> (batch*sourceL, queryL) + attn = attn.view(batch_size * sourceL, queryL) + attn = nn.Softmax()(attn) + # --> (batch, sourceL, queryL) + attn = attn.view(batch_size, sourceL, queryL) + elif raw_feature_norm == "l2norm": + attn = l2norm(attn, 2) + elif raw_feature_norm == "clipped_l2norm": + attn = nn.LeakyReLU(0.1)(attn) + attn = l2norm(attn, 2) + else: + raise ValueError("unknown first norm type:", raw_feature_norm) + # --> (batch, queryL, sourceL) + attn = torch.transpose(attn, 1, 2).contiguous() + # --> (batch*queryL, sourceL) + attn = attn.view(batch_size * queryL, sourceL) + attn = nn.Softmax()(attn * smooth) + # --> (batch, queryL, sourceL) + attn = attn.view(batch_size, queryL, sourceL) + # --> (batch, sourceL, queryL) + attnT = torch.transpose(attn, 1, 2).contiguous() + + # --> (batch, d, sourceL) + contextT = torch.transpose(context, 1, 2) + # (batch x d x sourceL)(batch x sourceL x queryL) + # --> (batch, d, queryL) + weightedContext = torch.bmm(contextT, attnT) + # --> (batch, queryL, d) + weightedContext = torch.transpose(weightedContext, 1, 2) + + return weightedContext, attnT + + +class BiMultiHeadAttention(nn.Module): + def __init__(self, v_dim, l_dim, embed_dim, num_heads, dropout=0.1, cfg=None): + super(BiMultiHeadAttention, self).__init__() + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.v_dim = v_dim + self.l_dim = l_dim + + assert ( + self.head_dim * self.num_heads == self.embed_dim + ), f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." 
+ self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.l_proj = nn.Linear(self.l_dim, self.embed_dim) + self.values_v_proj = nn.Linear(self.v_dim, self.embed_dim) + self.values_l_proj = nn.Linear(self.l_dim, self.embed_dim) + + self.out_v_proj = nn.Linear(self.embed_dim, self.v_dim) + self.out_l_proj = nn.Linear(self.embed_dim, self.l_dim) + + self.stable_softmax_2d = True + self.clamp_min_for_underflow = True + self.clamp_max_for_overflow = True + + self._reset_parameters() + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def _reset_parameters(self): + nn.init.xavier_uniform_(self.v_proj.weight) + self.v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.l_proj.weight) + self.l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_v_proj.weight) + self.values_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.values_l_proj.weight) + self.values_l_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_v_proj.weight) + self.out_v_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(self.out_l_proj.weight) + self.out_l_proj.bias.data.fill_(0) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + """_summary_ + + Args: + v (_type_): bs, n_img, dim + l (_type_): bs, n_text, dim + attention_mask_v (_type_, optional): _description_. bs, n_img + attention_mask_l (_type_, optional): _description_. bs, n_text + + Returns: + _type_: _description_ + """ + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + bsz, tgt_len, _ = v.size() + + query_states = self.v_proj(v) * self.scale + key_states = self._shape(self.l_proj(l), -1, bsz) + value_v_states = self._shape(self.values_v_proj(v), -1, bsz) + value_l_states = self._shape(self.values_l_proj(l), -1, bsz) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_v_states = value_v_states.view(*proj_shape) + value_l_states = value_l_states.view(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + if self.stable_softmax_2d: + attn_weights = attn_weights - attn_weights.max() + + if self.clamp_min_for_underflow: + attn_weights = torch.clamp( + attn_weights, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights = torch.clamp( + attn_weights, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + attn_weights_T = attn_weights.transpose(1, 2) + attn_weights_l = attn_weights_T - torch.max(attn_weights_T, dim=-1, keepdim=True)[0] + if self.clamp_min_for_underflow: + attn_weights_l = torch.clamp( + attn_weights_l, min=-50000 + ) # Do not increase -50000, data type half has quite limited range + if self.clamp_max_for_overflow: + attn_weights_l = torch.clamp( + attn_weights_l, max=50000 + ) # Do not increase 50000, data type half has quite limited range + + # mask vison for language + if attention_mask_v is not None: + attention_mask_v = ( + attention_mask_v[:, None, None, 
:].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights_l.masked_fill_(attention_mask_v, float("-inf")) + + attn_weights_l = attn_weights_l.softmax(dim=-1) + + # mask language for vision + if attention_mask_l is not None: + attention_mask_l = ( + attention_mask_l[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + attn_weights.masked_fill_(attention_mask_l, float("-inf")) + attn_weights_v = attn_weights.softmax(dim=-1) + + attn_probs_v = F.dropout(attn_weights_v, p=self.dropout, training=self.training) + attn_probs_l = F.dropout(attn_weights_l, p=self.dropout, training=self.training) + + attn_output_v = torch.bmm(attn_probs_v, value_l_states) + attn_output_l = torch.bmm(attn_probs_l, value_v_states) + + if attn_output_v.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output_v` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is {attn_output_v.size()}" + ) + + if attn_output_l.size() != (bsz * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`attn_output_l` should be of size {(bsz, self.num_heads, src_len, self.head_dim)}, but is {attn_output_l.size()}" + ) + + attn_output_v = attn_output_v.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output_v = attn_output_v.transpose(1, 2) + attn_output_v = attn_output_v.reshape(bsz, tgt_len, self.embed_dim) + + attn_output_l = attn_output_l.view(bsz, self.num_heads, src_len, self.head_dim) + attn_output_l = attn_output_l.transpose(1, 2) + attn_output_l = attn_output_l.reshape(bsz, src_len, self.embed_dim) + + attn_output_v = self.out_v_proj(attn_output_v) + attn_output_l = self.out_l_proj(attn_output_l) + + return attn_output_v, attn_output_l + + +# Bi-Direction MHA (text->image, image->text) +class BiAttentionBlock(nn.Module): + def __init__( + self, + v_dim, + l_dim, + embed_dim, + num_heads, + dropout=0.1, + drop_path=0.0, + init_values=1e-4, + cfg=None, + ): + """ + Inputs: + embed_dim - Dimensionality of input and attention feature vectors + hidden_dim - Dimensionality of hidden layer in feed-forward network + (usually 2-4x larger than embed_dim) + num_heads - Number of heads to use in the Multi-Head Attention block + dropout - Amount of dropout to apply in the feed-forward network + """ + super(BiAttentionBlock, self).__init__() + + # pre layer norm + self.layer_norm_v = nn.LayerNorm(v_dim) + self.layer_norm_l = nn.LayerNorm(l_dim) + self.attn = BiMultiHeadAttention( + v_dim=v_dim, l_dim=l_dim, embed_dim=embed_dim, num_heads=num_heads, dropout=dropout + ) + + # add layer scale for training stability + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.temp_v = nn.Parameter(init_values * torch.ones((v_dim)), requires_grad=True) + self.temp_l = nn.Parameter(init_values * torch.ones((l_dim)), requires_grad=True) + + def forward(self, v, l, attention_mask_v=None, attention_mask_l=None): + v = self.layer_norm_v(v) + l = self.layer_norm_l(l) + delta_v, delta_l = self.attn( + v, l, attention_mask_v=attention_mask_v, attention_mask_l=attention_mask_l + ) + # v, l = v + delta_v, l + delta_l + v = v + self.drop_path(self.temp_v * delta_v) + l = l + self.drop_path(self.temp_l * delta_l) + return v, l + + # def forward(self, v:List[torch.Tensor], l, attention_mask_v=None, attention_mask_l=None) diff --git a/fengshen/models/Lyrics/groundingdino/modeling_groundingdino.py b/fengshen/models/Lyrics/groundingdino/modeling_groundingdino.py new file mode 100644 index 0000000..f9af220 --- /dev/null +++ 
b/fengshen/models/Lyrics/groundingdino/modeling_groundingdino.py @@ -0,0 +1,338 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR model and criterion classes. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ +# Modified from Deformable DETR (https://github.com/fundamentalvision/Deformable-DETR) +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# ------------------------------------------------------------------------ +import copy +from typing import List + +import torch +import torch.nn.functional as F +from torch import nn +from transformers import AutoTokenizer, BertModel, BertConfig +from fengshen.models.groundedblip.backbone import Joiner, SwinTransformer +from fengshen.models.groundedblip.groundingdino.bertwarper import ( + BertModelWarper, + generate_masks_with_special_tokens_and_transfer_map, +) +from fengshen.models.groundedblip.groundingdino.transformer import Transformer +from fengshen.models.groundedblip.groundingdino.utils import ( + MLP, + ContrastiveEmbed, + NestedTensor, + inverse_sigmoid, + nested_tensor_from_tensor_list, +) + + + +class GroundingDINO(nn.Module): + """This is the Cross-Attention Detector module that performs object detection""" + + def __init__( + self, + args, + # max_text_len=256, + ): + """Initializes the model. + Parameters: + backbone: torch module of the backbone to be used. See backbone.py + transformer: torch module of the transformer architecture. See transformer.py + num_queries: number of object queries, ie detection slot. This is the maximal number of objects + Conditional DETR can detect in a single image. For COCO, we recommend 100 queries. + aux_loss: True if auxiliary decoding losses (loss at each decoder layer) are to be used. 
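+            Note: in this port the options above are not passed as separate constructor
+            arguments; everything is read from (or built from) a single `args` namespace,
+            e.g. `args.num_queries`, `args.num_feature_levels`, `args.aux_loss`.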
+ """ + super().__init__() + self.num_queries = args.num_queries + self.transformer = Transformer(args) + self.hidden_dim = hidden_dim = self.transformer.d_model + self.num_feature_levels = args.num_feature_levels + self.nheads = args.nheads + self.max_text_len = 256 + self.sub_sentence_present = args.sub_sentence_present + + # setting query dim + self.query_dim = args.query_dim + assert args.query_dim == 4 + + # for dn training + self.num_patterns = args.num_patterns + self.dn_number = args.dn_number + self.dn_box_noise_scale = args.dn_box_noise_scale + self.dn_label_noise_ratio = args.dn_label_noise_ratio + self.dn_labelbook_size = args.dn_labelbook_size + + # bert + self.bert_config = BertConfig.from_pretrained(args.text_encoder_type) + self.tokenizer = AutoTokenizer.from_pretrained(args.text_encoder_type) + self.bert = BertModel(self.bert_config) + self.bert.pooler.dense.weight.requires_grad_(False) + self.bert.pooler.dense.bias.requires_grad_(False) + self.bert = BertModelWarper(bert_model=self.bert) + + self.feat_map = nn.Linear(self.bert.config.hidden_size, self.hidden_dim, bias=True) + nn.init.constant_(self.feat_map.bias.data, 0) + nn.init.xavier_uniform_(self.feat_map.weight.data) + # freeze + + # special tokens + self.specical_tokens = self.tokenizer.convert_tokens_to_ids(["[CLS]", "[SEP]", ".", "?"]) + + self.backbone = Joiner(args) + + # prepare input projection layers + if self.num_feature_levels > 1: + num_backbone_outs = len(self.backbone.num_channels) + input_proj_list = [] + for _ in range(num_backbone_outs): + in_channels = self.backbone.num_channels[_] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + for _ in range(self.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, hidden_dim), + ) + ) + in_channels = hidden_dim + self.input_proj = nn.ModuleList(input_proj_list) + else: + assert args.two_stage_type == "no", "two_stage_type should be no if num_feature_levels=1 !!!" + self.input_proj = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(self.backbone.num_channels[-1], hidden_dim, kernel_size=1), + nn.GroupNorm(32, hidden_dim), + ) + ] + ) + + self.aux_loss = args.aux_loss + self.box_pred_damping = box_pred_damping = None + + self.iter_update = args.iter_update + assert args.iter_update, "Why not iter_update?" 
+ + # prepare pred layers + self.dec_pred_bbox_embed_share = args.dec_pred_bbox_embed_share + # prepare class & box embed + _class_embed = ContrastiveEmbed() + + _bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) + nn.init.constant_(_bbox_embed.layers[-1].weight.data, 0) + nn.init.constant_(_bbox_embed.layers[-1].bias.data, 0) + + if args.dec_pred_bbox_embed_share: + box_embed_layerlist = [_bbox_embed for i in range(self.transformer.num_decoder_layers)] + else: + box_embed_layerlist = [ + copy.deepcopy(_bbox_embed) for i in range(self.transformer.num_decoder_layers) + ] + class_embed_layerlist = [_class_embed for i in range(self.transformer.num_decoder_layers)] + self.bbox_embed = nn.ModuleList(box_embed_layerlist) + self.class_embed = nn.ModuleList(class_embed_layerlist) + self.transformer.decoder.bbox_embed = self.bbox_embed + self.transformer.decoder.class_embed = self.class_embed + + # two stage + self.two_stage_type = args.two_stage_type + assert args.two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + args.two_stage_type + ) + if args.two_stage_type != "no": + if args.two_stage_bbox_embed_share: + assert args.dec_pred_bbox_embed_share + self.transformer.enc_out_bbox_embed = _bbox_embed + else: + self.transformer.enc_out_bbox_embed = copy.deepcopy(_bbox_embed) + + if args.two_stage_class_embed_share: + assert args.dec_pred_bbox_embed_share + self.transformer.enc_out_class_embed = _class_embed + else: + self.transformer.enc_out_class_embed = copy.deepcopy(_class_embed) + + self.refpoint_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + # init input_proj + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + def set_image_tensor(self, samples: NestedTensor): + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples) + self.features, self.poss = self.backbone(samples) + + def unset_image_tensor(self): + if hasattr(self, 'features'): + del self.features + if hasattr(self,'poss'): + del self.poss + + def set_image_features(self, features , poss): + self.features = features + self.poss = poss + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, self.query_dim) + + def forward(self, samples: NestedTensor, targets: List = None, **kw): + """The forward expects a NestedTensor, which consists of: + - samples.tensor: batched images, of shape [batch_size x 3 x H x W] + - samples.mask: a binary mask of shape [batch_size x H x W], containing 1 on padded pixels + + It returns a dict with the following elements: + - "pred_logits": the classification logits (including no-object) for all queries. + Shape= [batch_size x num_queries x num_classes] + - "pred_boxes": The normalized boxes coordinates for all queries, represented as + (center_x, center_y, width, height). These values are normalized in [0, 1], + relative to the size of each individual image (disregarding possible padding). + See PostProcess for information on how to retrieve the unnormalized bounding box. + - "aux_outputs": Optional, only returned when auxilary losses are activated. It is a list of + dictionnaries containing the two above keys for each decoder layer. 
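+
+            Illustrative call (shapes are assumptions; as written the module moves the
+            tokenized text to "cuda", so a GPU is expected):
+
+                out = model(images, captions=["a cat . a dog ."] * batch_size)
+                out["pred_logits"]    # (batch_size, num_queries, max_text_len)
+                out["pred_boxes"]     # (batch_size, num_queries, 4), cxcywh in [0, 1]
+                out["hidden_state"]   # last decoder states concatenated with their reference boxes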
+ """ + if targets is None: + captions = kw["captions"] + else: + captions = [t["caption"] for t in targets] + + # encoder texts + tokenized = self.tokenizer(captions, padding="longest", return_tensors="pt").to( + "cuda" + ) + ( + text_self_attention_masks, + position_ids, + cate_to_token_mask_list, + ) = generate_masks_with_special_tokens_and_transfer_map( + tokenized, self.specical_tokens, self.tokenizer + ) + + if text_self_attention_masks.shape[1] > self.max_text_len: + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + position_ids = position_ids[:, : self.max_text_len] + tokenized["input_ids"] = tokenized["input_ids"][:, : self.max_text_len] + tokenized["attention_mask"] = tokenized["attention_mask"][:, : self.max_text_len] + tokenized["token_type_ids"] = tokenized["token_type_ids"][:, : self.max_text_len] + + # extract text embeddings + if self.sub_sentence_present: + tokenized_for_encoder = {k: v for k, v in tokenized.items() if k != "attention_mask"} + tokenized_for_encoder["attention_mask"] = text_self_attention_masks + tokenized_for_encoder["position_ids"] = position_ids + else: + # import ipdb; ipdb.set_trace() + tokenized_for_encoder = tokenized + + bert_output = self.bert(**tokenized_for_encoder) # bs, 195, 768 + + encoded_text = self.feat_map(bert_output["last_hidden_state"]) # bs, 195, d_model + text_token_mask = tokenized.attention_mask.bool() # bs, 195 + # text_token_mask: True for nomask, False for mask + # text_self_attention_masks: True for nomask, False for mask + + if encoded_text.shape[1] > self.max_text_len: + encoded_text = encoded_text[:, : self.max_text_len, :] + text_token_mask = text_token_mask[:, : self.max_text_len] + position_ids = position_ids[:, : self.max_text_len] + text_self_attention_masks = text_self_attention_masks[ + :, : self.max_text_len, : self.max_text_len + ] + + text_dict = { + "encoded_text": encoded_text, # bs, 195, d_model + "text_token_mask": text_token_mask, # bs, 195 + "position_ids": position_ids, # bs, 195 + "text_self_attention_masks": text_self_attention_masks, # bs, 195,195 + } + + # import ipdb; ipdb.set_trace() + if isinstance(samples, (list, torch.Tensor)): + samples = nested_tensor_from_tensor_list(samples).to("cuda") + features, poss = self.backbone(samples) + + srcs = [] + masks = [] + for l, feat in enumerate(features): + src, mask = feat.decompose() + srcs.append(self.input_proj[l](src)) + masks.append(mask) + assert mask is not None + if self.num_feature_levels > len(srcs): + _len_srcs = len(srcs) + for l in range(_len_srcs, self.num_feature_levels): + if l == _len_srcs: + src = self.input_proj[l](features[-1].tensors) + else: + src = self.input_proj[l](srcs[-1]) + m = samples.mask + mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] + pos_l = self.backbone.position_embedding(NestedTensor(src, mask)).to(src.dtype) + srcs.append(src) + masks.append(mask) + poss.append(pos_l) + + input_query_bbox = input_query_label = attn_mask = dn_meta = None + hs, reference, hs_enc, ref_enc, init_box_proposal = self.transformer( + srcs, masks, input_query_bbox, poss, input_query_label, attn_mask, text_dict + ) + + out = {'hidden_state':torch.cat([hs[-1],reference[-1]], dim=-1)} + + # deformable-detr-like anchor update + outputs_coord_list = [] + for dec_lid, (layer_ref_sig, layer_bbox_embed, layer_hs) in enumerate( + zip(reference[:-1], self.bbox_embed, hs) + ): + layer_delta_unsig = layer_bbox_embed(layer_hs) + layer_outputs_unsig = layer_delta_unsig + 
inverse_sigmoid(layer_ref_sig)
+            layer_outputs_unsig = layer_outputs_unsig.sigmoid()
+            outputs_coord_list.append(layer_outputs_unsig)
+        outputs_coord_list = torch.stack(outputs_coord_list)
+
+        # output
+        outputs_class = torch.stack(
+            [
+                layer_cls_embed(layer_hs, text_dict)
+                for layer_cls_embed, layer_hs in zip(self.class_embed, hs)
+            ]
+        )
+        # out = {"pred_logits": outputs_class[-1], "pred_boxes": outputs_coord_list[-1]}
+        out['pred_logits'] = outputs_class[-1]
+        out['pred_boxes'] = outputs_coord_list[-1]
+
+        # out = torch.cat([hs[-1],reference[-1]], dim=-1)
+
+        return out
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [
+            {"pred_logits": a, "pred_boxes": b}
+            for a, b in zip(outputs_class[:-1], outputs_coord[:-1])
+        ]
diff --git a/fengshen/models/Lyrics/groundingdino/ms_deform_attn.py b/fengshen/models/Lyrics/groundingdino/ms_deform_attn.py
new file mode 100644
index 0000000..e87aa83
--- /dev/null
+++ b/fengshen/models/Lyrics/groundingdino/ms_deform_attn.py
@@ -0,0 +1,421 @@
+# ------------------------------------------------------------------------
+# Grounding DINO
+# url: https://github.com/IDEA-Research/GroundingDINO
+# Copyright (c) 2023 IDEA. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------
+# Deformable DETR
+# Copyright (c) 2020 SenseTime. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+# ------------------------------------------------------------------------------------------------
+# Modified from:
+# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/functions/ms_deform_attn_func.py
+# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/modules/ms_deform_attn.py
+# https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/multi_scale_deform_attn.py
+# ------------------------------------------------------------------------------------------------
+
+import math
+import warnings
+from typing import Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.init import constant_, xavier_uniform_
+
+try:
+    from lyrica import _C
+except Exception:  # the compiled extension is optional; fall back to the pure-PyTorch path
+    warnings.warn("Failed to load custom C++ ops. 
Running on CPU mode Only!") + + +# helpers +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + +class MultiScaleDeformableAttnFunction(Function): + @staticmethod + def forward( + ctx, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + ctx.im2col_step = im2col_step + output = _C.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ctx.im2col_step, + ) + ctx.save_for_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = ctx.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = _C.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + ctx.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch( + value: torch.Tensor, + value_spatial_shapes: torch.Tensor, + sampling_locations: torch.Tensor, + attention_weights: torch.Tensor, +) -> torch.Tensor: + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = ( + value_list[level].flatten(2).transpose(1, 2).reshape(bs * num_heads, embed_dims, H_, W_) + ) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False + ) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points + ) + output = ( + (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights) + .sum(-1) + .view(bs, num_heads * embed_dims, num_queries) + ) + return output.transpose(1, 2).contiguous() + + +class MultiScaleDeformableAttention(nn.Module): + """Multi-Scale Deformable Attention Module used in Deformable-DETR + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dim (int): The embedding dimension of Attention. Default: 256. + num_heads (int): The number of attention heads. Default: 8. + num_levels (int): The number of feature map used in Attention. Default: 4. 
+ num_points (int): The number of sampling points for each query + in each head. Default: 4. + img2col_steps (int): The step used in image_to_column. Defualt: 64. + dropout (float): Dropout layer used in output. Default: 0.1. + batch_first (bool): if ``True``, then the input and output tensor will be + provided as `(bs, n, embed_dim)`. Default: False. `(n, bs, embed_dim)` + """ + + def __init__( + self, + embed_dim: int = 256, + num_heads: int = 8, + num_levels: int = 4, + num_points: int = 4, + img2col_step: int = 64, + batch_first: bool = False, + ): + super().__init__() + if embed_dim % num_heads != 0: + raise ValueError( + "embed_dim must be divisible by num_heads, but got {} and {}".format( + embed_dim, num_heads + ) + ) + head_dim = embed_dim // num_heads + + self.batch_first = batch_first + + if not _is_power_of_2(head_dim): + warnings.warn( + """ + You'd better set d_model in MSDeformAttn to make sure that + each dim of the attention head a power of 2, which is more efficient. + """ + ) + + self.im2col_step = img2col_step + self.embed_dim = embed_dim + self.num_heads = num_heads + self.num_levels = num_levels + self.num_points = num_points + self.sampling_offsets = nn.Linear(embed_dim, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dim, num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dim, embed_dim) + self.output_proj = nn.Linear(embed_dim, embed_dim) + + self.init_weights() + + def _reset_parameters(self): + return self.init_weights() + + def init_weights(self): + """ + Default initialization for Parameters of Module. + """ + constant_(self.sampling_offsets.weight.data, 0.0) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * ( + 2.0 * math.pi / self.num_heads + ) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.num_heads, 1, 1, 2) + .repeat(1, self.num_levels, self.num_points, 1) + ) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.0) + constant_(self.attention_weights.bias.data, 0.0) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.0) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.0) + + def freeze_sampling_offsets(self): + print("Freeze sampling offsets") + self.sampling_offsets.weight.requires_grad = False + self.sampling_offsets.bias.requires_grad = False + + def freeze_attention_weights(self): + print("Freeze attention weights") + self.attention_weights.weight.requires_grad = False + self.attention_weights.bias.requires_grad = False + + def forward( + self, + query: torch.Tensor, + key: Optional[torch.Tensor] = None, + value: Optional[torch.Tensor] = None, + query_pos: Optional[torch.Tensor] = None, + key_padding_mask: Optional[torch.Tensor] = None, + reference_points: Optional[torch.Tensor] = None, + spatial_shapes: Optional[torch.Tensor] = None, + level_start_index: Optional[torch.Tensor] = None, + **kwargs + ) -> torch.Tensor: + + """Forward Function of MultiScaleDeformableAttention + + Args: + query (torch.Tensor): Query embeddings with shape + `(num_query, bs, embed_dim)` + key (torch.Tensor): Key embeddings with shape + `(num_key, bs, embed_dim)` + value (torch.Tensor): Value embeddings with shape + `(num_key, bs, embed_dim)` + query_pos (torch.Tensor): The position 
embedding for `query`. Default: None.
+            key_padding_mask (torch.Tensor): ByteTensor for `query`, with shape `(bs, num_key)`,
+                indicating which elements within `key` to be ignored in attention.
+            reference_points (torch.Tensor): The normalized reference points
+                with shape `(bs, num_query, num_levels, 2)`,
+                all elements are in range [0, 1], top-left (0, 0),
+                bottom-right (1, 1), including the padding area;
+                or `(N, Length_{query}, num_levels, 4)`, with two additional
+                dimensions `(h, w)` to form reference boxes.
+            spatial_shapes (torch.Tensor): Spatial shape of features in different levels.
+                With shape `(num_levels, 2)`, last dimension represents `(h, w)`.
+            level_start_index (torch.Tensor): The start index of each level. A tensor with
+                shape `(num_levels, )` which can be represented as
+                `[0, h_0 * w_0, h_0 * w_0 + h_1 * w_1, ...]`.
+
+        Returns:
+            torch.Tensor: forward results with shape `(num_query, bs, embed_dim)`
+        """
+
+        if value is None:
+            value = query
+
+        if query_pos is not None:
+            query = query + query_pos
+
+        if not self.batch_first:
+            # change to (bs, num_query, embed_dims)
+            query = query.permute(1, 0, 2)
+            value = value.permute(1, 0, 2)
+
+        bs, num_query, _ = query.shape
+        bs, num_value, _ = value.shape
+
+        assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value
+
+        value = self.value_proj(value)
+        if key_padding_mask is not None:
+            value = value.masked_fill(key_padding_mask[..., None], float(0))
+        value = value.view(bs, num_value, self.num_heads, -1)
+        sampling_offsets = self.sampling_offsets(query).view(
+            bs, num_query, self.num_heads, self.num_levels, self.num_points, 2
+        )
+        attention_weights = self.attention_weights(query).view(
+            bs, num_query, self.num_heads, self.num_levels * self.num_points
+        )
+        attention_weights = attention_weights.softmax(-1)
+        attention_weights = attention_weights.view(
+            bs,
+            num_query,
+            self.num_heads,
+            self.num_levels,
+            self.num_points,
+        )
+
+        # bs, num_query, num_heads, num_levels, num_points, 2
+        if reference_points.shape[-1] == 2:
+            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif reference_points.shape[-1] == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets
+                / self.num_points
+                * reference_points[:, :, None, :, None, 2:]
+                * 0.5
+            )
+        else:
+            raise ValueError(
+                "Last dim of reference_points must be 2 or 4, but get {} instead.".format(
+                    reference_points.shape[-1]
+                )
+            )
+
+        if torch.cuda.is_available() and value.is_cuda:
+            halffloat = False
+            bhalffloat = False
+            if value.dtype == torch.float16:
+                halffloat = True
+                value = value.float()
+                sampling_locations = sampling_locations.float()
+                attention_weights = attention_weights.float()
+            elif value.dtype == torch.bfloat16:
+                bhalffloat = True
+                value = value.float()
+                sampling_locations = sampling_locations.float()
+                attention_weights = attention_weights.float()
+
+            output = MultiScaleDeformableAttnFunction.apply(
+                value,
+                spatial_shapes,
+                level_start_index,
+                sampling_locations,
+                attention_weights,
+                self.im2col_step,
+            )
+
+            if halffloat:
+                output = output.half()
+            elif bhalffloat:
+                # cast back with .to(); torch.tensor(output, ...) would copy and detach from the graph
+                output = output.to(torch.bfloat16)
+        else:
+            output = multi_scale_deformable_attn_pytorch(
+                value, spatial_shapes, sampling_locations, attention_weights
+            )
+
+        output = self.output_proj(output)
+
+        if not self.batch_first:
+            output = 
output.permute(1, 0, 2) + + return output + + +def create_dummy_class(klass, dependency, message=""): + """ + When a dependency of a class is not available, create a dummy class which throws ImportError + when used. + + Args: + klass (str): name of the class. + dependency (str): name of the dependency. + message: extra message to print + Returns: + class: a class object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, klass) + if message: + err = err + " " + message + + class _DummyMetaClass(type): + # throw error on class attribute access + def __getattr__(_, __): # noqa: B902 + raise ImportError(err) + + class _Dummy(object, metaclass=_DummyMetaClass): + # throw error on constructor + def __init__(self, *args, **kwargs): + raise ImportError(err) + + return _Dummy + + +def create_dummy_func(func, dependency, message=""): + """ + When a dependency of a function is not available, create a dummy function which throws + ImportError when used. + + Args: + func (str): name of the function. + dependency (str or list[str]): name(s) of the dependency. + message: extra message to print + Returns: + function: a function object + """ + err = "Cannot import '{}', therefore '{}' is not available.".format(dependency, func) + if message: + err = err + " " + message + + if isinstance(dependency, (list, tuple)): + dependency = ",".join(dependency) + + def _dummy(*args, **kwargs): + raise ImportError(err) + + return _dummy diff --git a/fengshen/models/Lyrics/groundingdino/setup.py b/fengshen/models/Lyrics/groundingdino/setup.py new file mode 100644 index 0000000..12d2328 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/setup.py @@ -0,0 +1,121 @@ +# coding=utf-8 +# Copyright 2022 The IDEA Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
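+#
+# Build note (assumed usage; not stated in this patch): the extension declared below is
+# exposed as `lyrica._C`, which ms_deform_attn.py imports. It is typically compiled ahead
+# of training with `python setup.py build install` (or an editable `pip install -e .`).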
+# ------------------------------------------------------------------------------------------------ +# Modified from +# https://github.com/fundamentalvision/Deformable-DETR/blob/main/models/ops/setup.py +# https://github.com/facebookresearch/detectron2/blob/main/setup.py +# https://github.com/open-mmlab/mmdetection/blob/master/setup.py +# https://github.com/Oneflow-Inc/libai/blob/main/setup.py +# ------------------------------------------------------------------------------------------------ + +import glob +import os +import subprocess + +import torch +from setuptools import find_packages, setup +from torch.utils.cpp_extension import CUDA_HOME, CppExtension, CUDAExtension + +# groundingdino version info +version = "0.1.0" +package_name = "groundingdino" +cwd = os.path.dirname(os.path.abspath(__file__)) + +requirements = ["torch", "torchvision"] + +torch_ver = [int(x) for x in torch.__version__.split(".")[:2]] + + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "csrc") + + main_source = os.path.join(extensions_dir, "vision.cpp") + sources = glob.glob(os.path.join(extensions_dir, "**", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "**", "*.cu")) + glob.glob( + os.path.join(extensions_dir, "*.cu") + ) + + sources = [main_source] + sources + + # We need these variables to build with CUDA when we create the Docker image + # It solves https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/53 + # and https://github.com/IDEA-Research/Grounded-Segment-Anything/issues/84 when running + # inside a Docker container. + am_i_docker = os.environ.get('AM_I_DOCKER', '').casefold() in ['true', '1', 't'] + use_cuda = os.environ.get('BUILD_WITH_CUDA', '').casefold() in ['true', '1', 't'] + + extension = CppExtension + + extra_compile_args = {"cxx": []} + define_macros = [] + + if (torch.cuda.is_available() and CUDA_HOME is not None) or \ + (am_i_docker and use_cuda): + print("Compiling with CUDA") + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + print("Compiling without CUDA") + define_macros += [("WITH_HIP", None)] + extra_compile_args["nvcc"] = [] + return None + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + + ext_modules = [ + extension( + "lyrica._C", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + + return ext_modules + + +if __name__ == "__main__": + print(f"Building wheel {package_name}-{version}") + + # with open("LICENSE", "r", encoding="utf-8") as f: + # license = f.read() + + + setup( + name="deformable detr cuda operator", + version="0.1.0", + # author="International Digital Economy Academy, Shilong Liu", + # url="https://github.com/IDEA-Research/GroundingDINO", + description="deformable detr cuda operator", + # license=license, + packages=find_packages( + exclude=( + "configs", + "tests", + ) + ), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, + ) diff --git a/fengshen/models/Lyrics/groundingdino/transformer.py b/fengshen/models/Lyrics/groundingdino/transformer.py new file mode 100644 index 0000000..ddbb4d8 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/transformer.py @@ -0,0 +1,908 @@ +# 
------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# DINO +# Copyright (c) 2022 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Conditional DETR Transformer class. +# Copyright (c) 2021 Microsoft. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Modified from DETR (https://github.com/facebookresearch/detr) +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. +# ------------------------------------------------------------------------ + +from typing import Optional + +import torch +import torch.utils.checkpoint as checkpoint +from torch import Tensor, nn + +from fengshen.models.groundedblip.groundingdino.utils import inverse_sigmoid + +from fengshen.models.groundedblip.groundingdino.fuse_modules import BiAttentionBlock +from fengshen.models.groundedblip.groundingdino.ms_deform_attn import MultiScaleDeformableAttention as MSDeformAttn +from fengshen.models.groundedblip.groundingdino.transformer_vanilla import TransformerEncoderLayer +from fengshen.models.groundedblip.groundingdino.utils import ( + MLP, + _get_activation_fn, + _get_clones, + gen_encoder_output_proposals, + gen_sineembed_for_position, + get_sine_pos_embed, +) + + +class Transformer(nn.Module): + def __init__( + self, + args, + ): + super().__init__() + self.num_feature_levels = args.num_feature_levels + self.num_encoder_layers = args.enc_layers + self.num_unicoder_layers = args.num_unicoder_layers + self.num_decoder_layers = args.dec_layers + self.num_queries = args.num_queries + assert args.query_dim == 4 + + # choose encoder layer type + encoder_layer = DeformableTransformerEncoderLayer( + args.hidden_dim, args.dim_feedforward, args.dropout, args.transformer_activation, args.num_feature_levels, args.nheads, args.enc_n_points + ) + + text_enhance_layer = TransformerEncoderLayer( + d_model=args.hidden_dim, + nhead=args.nheads // 2, + dim_feedforward=args.dim_feedforward // 2, + dropout=args.text_dropout, + ) + + feature_fusion_layer = BiAttentionBlock( + v_dim=args.hidden_dim, + l_dim=args.hidden_dim, + embed_dim=args.dim_feedforward // 2, + num_heads=args.nheads // 2, + dropout=args.fusion_dropout, + drop_path=args.fusion_droppath, + ) + + encoder_norm = nn.LayerNorm(args.hidden_dim) if args.pre_norm else None + assert encoder_norm is None + self.encoder = TransformerEncoder( + encoder_layer, + args.enc_layers, + d_model=args.hidden_dim, + num_queries=args.num_queries, + text_enhance_layer=text_enhance_layer, + feature_fusion_layer=feature_fusion_layer, + use_checkpoint=args.use_checkpoint, + use_transformer_ckpt=args.use_transformer_ckpt, + ) + + # choose decoder layer type + decoder_layer = DeformableTransformerDecoderLayer( + args.hidden_dim, + args.dim_feedforward, + args.dropout, + args.transformer_activation, + args.num_feature_levels, + args.nheads, + args.dec_n_points, + use_text_cross_attention=args.use_text_cross_attention, + ) + + decoder_norm = nn.LayerNorm(args.hidden_dim) + self.decoder = TransformerDecoder( + decoder_layer, + args.dec_layers, + 
decoder_norm, + return_intermediate=args.return_intermediate_dec, + d_model=args.hidden_dim, + query_dim=args.query_dim, + num_feature_levels=args.num_feature_levels, + ) + + self.d_model = args.hidden_dim + self.nhead = args.nheads + self.dec_layers = args.dec_layers + self.num_queries = args.num_queries # useful for single stage model only + self.num_patterns = args.num_patterns + if not isinstance(args.num_patterns, int): + Warning("num_patterns should be int but {}".format(type(args.num_patterns))) + self.num_patterns = 0 + + if args.num_feature_levels > 1: + if self.num_encoder_layers > 0: + self.level_embed = nn.Parameter(torch.Tensor(args.num_feature_levels, args.hidden_dim)) + else: + self.level_embed = None + + self.learnable_tgt_init = args.learnable_tgt_init + assert args.learnable_tgt_init, "why not learnable_tgt_init" + self.embed_init_tgt = args.embed_init_tgt + if (args.two_stage_type != "no" and args.embed_init_tgt) or (args.two_stage_type == "no"): + self.tgt_embed = nn.Embedding(self.num_queries, args.hidden_dim) + nn.init.normal_(self.tgt_embed.weight.data) + else: + self.tgt_embed = None + + # for two stage + self.two_stage_type = args.two_stage_type + assert args.two_stage_type in ["no", "standard"], "unknown param {} of two_stage_type".format( + args.two_stage_type + ) + if args.two_stage_type == "standard": + # anchor selection at the output of encoder + self.enc_output = nn.Linear(args.hidden_dim, args.hidden_dim) + self.enc_output_norm = nn.LayerNorm(args.hidden_dim) + self.two_stage_wh_embedding = None + + if args.two_stage_type == "no": + self.init_ref_points(args.num_queries) # init self.refpoint_embed + + self.enc_out_class_embed = None + self.enc_out_bbox_embed = None + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + if self.num_feature_levels > 1 and self.level_embed is not None: + nn.init.normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def init_ref_points(self, use_num_queries): + self.refpoint_embed = nn.Embedding(use_num_queries, 4) + + def forward(self, srcs, masks, refpoint_embed, pos_embeds, tgt, attn_mask=None, text_dict=None): + """ + Input: + - srcs: List of multi features [bs, ci, hi, wi] + - masks: List of multi masks [bs, hi, wi] + - refpoint_embed: [bs, num_dn, 4]. None in infer + - pos_embeds: List of multi pos embeds [bs, ci, hi, wi] + - tgt: [bs, num_dn, d_model]. 
None in infer + + """ + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + # import time + # start_time = time.time() + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + + src = src.flatten(2).transpose(1, 2) # bs, hw, c + mask = mask.flatten(1) # bs, hw + pos_embed = pos_embed.flatten(2).transpose(1, 2) # bs, hw, c + if self.num_feature_levels > 1 and self.level_embed is not None: + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + else: + lvl_pos_embed = pos_embed + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + + # end_time = time.time() + # print(end_time-start_time) + + src_flatten = torch.cat(src_flatten, 1) # bs, \sum{hxw}, c + mask_flatten = torch.cat(mask_flatten, 1) # bs, \sum{hxw} + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) # bs, \sum{hxw}, c + spatial_shapes = torch.as_tensor( + spatial_shapes, dtype=torch.long, device=src_flatten.device + ) + level_start_index = torch.cat( + (spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]) + ) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # two stage + enc_topk_proposals = enc_refpoint_embed = None + # end_time = time.time() + # print(end_time-start_time) + ######################################################### + # Begin Encoder + ######################################################### + memory, memory_text = self.encoder( + src_flatten, + pos=lvl_pos_embed_flatten, + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + key_padding_mask=mask_flatten, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . 
False means use the token; True means pad the token
+            position_ids=text_dict["position_ids"],
+            text_self_attention_masks=text_dict["text_self_attention_masks"],
+        )
+        # end_time = time.time()
+        # print(end_time-start_time)
+        #########################################################
+        # End Encoder
+        # - memory: bs, \sum{hw}, c
+        # - mask_flatten: bs, \sum{hw}
+        # - lvl_pos_embed_flatten: bs, \sum{hw}, c
+        # - enc_intermediate_output: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        # - enc_intermediate_refpoints: None or (nenc+1, bs, nq, c) or (nenc, bs, nq, c)
+        #########################################################
+        text_dict["encoded_text"] = memory_text
+        # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1':
+        #     if memory.isnan().any() | memory.isinf().any():
+        #         import ipdb; ipdb.set_trace()
+
+        if self.two_stage_type == "standard":
+            output_memory, output_proposals = gen_encoder_output_proposals(
+                memory, mask_flatten, spatial_shapes
+            )
+            output_memory = self.enc_output_norm(self.enc_output(output_memory))
+            # print(output_memory.size())
+
+            if text_dict is not None:
+                enc_outputs_class_unselected = self.enc_out_class_embed(output_memory, text_dict)
+            else:
+                enc_outputs_class_unselected = self.enc_out_class_embed(output_memory)
+
+            topk_logits = enc_outputs_class_unselected.max(-1)[0]
+            # print('topk_logits:', topk_logits.shape)
+            enc_outputs_coord_unselected = (
+                self.enc_out_bbox_embed(output_memory) + output_proposals
+            )  # (bs, \sum{hw}, 4) unsigmoid
+            topk = self.num_queries
+
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]  # bs, nq
+            # boxes are initialized from the mask (top-left corner points)
+            # print('topk_proposals:', topk_proposals.shape)
+            # gather boxes
+            refpoint_embed_undetach = torch.gather(
+                enc_outputs_coord_unselected, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            )  # unsigmoid
+            refpoint_embed_ = refpoint_embed_undetach.detach()
+            init_box_proposal = torch.gather(
+                output_proposals, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            ).sigmoid()  # sigmoid
+
+            # gather tgt
+            tgt_undetach = torch.gather(
+                output_memory, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model)
+            )
+            if self.embed_init_tgt:
+                tgt_ = (
+                    self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
+                )  # nq, bs, d_model
+            else:
+                tgt_ = tgt_undetach.detach()
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+        elif self.two_stage_type == "no":
+            tgt_ = (
+                self.tgt_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
+            )  # nq, bs, d_model
+            refpoint_embed_ = (
+                self.refpoint_embed.weight[:, None, :].repeat(1, bs, 1).transpose(0, 1)
+            )  # nq, bs, 4
+
+            if refpoint_embed is not None:
+                refpoint_embed = torch.cat([refpoint_embed, refpoint_embed_], dim=1)
+                tgt = torch.cat([tgt, tgt_], dim=1)
+            else:
+                refpoint_embed, tgt = refpoint_embed_, tgt_
+
+            if self.num_patterns > 0:
+                tgt_embed = tgt.repeat(1, self.num_patterns, 1)
+                refpoint_embed = refpoint_embed.repeat(1, self.num_patterns, 1)
+                tgt_pat = self.patterns.weight[None, :, :].repeat_interleave(
+                    self.num_queries, 1
+                )  # 1, n_q*n_pat, d_model
+                tgt = tgt_embed + tgt_pat
+
+            init_box_proposal = refpoint_embed_.sigmoid()
+
+        else:
+            raise NotImplementedError("unknown two_stage_type {}".format(self.two_stage_type))
+
+        #########################################################
+        # End preparing tgt
+        # - tgt: bs, NQ, d_model
+        # - refpoint_embed(unsigmoid): bs, NQ, d_model
+        
######################################################### + + ######################################################### + # Begin Decoder + ######################################################### + hs, references = self.decoder( + tgt=tgt.transpose(0, 1), + memory=memory.transpose(0, 1), + memory_key_padding_mask=mask_flatten, + pos=lvl_pos_embed_flatten.transpose(0, 1), + refpoints_unsigmoid=refpoint_embed.transpose(0, 1), + level_start_index=level_start_index, + spatial_shapes=spatial_shapes, + valid_ratios=valid_ratios, + tgt_mask=attn_mask, + memory_text=text_dict["encoded_text"], + text_attention_mask=~text_dict["text_token_mask"], + # we ~ the mask . False means use the token; True means pad the token + ) + + ######################################################### + # End Decoder + # hs: n_dec, bs, nq, d_model + # references: n_dec+1, bs, nq, query_dim + ######################################################### + + ######################################################### + # Begin postprocess + ######################################################### + if self.two_stage_type == "standard": + hs_enc = tgt_undetach.unsqueeze(0) + ref_enc = refpoint_embed_undetach.sigmoid().unsqueeze(0) + else: + hs_enc = ref_enc = None + ######################################################### + # End postprocess + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or (n_enc, bs, nq, d_model) or None + # ref_enc: (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or (n_enc, bs, nq, d_model) or None + ######################################################### + + return hs, references, hs_enc, ref_enc, init_box_proposal + # hs: (n_dec, bs, nq, d_model) + # references: sigmoid coordinates. (n_dec+1, bs, bq, 4) + # hs_enc: (n_enc+1, bs, nq, d_model) or (1, bs, nq, d_model) or None + # ref_enc: sigmoid coordinates. \ + # (n_enc+1, bs, nq, query_dim) or (1, bs, nq, query_dim) or None + + +class TransformerEncoder(nn.Module): + def __init__( + self, + encoder_layer, + num_layers, + d_model=256, + num_queries=300, + enc_layer_share=False, + text_enhance_layer=None, + feature_fusion_layer=None, + use_checkpoint=False, + use_transformer_ckpt=False, + ): + """_summary_ + + Args: + encoder_layer (_type_): _description_ + num_layers (_type_): _description_ + norm (_type_, optional): _description_. Defaults to None. + d_model (int, optional): _description_. Defaults to 256. + num_queries (int, optional): _description_. Defaults to 300. + enc_layer_share (bool, optional): _description_. Defaults to False. 
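+            text_enhance_layer (nn.Module, optional): self-attention layer applied to the text tokens at every encoder step (a TransformerEncoderLayer in this file). Defaults to None.
+            feature_fusion_layer (nn.Module, optional): image-text fusion layer (a BiAttentionBlock) run before each deformable image layer. Defaults to None.
+            use_checkpoint (bool, optional): gradient-checkpoint the fusion layers. Defaults to False.
+            use_transformer_ckpt (bool, optional): gradient-checkpoint the deformable image layers. Defaults to False.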
+ + """ + super().__init__() + # prepare layers + self.layers = [] + self.text_layers = [] + self.fusion_layers = [] + if num_layers > 0: + self.layers = _get_clones(encoder_layer, num_layers, layer_share=enc_layer_share) + + self.text_layers = _get_clones( + text_enhance_layer, num_layers, layer_share=enc_layer_share + ) + + self.fusion_layers = _get_clones( + feature_fusion_layer, num_layers, layer_share=enc_layer_share + ) + else: + self.layers = [] + del encoder_layer + + if text_enhance_layer is not None: + self.text_layers = [] + del text_enhance_layer + if feature_fusion_layer is not None: + self.fusion_layers = [] + del feature_fusion_layer + + self.query_scale = None + self.num_queries = num_queries + self.num_layers = num_layers + self.d_model = d_model + + self.use_checkpoint = use_checkpoint + self.use_transformer_ckpt = use_transformer_ckpt + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid( + torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device), + ) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + # for images + src: Tensor, + pos: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios: Tensor, + key_padding_mask: Tensor, + # for texts + memory_text: Tensor = None, + text_attention_mask: Tensor = None, + pos_text: Tensor = None, + text_self_attention_masks: Tensor = None, + position_ids: Tensor = None, + ): + """ + Input: + - src: [bs, sum(hi*wi), 256] + - pos: pos embed for src. [bs, sum(hi*wi), 256] + - spatial_shapes: h,w of each level [num_level, 2] + - level_start_index: [num_level] start point of level in sum(hi*wi). 
+ - valid_ratios: [bs, num_level, 2] + - key_padding_mask: [bs, sum(hi*wi)] + + - memory_text: bs, n_text, 256 + - text_attention_mask: bs, n_text + False for no padding; True for padding + - pos_text: bs, n_text, 256 + + - position_ids: bs, n_text + Intermedia: + - reference_points: [bs, sum(hi*wi), num_level, 2] + Outpus: + - output: [bs, sum(hi*wi), 256] + """ + + output = src + # import time + # start_time = time.time() + # preparation and reshape + if self.num_layers > 0: + reference_points = self.get_reference_points( + spatial_shapes, valid_ratios, device=src.device + ) + # end_time = time.time() + # print('encoder: ',end_time-start_time) + if self.text_layers: + # generate pos_text + bs, n_text, text_dim = memory_text.shape + if pos_text is None and position_ids is None: + pos_text = ( + torch.arange(n_text, device=memory_text.device) + .float() + .unsqueeze(0) + .unsqueeze(-1) + .repeat(bs, 1, 1) + ) + pos_text = get_sine_pos_embed(pos_text, num_pos_feats=256, exchange_xy=False) + if position_ids is not None: + pos_text = get_sine_pos_embed( + position_ids[..., None], num_pos_feats=256, exchange_xy=False + ) + # end_time = time.time() + # print('encoder1: ',end_time-start_time) + # main process + for layer_id, layer in enumerate(self.layers): + # if output.isnan().any() or memory_text.isnan().any(): + # if os.environ.get('IPDB_SHILONG_DEBUG', None) == 'INFO': + # import ipdb; ipdb.set_trace() + if self.fusion_layers: + if self.use_checkpoint: + output, memory_text = checkpoint.checkpoint( + self.fusion_layers[layer_id], + output, + memory_text, + key_padding_mask, + text_attention_mask, + ) + else: + output, memory_text = self.fusion_layers[layer_id]( + v=output, + l=memory_text, + attention_mask_v=key_padding_mask, + attention_mask_l=text_attention_mask, + ) + + if self.text_layers: + memory_text = self.text_layers[layer_id]( + src=memory_text.transpose(0, 1), + src_mask=~text_self_attention_masks, # note we use ~ for mask here + src_key_padding_mask=text_attention_mask, + pos=(pos_text.transpose(0, 1) if pos_text is not None else None), + ).transpose(0, 1) + + # main process + if self.use_transformer_ckpt: + output = checkpoint.checkpoint( + layer, + output, + pos, + reference_points, + spatial_shapes, + level_start_index, + key_padding_mask, + ) + else: + output = layer( + src=output, + pos=pos, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + # end_time = time.time() + # print('encoder_recussive: ',end_time-start_time) + return output, memory_text + + +class TransformerDecoder(nn.Module): + def __init__( + self, + decoder_layer, + num_layers, + norm=None, + return_intermediate=False, + d_model=256, + query_dim=4, + num_feature_levels=1, + ): + super().__init__() + if num_layers > 0: + self.layers = _get_clones(decoder_layer, num_layers) + else: + self.layers = [] + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + assert return_intermediate, "support return_intermediate only" + self.query_dim = query_dim + assert query_dim in [2, 4], "query_dim should be 2/4 but {}".format(query_dim) + self.num_feature_levels = num_feature_levels + + self.ref_point_head = MLP(query_dim // 2 * d_model, d_model, d_model, 2) + self.query_pos_sine_scale = None + + self.query_scale = None + self.bbox_embed = None + self.class_embed = None + + self.d_model = d_model + + self.ref_anchor_head = None + + def forward( + self, + tgt, + memory, + tgt_mask: 
Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + refpoints_unsigmoid: Optional[Tensor] = None, # num_queries, bs, 2 + # for memory + level_start_index: Optional[Tensor] = None, # num_levels + spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + valid_ratios: Optional[Tensor] = None, + # for text + memory_text: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + ): + """ + Input: + - tgt: nq, bs, d_model + - memory: hw, bs, d_model + - pos: hw, bs, d_model + - refpoints_unsigmoid: nq, bs, 2/4 + - valid_ratios/spatial_shapes: bs, nlevel, 2 + """ + output = tgt + + intermediate = [] + reference_points = refpoints_unsigmoid.sigmoid() + ref_points = [reference_points] + + for layer_id, layer in enumerate(self.layers): + + if reference_points.shape[-1] == 4: + reference_points_input = ( + reference_points[:, :, None] + * torch.cat([valid_ratios, valid_ratios], -1)[None, :] + ) # nq, bs, nlevel, 4 + else: + assert reference_points.shape[-1] == 2 + reference_points_input = reference_points[:, :, None] * valid_ratios[None, :] + query_sine_embed = gen_sineembed_for_position( + reference_points_input[:, :, 0, :] + ) # nq, bs, 256*2 + + # conditional query + raw_query_pos = self.ref_point_head(query_sine_embed) # nq, bs, 256 + pos_scale = self.query_scale(output) if self.query_scale is not None else 1 + query_pos = pos_scale * raw_query_pos + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # if query_pos.isnan().any() | query_pos.isinf().any(): + # import ipdb; ipdb.set_trace() + + # main process + output = layer( + tgt=output, + tgt_query_pos=query_pos, + tgt_query_sine_embed=query_sine_embed, + tgt_key_padding_mask=tgt_key_padding_mask, + tgt_reference_points=reference_points_input, + memory_text=memory_text, + text_attention_mask=text_attention_mask, + memory=memory, + memory_key_padding_mask=memory_key_padding_mask, + memory_level_start_index=level_start_index, + memory_spatial_shapes=spatial_shapes, + memory_pos=pos, + self_attn_mask=tgt_mask, + cross_attn_mask=memory_mask, + ) + if output.isnan().any() | output.isinf().any(): + print(f"output layer_id {layer_id} is nan") + try: + num_nan = output.isnan().sum().item() + num_inf = output.isinf().sum().item() + print(f"num_nan {num_nan}, num_inf {num_inf}") + except Exception as e: + print(e) + # if os.environ.get("SHILONG_AMP_INFNAN_DEBUG") == '1': + # import ipdb; ipdb.set_trace() + + # iter update + if self.bbox_embed is not None: + # box_holder = self.bbox_embed(output) + # box_holder[..., :self.query_dim] += inverse_sigmoid(reference_points) + # new_reference_points = box_holder[..., :self.query_dim].sigmoid() + + reference_before_sigmoid = inverse_sigmoid(reference_points) + delta_unsig = self.bbox_embed[layer_id](output) + outputs_unsig = delta_unsig + reference_before_sigmoid + new_reference_points = outputs_unsig.sigmoid() + + reference_points = new_reference_points.detach() + # if layer_id != self.num_layers - 1: + ref_points.append(new_reference_points) + + intermediate.append(self.norm(output)) + + return [ + [itm_out.transpose(0, 1) for itm_out in intermediate], + [itm_refpoint.transpose(0, 1) for itm_refpoint in ref_points], + ] + + +class DeformableTransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + ): + super().__init__() + + # 
self attention + self.self_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward( + self, src, pos, reference_points, spatial_shapes, level_start_index, key_padding_mask=None + ): + # self attention + # import ipdb; ipdb.set_trace() + src2 = self.self_attn( + query=self.with_pos_embed(src, pos), + reference_points=reference_points, + value=src, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=key_padding_mask, + ) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class DeformableTransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model=256, + d_ffn=1024, + dropout=0.1, + activation="relu", + n_levels=4, + n_heads=8, + n_points=4, + use_text_feat_guide=False, + use_text_cross_attention=False, + ): + super().__init__() + + # cross attention + self.cross_attn = MSDeformAttn( + embed_dim=d_model, + num_levels=n_levels, + num_heads=n_heads, + num_points=n_points, + batch_first=True, + ) + self.dropout1 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm1 = nn.LayerNorm(d_model) + + # cross attention text + if use_text_cross_attention: + self.ca_text = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.catext_dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.catext_norm = nn.LayerNorm(d_model) + + # self attention + self.self_attn = nn.MultiheadAttention(d_model, n_heads, dropout=dropout) + self.dropout2 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm2 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation, d_model=d_ffn, batch_dim=1) + self.dropout3 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout4 = nn.Dropout(dropout) if dropout > 0 else nn.Identity() + self.norm3 = nn.LayerNorm(d_model) + + self.key_aware_proj = None + self.use_text_feat_guide = use_text_feat_guide + assert not use_text_feat_guide + self.use_text_cross_attention = use_text_cross_attention + + def rm_self_attn_modules(self): + self.self_attn = None + self.dropout2 = None + self.norm2 = None + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, tgt): + with torch.cuda.amp.autocast(enabled=False): + tgt2 = self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout4(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward( + self, + # for tgt + tgt: Optional[Tensor], # nq, bs, d_model + tgt_query_pos: Optional[Tensor] = None, # pos for query. MLP(Sine(pos)) + tgt_query_sine_embed: Optional[Tensor] = None, # pos for query. 
Sine(pos) + tgt_key_padding_mask: Optional[Tensor] = None, + tgt_reference_points: Optional[Tensor] = None, # nq, bs, 4 + memory_text: Optional[Tensor] = None, # bs, num_token, d_model + text_attention_mask: Optional[Tensor] = None, # bs, num_token + # for memory + memory: Optional[Tensor] = None, # hw, bs, d_model + memory_key_padding_mask: Optional[Tensor] = None, + memory_level_start_index: Optional[Tensor] = None, # num_levels + memory_spatial_shapes: Optional[Tensor] = None, # bs, num_levels, 2 + memory_pos: Optional[Tensor] = None, # pos for memory + # sa + self_attn_mask: Optional[Tensor] = None, # mask used for self-attention + cross_attn_mask: Optional[Tensor] = None, # mask used for cross-attention + ): + """ + Input: + - tgt/tgt_query_pos: nq, bs, d_model + - + """ + assert cross_attn_mask is None + + # self attention + if self.self_attn is not None: + # import ipdb; ipdb.set_trace() + q = k = self.with_pos_embed(tgt, tgt_query_pos) + tgt2 = self.self_attn(q, k, tgt, attn_mask=self_attn_mask)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + + if self.use_text_cross_attention: + tgt2 = self.ca_text( + self.with_pos_embed(tgt, tgt_query_pos), + memory_text.transpose(0, 1), + memory_text.transpose(0, 1), + key_padding_mask=text_attention_mask, + )[0] + tgt = tgt + self.catext_dropout(tgt2) + tgt = self.catext_norm(tgt) + + tgt2 = self.cross_attn( + query=self.with_pos_embed(tgt, tgt_query_pos).transpose(0, 1), + reference_points=tgt_reference_points.transpose(0, 1).contiguous(), + value=memory.transpose(0, 1), + spatial_shapes=memory_spatial_shapes, + level_start_index=memory_level_start_index, + key_padding_mask=memory_key_padding_mask, + ).transpose(0, 1) + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + + # ffn + tgt = self.forward_ffn(tgt) + + return tgt diff --git a/fengshen/models/Lyrics/groundingdino/transformer_vanilla.py b/fengshen/models/Lyrics/groundingdino/transformer_vanilla.py new file mode 100644 index 0000000..4b33313 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/transformer_vanilla.py @@ -0,0 +1,126 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ +# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +DETR Transformer class. 
+
+Copy-paste from torch.nn.Transformer with modifications:
+    * positional encodings are passed in MHattention
+    * extra LN at the end of encoder is removed
+    * decoder returns a stack of activations from all decoding layers
+"""
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from fengshen.models.groundedblip.groundingdino.utils import (
+    MLP,
+    _get_activation_fn,
+    _get_clones,
+)
+
+
+class TextTransformer(nn.Module):
+    def __init__(self, num_layers, d_model=256, nheads=8, dim_feedforward=2048, dropout=0.1):
+        super().__init__()
+        self.num_layers = num_layers
+        self.d_model = d_model
+        self.nheads = nheads
+        self.dim_feedforward = dim_feedforward
+        self.norm = None
+
+        single_encoder_layer = TransformerEncoderLayer(
+            d_model=d_model, nhead=nheads, dim_feedforward=dim_feedforward, dropout=dropout
+        )
+        self.layers = _get_clones(single_encoder_layer, num_layers)
+
+    def forward(self, memory_text: torch.Tensor, text_attention_mask: torch.Tensor):
+        """
+
+        Args:
+            text_attention_mask: bs, num_token
+            memory_text: bs, num_token, d_model
+
+        Raises:
+            RuntimeError: _description_
+
+        Returns:
+            output: bs, num_token, d_model
+        """
+
+        output = memory_text.transpose(0, 1)
+
+        for layer in self.layers:
+            output = layer(output, src_key_padding_mask=text_attention_mask)
+
+        if self.norm is not None:
+            output = self.norm(output)
+
+        return output.transpose(0, 1)
+
+
+class TransformerEncoderLayer(nn.Module):
+    def __init__(
+        self,
+        d_model,
+        nhead,
+        dim_feedforward=2048,
+        dropout=0.1,
+        activation="relu",
+        normalize_before=False,
+    ):
+        super().__init__()
+        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+        # Implementation of Feedforward model
+        self.linear1 = nn.Linear(d_model, dim_feedforward)
+        self.dropout = nn.Dropout(dropout)
+        self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+        self.activation = _get_activation_fn(activation)
+        self.normalize_before = normalize_before
+        self.nhead = nhead
+
+    def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+        return tensor if pos is None else tensor + pos
+
+    def forward(
+        self,
+        src,
+        src_mask: Optional[Tensor] = None,
+        src_key_padding_mask: Optional[Tensor] = None,
+        pos: Optional[Tensor] = None,
+    ):
+        # repeat attn mask (guard against src_mask=None, e.g. when called from TextTransformer)
+        if src_mask is not None and src_mask.dim() == 3 and src_mask.shape[0] == src.shape[1]:
+            # bs, num_q, num_k
+            src_mask = src_mask.repeat(self.nhead, 1, 1)
+
+        if src.dtype == torch.half:
+            q = k = self.with_pos_embed(src, pos).half()
+            # q = k = self.with_pos_embed(src, pos)
+        elif src.dtype == torch.bfloat16:
+            # .to() keeps the autograd graph; torch.tensor() would copy and detach
+            q = k = self.with_pos_embed(src, pos).to(torch.bfloat16)
+        else:
+            q = k = self.with_pos_embed(src, pos)
+
+        src2 = self.self_attn(q, k, value=src, attn_mask=src_mask)[0]
+
+        # src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask)[0]
+        src = src + self.dropout1(src2)
+        src = self.norm1(src)
+        src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+        src = src + self.dropout2(src2)
+        src = self.norm2(src)
+        return src
diff --git a/fengshen/models/Lyrics/groundingdino/transforms.py b/fengshen/models/Lyrics/groundingdino/transforms.py
new file mode 100644
index 0000000..6b72b32
--- /dev/null
+++ b/fengshen/models/Lyrics/groundingdino/transforms.py
@@ -0,0 +1,311 @@
+# Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved +""" +Transforms and data augmentation for both image + bbox. +""" +import os +import random + +import PIL +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F + +from fengshen.models.groundedblip.groundingdino.utils import box_xyxy_to_cxcywh +from fengshen.models.groundedblip.groundingdino.utils import interpolate + + +def crop(image, target, region): + cropped_image = F.crop(image, *region) + + target = target.copy() + i, j, h, w = region + + # should we do something wrt the original size? + target["size"] = torch.tensor([h, w]) + + fields = ["labels", "area", "iscrowd", "positive_map"] + + if "boxes" in target: + boxes = target["boxes"] + max_size = torch.as_tensor([w, h], dtype=torch.float32) + cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) + cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) + cropped_boxes = cropped_boxes.clamp(min=0) + area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) + target["boxes"] = cropped_boxes.reshape(-1, 4) + target["area"] = area + fields.append("boxes") + + if "masks" in target: + # FIXME should we update the area here if there are no boxes? + target["masks"] = target["masks"][:, i : i + h, j : j + w] + fields.append("masks") + + # remove elements for which the boxes or masks that have zero area + if "boxes" in target or "masks" in target: + # favor boxes selection when defining which elements to keep + # this is compatible with previous implementation + if "boxes" in target: + cropped_boxes = target["boxes"].reshape(-1, 2, 2) + keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) + else: + keep = target["masks"].flatten(1).any(1) + + for field in fields: + if field in target: + target[field] = target[field][keep] + + if os.environ.get("IPDB_SHILONG_DEBUG", None) == "INFO": + # for debug and visualization only. 
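+        # keep only the caption spans whose boxes/masks survived the crop (same keep mask as the fields above)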
+ if "strings_positive" in target: + target["strings_positive"] = [ + _i for _i, _j in zip(target["strings_positive"], keep) if _j + ] + + return cropped_image, target + + +def hflip(image, target): + flipped_image = F.hflip(image) + + w, h = image.size + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor( + [w, 0, w, 0] + ) + target["boxes"] = boxes + + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + + return flipped_image, target + + +def resize(image, target, size, max_size=None): + # size can be min_size (scalar) or (w, h) tuple + + def get_size_with_aspect_ratio(image_size, size, max_size=None): + w, h = image_size + if max_size is not None: + min_original_size = float(min((w, h))) + max_original_size = float(max((w, h))) + if max_original_size / min_original_size * size > max_size: + size = int(round(max_size * min_original_size / max_original_size)) + + if (w <= h and w == size) or (h <= w and h == size): + return (h, w) + + if w < h: + ow = size + oh = int(size * h / w) + else: + oh = size + ow = int(size * w / h) + + return (oh, ow) + + def get_size(image_size, size, max_size=None): + if isinstance(size, (list, tuple)): + return size[::-1] + else: + return get_size_with_aspect_ratio(image_size, size, max_size) + + size = get_size(image.size, size, max_size) + rescaled_image = F.resize(image, size) + + if target is None: + return rescaled_image, None + + ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) + ratio_width, ratio_height = ratios + + target = target.copy() + if "boxes" in target: + boxes = target["boxes"] + scaled_boxes = boxes * torch.as_tensor( + [ratio_width, ratio_height, ratio_width, ratio_height] + ) + target["boxes"] = scaled_boxes + + if "area" in target: + area = target["area"] + scaled_area = area * (ratio_width * ratio_height) + target["area"] = scaled_area + + h, w = size + target["size"] = torch.tensor([h, w]) + + if "masks" in target: + target["masks"] = ( + interpolate(target["masks"][:, None].float(), size, mode="nearest")[:, 0] > 0.5 + ) + + return rescaled_image, target + + +def pad(image, target, padding): + # assumes that we only pad on the bottom right corners + padded_image = F.pad(image, (0, 0, padding[0], padding[1])) + if target is None: + return padded_image, None + target = target.copy() + # should we do something wrt the original size? 
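+    # record the padded (h, w); boxes are left unchanged because padding is only added on the bottom/right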
+ target["size"] = torch.tensor(padded_image.size[::-1]) + if "masks" in target: + target["masks"] = torch.nn.functional.pad(target["masks"], (0, padding[0], 0, padding[1])) + return padded_image, target + + +class ResizeDebug(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + return resize(img, target, self.size) + + +class RandomCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + region = T.RandomCrop.get_params(img, self.size) + return crop(img, target, region) + + +class RandomSizeCrop(object): + def __init__(self, min_size: int, max_size: int, respect_boxes: bool = False): + # respect_boxes: True to keep all boxes + # False to tolerence box filter + self.min_size = min_size + self.max_size = max_size + self.respect_boxes = respect_boxes + + def __call__(self, img: PIL.Image.Image, target: dict): + init_boxes = len(target["boxes"]) + max_patience = 10 + for i in range(max_patience): + w = random.randint(self.min_size, min(img.width, self.max_size)) + h = random.randint(self.min_size, min(img.height, self.max_size)) + region = T.RandomCrop.get_params(img, [h, w]) + result_img, result_target = crop(img, target, region) + if ( + not self.respect_boxes + or len(result_target["boxes"]) == init_boxes + or i == max_patience - 1 + ): + return result_img, result_target + return result_img, result_target + + +class CenterCrop(object): + def __init__(self, size): + self.size = size + + def __call__(self, img, target): + image_width, image_height = img.size + crop_height, crop_width = self.size + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) + + +class RandomHorizontalFlip(object): + def __init__(self, p=0.5): + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return hflip(img, target) + return img, target + + +class RandomResize(object): + def __init__(self, sizes, max_size=None): + assert isinstance(sizes, (list, tuple)) + self.sizes = sizes + self.max_size = max_size + + def __call__(self, img, target=None): + size = random.choice(self.sizes) + return resize(img, target, size, self.max_size) + + +class RandomPad(object): + def __init__(self, max_pad): + self.max_pad = max_pad + + def __call__(self, img, target): + pad_x = random.randint(0, self.max_pad) + pad_y = random.randint(0, self.max_pad) + return pad(img, target, (pad_x, pad_y)) + + +class RandomSelect(object): + """ + Randomly selects between transforms1 and transforms2, + with probability p for transforms1 and (1 - p) for transforms2 + """ + + def __init__(self, transforms1, transforms2, p=0.5): + self.transforms1 = transforms1 + self.transforms2 = transforms2 + self.p = p + + def __call__(self, img, target): + if random.random() < self.p: + return self.transforms1(img, target) + return self.transforms2(img, target) + + +class ToTensor(object): + def __call__(self, img, target): + return F.to_tensor(img), target + + +class RandomErasing(object): + def __init__(self, *args, **kwargs): + self.eraser = T.RandomErasing(*args, **kwargs) + + def __call__(self, img, target): + return self.eraser(img), target + + +class Normalize(object): + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, image, target=None): + image = F.normalize(image, mean=self.mean, std=self.std) + if target is None: + return image, None + target = target.copy() + h, 
w = image.shape[-2:] + if "boxes" in target: + boxes = target["boxes"] + boxes = box_xyxy_to_cxcywh(boxes) + boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) + target["boxes"] = boxes + return image, target + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + def __repr__(self): + format_string = self.__class__.__name__ + "(" + for t in self.transforms: + format_string += "\n" + format_string += " {0}".format(t) + format_string += "\n)" + return format_string diff --git a/fengshen/models/Lyrics/groundingdino/utils.py b/fengshen/models/Lyrics/groundingdino/utils.py new file mode 100644 index 0000000..adaf263 --- /dev/null +++ b/fengshen/models/Lyrics/groundingdino/utils.py @@ -0,0 +1,471 @@ +# ------------------------------------------------------------------------ +# Grounding DINO +# url: https://github.com/IDEA-Research/GroundingDINO +# Copyright (c) 2023 IDEA. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------ + +import copy +import math + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from typing import List, Optional +from collections import OrderedDict +import torch.distributed as dist +import torchvision + + +def _get_clones(module, N, layer_share=False): + # import ipdb; ipdb.set_trace() + if layer_share: + return nn.ModuleList([module for i in range(N)]) + else: + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def get_sine_pos_embed( + pos_tensor: torch.Tensor, + num_pos_feats: int = 128, + temperature: int = 10000, + exchange_xy: bool = True, +): + """generate sine position embedding from a position tensor + Args: + pos_tensor (torch.Tensor): shape: [..., n]. + num_pos_feats (int): projected shape for each float in the tensor. + temperature (int): temperature in the sine/cosine function. + exchange_xy (bool, optional): exchange pos x and pos y. \ + For example, input tensor is [x,y], the results will be [pos(y), pos(x)]. Defaults to True. + Returns: + pos_embed (torch.Tensor): shape: [..., n*num_pos_feats]. 
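        Example (an illustrative sketch, not from the original source; assumes a 3-D tensor of
        [x, y] reference points, with sizes made up for demonstration):
            >>> import torch
            >>> pos = torch.rand(2, 5, 2)                        # [bs, n_query, 2]
            >>> get_sine_pos_embed(pos, num_pos_feats=128).shape
            torch.Size([2, 5, 256])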
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_res = [sine_func(x) for x in pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1)] + if exchange_xy: + pos_res[0], pos_res[1] = pos_res[1], pos_res[0] + pos_res = torch.cat(pos_res, dim=-1) + return pos_res + + +def gen_encoder_output_proposals( + memory: Tensor, memory_padding_mask: Tensor, spatial_shapes: Tensor, learnedwh=None +): + """ + Input: + - memory: bs, \sum{hw}, d_model + - memory_padding_mask: bs, \sum{hw} + - spatial_shapes: nlevel, 2 + - learnedwh: 2 + Output: + - output_memory: bs, \sum{hw}, d_model + - output_proposals: bs, \sum{hw}, 4 + """ + N_, S_, C_ = memory.shape + proposals = [] + _cur = 0 + for lvl, (H_, W_) in enumerate(spatial_shapes): + mask_flatten_ = memory_padding_mask[:, _cur : (_cur + H_ * W_)].view(N_, H_, W_, 1) + valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + # import ipdb; ipdb.set_trace() + + grid_y, grid_x = torch.meshgrid( + torch.linspace(0, H_ - 1, H_, dtype=torch.float32, device=memory.device), + torch.linspace(0, W_ - 1, W_, dtype=torch.float32, device=memory.device), + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) # H_, W_, 2 + + scale = torch.cat([valid_W.unsqueeze(-1), valid_H.unsqueeze(-1)], 1).view(N_, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + + if learnedwh is not None: + # import ipdb; ipdb.set_trace() + wh = torch.ones_like(grid) * learnedwh.sigmoid() * (2.0**lvl) + else: + wh = torch.ones_like(grid) * 0.05 * (2.0**lvl) + + # scale = torch.cat([W_[None].unsqueeze(-1), H_[None].unsqueeze(-1)], 1).view(1, 1, 1, 2).repeat(N_, 1, 1, 1) + # grid = (grid.unsqueeze(0).expand(N_, -1, -1, -1) + 0.5) / scale + # wh = torch.ones_like(grid) / scale + proposal = torch.cat((grid, wh), -1).view(N_, -1, 4) + proposals.append(proposal) + _cur += H_ * W_ + # import ipdb; ipdb.set_trace() + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all( + -1, keepdim=True + ) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # unsigmoid + output_proposals = output_proposals.masked_fill(memory_padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + output_memory = memory + output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float(0)) + output_memory = output_memory.masked_fill(~output_proposals_valid, float(0)) + + # output_memory = output_memory.masked_fill(memory_padding_mask.unsqueeze(-1), float('inf')) + # output_memory = output_memory.masked_fill(~output_proposals_valid, float('inf')) + + return output_memory, output_proposals + + +class RandomBoxPerturber: + def __init__( + self, x_noise_scale=0.2, y_noise_scale=0.2, w_noise_scale=0.2, h_noise_scale=0.2 + ) -> None: + self.noise_scale = torch.Tensor( + [x_noise_scale, y_noise_scale, w_noise_scale, h_noise_scale] + ) + + def __call__(self, refanchors: Tensor) -> Tensor: + nq, bs, query_dim = refanchors.shape + device = refanchors.device + + noise_raw = torch.rand_like(refanchors) + noise_scale = 
self.noise_scale.to(device)[:query_dim] + + new_refanchors = refanchors * (1 + (noise_raw - 0.5) * noise_scale) + return new_refanchors.clamp_(0, 1) + + +def sigmoid_focal_loss( + inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2, no_reduction=False +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). + gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + if no_reduction: + return loss + + return loss.mean(1).sum() / num_boxes + + +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + + def forward(self, x): + if self.layers[0].weight.dtype == torch.half: + x = x.half() + elif self.layers[0].weight.dtype == torch.bfloat16: + x = torch.tensor(x, dtype=torch.bfloat16) + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +def _get_activation_fn(activation, d_model=256, batch_dim=0): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + if activation == "prelu": + return nn.PReLU() + if activation == "selu": + return F.selu + + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def gen_sineembed_for_position(pos_tensor): + # n_query, bs, _ = pos_tensor.size() + # sineembed_tensor = torch.zeros(n_query, bs, 256) + scale = 2 * math.pi + dim_t = torch.arange(128, dtype=torch.float32, device=pos_tensor.device) + dim_t = 10000 ** (2 * (torch.div(dim_t, 2, rounding_mode='floor')) / 128) + x_embed = pos_tensor[:, :, 0] * scale + y_embed = pos_tensor[:, :, 1] * scale + pos_x = x_embed[:, :, None] / dim_t + pos_y = y_embed[:, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2) + pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2) + if pos_tensor.size(-1) == 2: + pos = torch.cat((pos_y, pos_x), dim=2) + elif pos_tensor.size(-1) == 4: + w_embed = pos_tensor[:, :, 2] * scale + pos_w = w_embed[:, :, None] / dim_t + pos_w = torch.stack((pos_w[:, :, 0::2].sin(), pos_w[:, :, 1::2].cos()), dim=3).flatten(2) + + h_embed = pos_tensor[:, :, 3] * scale + pos_h = h_embed[:, :, None] / dim_t + pos_h = torch.stack((pos_h[:, :, 0::2].sin(), pos_h[:, :, 1::2].cos()), dim=3).flatten(2) + + pos = torch.cat((pos_y, pos_x, pos_w, pos_h), dim=2) + else: + raise ValueError("Unknown pos_tensor 
shape(-1):{}".format(pos_tensor.size(-1))) + return pos + + +class ContrastiveEmbed(nn.Module): + def __init__(self, max_text_len=256): + """ + Args: + max_text_len: max length of text. + """ + super().__init__() + self.max_text_len = max_text_len + + def forward(self, x, text_dict): + """_summary_ + + Args: + x (_type_): _description_ + text_dict (_type_): _description_ + { + 'encoded_text': encoded_text, # bs, 195, d_model + 'text_token_mask': text_token_mask, # bs, 195 + # True for used tokens. False for padding tokens + } + Returns: + _type_: _description_ + """ + assert isinstance(text_dict, dict) + + y = text_dict["encoded_text"] + text_token_mask = text_dict["text_token_mask"] + # print('y:', y.shape) + # print('text_token_mask:', text_token_mask.shape) + # print('layer_hs:', x.shape) + res = x @ y.transpose(-1, -2) + res.masked_fill_(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_res = torch.full((*res.shape[:-1], self.max_text_len), float("-inf"), device=res.device) + new_res[..., : res.shape[-1]] = res + + return new_res + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # 不包含的地方是0 + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1 / x2) + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + if mask == "auto": + self.mask = torch.zeros_like(tensors).to(tensors.device) + if self.mask.dim() == 3: + self.mask = self.mask.sum(0).to(bool) + elif self.mask.dim() == 4: + self.mask = self.mask.sum(1).to(bool) + else: + raise ValueError( + "tensors dim must be 3 or 4 but {}({})".format( + self.tensors.dim(), self.tensors.shape + ) + ) + + def imgsize(self): + res = [] + for i in range(self.tensors.shape[0]): + mask = self.mask[i] + maxH = (~mask).sum(0).max() + maxW = (~mask).sum(1).max() + res.append(torch.Tensor([maxH, maxW])) + return res + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def to_img_list_single(self, tensor, mask): + assert tensor.dim() == 3, "dim of tensor should be 3 but {}".format(tensor.dim()) + maxH = (~mask).sum(0).max() + maxW = (~mask).sum(1).max() + img = tensor[:, :maxH, :maxW] + return img + + def to_img_list(self): + """remove the padding and 
convert to img list + + Returns: + [type]: [description] + """ + if self.tensors.dim() == 3: + return self.to_img_list_single(self.tensors, self.mask) + else: + res = [] + for i in range(self.tensors.shape[0]): + tensor_i = self.tensors[i] + mask_i = self.mask[i] + res.append(self.to_img_list_single(tensor_i, mask_i)) + return res + + @property + def device(self): + return self.tensors.device + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + @property + def shape(self): + return {"tensors.shape": self.tensors.shape, "mask.shape": self.mask.shape} + +@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + max_size = [] + for i in range(tensor_list[0].dim()): + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) + max_size.append(max_size_i) + max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + +def clean_state_dict(state_dict): + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k[:7] == "module.": + k = k[7:] # remove `module.` + new_state_dict[k] = v + return new_state_dict + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + +def is_main_process(): + return get_rank() == 0 + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + +__torchvision_need_compat_flag = float(torchvision.__version__.split(".")[1]) < 7 +if __torchvision_need_compat_flag: + from torchvision.ops import _new_empty_tensor + from torchvision.ops.misc import _output_size + +def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corners=None): + # type: (Tensor, Optional[List[int]], Optional[float], str, Optional[bool]) -> Tensor + """ + Equivalent to nn.functional.interpolate, but with support for empty batch sizes. + This will eventually be supported natively by PyTorch, and this + class can go away. 
+ """ + if __torchvision_need_compat_flag < 0.7: + if input.numel() > 0: + return torch.nn.functional.interpolate(input, size, scale_factor, mode, align_corners) + + output_shape = _output_size(2, input, size, scale_factor) + output_shape = list(input.shape[:-2]) + list(output_shape) + return _new_empty_tensor(input, output_shape) + else: + return torchvision.ops.misc.interpolate(input, size, scale_factor, mode, align_corners) \ No newline at end of file diff --git a/fengshen/models/Lyrics/modeling_lyrics.py b/fengshen/models/Lyrics/modeling_lyrics.py new file mode 100644 index 0000000..52593bf --- /dev/null +++ b/fengshen/models/Lyrics/modeling_lyrics.py @@ -0,0 +1,2235 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch BLIP-2 model.""" + +import math +import re +from dataclasses import dataclass +from typing import Callable, Optional, Tuple, Union, List +import warnings +import random +import torchvision +import torch +import torch.utils.checkpoint +import copy +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.nn.utils.rnn import pad_sequence +import torch.nn.functional as F +import torch.distributed as dist +from transformers.activations import ACT2FN +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, +) +from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from transformers.utils import ( + ModelOutput, + logging, +) +# from transformers.models.blip_2.configuration_blip_2 import Blip2Config, Blip2QFormerConfig +from transformers.models.blip_2.modeling_blip_2 import Blip2ForConditionalGenerationModelOutput +from transformers import ( + Blip2PreTrainedModel, + Blip2VisionModel, + AutoModelForCausalLM, + AutoModelForSeq2SeqLM, + Blip2QFormerModel, + PreTrainedTokenizer, + LogitsProcessorList, + LogitsProcessor, + StoppingCriteriaList, + GenerationConfig, +) +from fengshen.models.Lyrics.groundingdino.modeling_groundingdino import GroundingDINO +from fengshen.models.Lyrics.ram.models.ram import RAM +from fengshen.models.Lyrics.configuration_lyrics import LyricsConfig, LyricsQFormerConfig + + +logger = logging.get_logger(__name__) + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding( + config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id + ) + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, 
config.hidden_size + ) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + # transformer为layernorm, lavis为LayerNorm + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)) + ) + self.position_embedding_type = getattr( + config, "position_embedding_type", "absolute" + ) + + self.config = config + + def forward( + self, + input_ids=None, + position_ids=None, + query_embeds=None, + past_key_values_length=0, + ): + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[ + :, past_key_values_length: seq_length + past_key_values_length + ].clone() + # print(position_ids) + + if input_ids is not None: + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings = embeddings + position_embeddings + + if query_embeds is not None: + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class LyricsQFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, is_detection=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention and is_detection: + self.key = nn.Linear(config.detection_encoder_hidden_size, self.all_head_size) # 260, 256 + 4 + self.value = nn.Linear(config.detection_encoder_hidden_size, self.all_head_size) + elif is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): 
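# A minimal standalone sketch (example sizes, not the model's real config) of what
# transpose_for_scores computes: [batch, seq, hidden] -> [batch, heads, seq, head_dim].
import torch
states = torch.randn(2, 10, 768)                          # hidden_size = 768
heads = states.view(2, 10, 12, 64).permute(0, 2, 1, 3)    # 12 heads x 64 dims per head
assert heads.shape == (2, 12, 10, 64)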
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + if self.key.weight.dtype == torch.half: + encoder_hidden_states = encoder_hidden_states.half() + # encoder_hidden_states = encoder_hidden_states + elif self.key.weight.dtype == torch.bfloat16: + encoder_hidden_states = torch.tensor(encoder_hidden_states, dtype=torch.bfloat16) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, + device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, + device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding( + distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to( + dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum( + "bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum( + "bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. 
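# Hedged aside on the additive attention mask applied just above: masked positions carry
# a large negative bias, so after the softmax their weight is effectively zero (toy values).
import torch
scores = torch.tensor([[1.0, 2.0, 3.0]])
additive_mask = torch.tensor([[0.0, 0.0, -10000.0]])     # last position is padding
probs = torch.softmax(scores + additive_mask, dim=-1)    # ~[0.27, 0.73, 0.00]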
+ attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Blip2QFormer +class LyricsQFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class LyricsQFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False, is_detection = False): + super().__init__() + self.attention = LyricsQFormerMultiHeadAttention(config, is_cross_attention, is_detection) + self.output = LyricsQFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied 
from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Blip2QFormer +class LyricsQFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Blip2QFormer +class LyricsQFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class LyricsQFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = LyricsQFormerAttention(config) + + self.layer_idx = layer_idx + self.num_vit_query_tokens = config.num_vit_query_tokens + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = LyricsQFormerAttention(config, is_cross_attention=True) + self.detection_crossattention = LyricsQFormerAttention(config, is_cross_attention=True, is_detection=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + self.intermediate = LyricsQFormerIntermediate(config) + self.output = LyricsQFormerOutput(config) + + self.intermediate_query = LyricsQFormerIntermediate(config) + self.output_query = LyricsQFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError( + "encoder_hidden_states must be given for cross-attention layers") + if detection_encoder_hidden_states is None: + raise ValueError( + "detection_encoder_hidden_states must be given for cross-attention layers") + if attention_mask is not None: + cross_attention_mask = attention_mask[:, :self.num_vit_query_tokens] + detection_cross_attention_mask = attention_mask[:, self.num_vit_query_tokens:] + else: + cross_attention_mask = None + 
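# Hedged sketch of the split this layer performs (query count and sizes are made-up values):
# the first num_vit_query_tokens query states cross-attend to the ViT features, the remaining
# ones to the Grounding-DINO features, and the two halves are concatenated again afterwards.
import torch
num_vit_query_tokens = 32
query_states = torch.randn(2, 48, 768)                    # 32 ViT queries + 16 detection queries
vit_queries = query_states[:, :num_vit_query_tokens, :]   # -> image cross-attention
det_queries = query_states[:, num_vit_query_tokens:, :]   # -> detection cross-attention
rejoined = torch.cat([vit_queries, det_queries], dim=1)   # back to [2, 48, 768]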
detection_cross_attention_mask = None + cross_attention_outputs = self.crossattention( + query_attention_output[:, :self.num_vit_query_tokens, :], + cross_attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + vit_query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + query_attention_probs = cross_attention_outputs[1:-1] + + detection_cross_attention_outputs = self.detection_crossattention( + query_attention_output[:, self.num_vit_query_tokens:, :], + detection_cross_attention_mask, + head_mask, + detection_encoder_hidden_states, + detection_encoder_attention_mask, + output_attentions=output_attentions, + ) + detection_query_attention_output = detection_cross_attention_outputs[0] + # add cross attentions if we output attention weights + detection_query_attention_probs = detection_cross_attention_outputs[1:-1] + + if output_attentions == True: + padding_attention = torch.zeros((query_attention_probs[0].size(0), + query_attention_probs[0].size(1), + detection_query_attention_probs[0].size(2) - query_attention_probs[0].size(2))) + query_attention_probs = torch.cat([query_attention_probs[0], padding_attention], dim = -1) + + outputs = outputs + (torch.cat([query_attention_probs[0], detection_query_attention_probs], dim=1),) + else: + outputs = outputs + cross_attention_outputs[1:-1] + + query_attention_output = torch.cat([vit_query_attention_output, detection_query_attention_output], dim=1) + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + # present_key_value是self attention的key,value, 用于在decoder中以前的词的key,value不用再重复计算 + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class LyricsQFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [LyricsQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if 
output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions, query_length) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + detection_encoder_hidden_states, + detection_encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + detection_encoder_hidden_states, + detection_encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + # 这里的cross attention是经过pad的 + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class LyricsQFormerModel(Blip2PreTrainedModel): + """ + Querying Transformer (Q-Former), used in BLIP-2. + """ + + def __init__(self, config: LyricsQFormerConfig): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + # self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = LyricsQFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + is_decoder: bool, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. 
+ input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = ( + seq_ids[None, None, :].repeat(batch_size, seq_length, 1) + <= seq_ids[None, :, None] + ) + + # add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + if has_query: # UniLM style attention mask + causal_mask = torch.cat( + [ + torch.zeros( + (batch_size, prefix_seq_len, seq_length), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=1, + ) + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, causal_mask.shape[1], prefix_seq_len), + device=device, + dtype=causal_mask.dtype, + ), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to( + dtype=self.dtype + ) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + query_embeds=None, + attention_mask=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + ): + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. 
This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is None: + assert ( + query_embeds is not None + ), "You have to specify query_embeds when input_ids is None" + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - + self.config.query_length if past_key_values is not None else 0 + ) + + query_length = query_embeds.shape[1] if query_embeds is not None else 0 + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + query_embeds=query_embeds, + past_key_values_length=past_key_values_length, + ) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + if attention_mask is None: + attention_mask = torch.ones( + ((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
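# Hedged sketch of the decoder-style mask that get_extended_attention_mask (called just
# below) builds, for a toy seq_length of 4: entry [i, j] is 1 when position i may attend
# to position j (j <= i), and the mask is later turned into an additive bias of 0 / -10000.
import torch
seq_length = 4
seq_ids = torch.arange(seq_length)
causal = (seq_ids[None, None, :].repeat(1, seq_length, 1) <= seq_ids[None, :, None]).float()
additive = (1.0 - causal[:, None, :, :]) * -10000.0       # shape [1, 1, 4, 4]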
+ if is_decoder: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, + input_ids.shape, + device, + is_decoder, + has_query=(query_embeds is not None), + ) + else: + extended_attention_mask = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + ( + encoder_batch_size, + encoder_sequence_length, + _, + ) = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + if detection_encoder_hidden_states is not None: + if type(detection_encoder_hidden_states) == list: + detection_encoder_batch_size, detection_encoder_sequence_length, _ = detection_encoder_hidden_states[0].size() + else: + ( + detection_encoder_batch_size, + detection_encoder_sequence_length, + _, + ) = detection_encoder_hidden_states.size() + detection_encoder_hidden_shape = (detection_encoder_batch_size, detection_encoder_sequence_length) + + if type(detection_encoder_attention_mask) == list: + detection_encoder_extended_attention_mask = [ + self.invert_attention_mask(mask) for mask in detection_encoder_attention_mask] + elif detection_encoder_attention_mask is None: + detection_encoder_attention_mask = torch.ones(detection_encoder_hidden_shape, device=device) + detection_encoder_extended_attention_mask = self.invert_attention_mask(detection_encoder_attention_mask) + else: + detection_encoder_extended_attention_mask = self.invert_attention_mask(detection_encoder_attention_mask) + else: + detection_encoder_extended_attention_mask = None + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + detection_encoder_hidden_states=detection_encoder_hidden_states, + detection_encoder_attention_mask=detection_encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + 
last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config: LyricsQFormerConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + +# 把注意力方式改一下,就可以做MLM了 +class LyricsQFormerOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class LyricsQFormerWithLMHead(Blip2PreTrainedModel): + base_model_prefix = "bert" + + def __init__(self, config: LyricsQFormerConfig): + super().__init__(config) + + self.bert = LyricsQFormerModel(config) + self.cls = LyricsQFormerOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + query_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + detection_encoder_hidden_states=None, + detection_encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + reduction="mean", + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. 
Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are + ignored (masked), the loss is only computed for the tokens with labels n ``[0, ..., config.vocab_size]`` + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if labels is not None: + use_cache = False + if past_key_values is not None: + query_embeds = None + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + query_embeds=query_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + detection_encoder_hidden_states=detection_encoder_hidden_states, + detection_encoder_attention_mask=detection_encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + if query_embeds is not None: + sequence_output = outputs[0][:, query_embeds.shape[1]:, :] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + # 区分is_decoder + # 没mask掉的用-100代替 + lm_loss = None + if is_decoder == True: + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct( + shifted_prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + else: + 
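# Hedged side-by-side of the two loss modes in this head (toy sizes, no label smoothing,
# CLS handling omitted): the causal branch above shifts logits/labels by one position, while
# the MLM branch below keeps positions aligned and relies on -100 labels to skip tokens.
import torch
from torch.nn import CrossEntropyLoss
vocab = 100
logits = torch.randn(2, 6, vocab)
labels = torch.randint(0, vocab, (2, 6))
lm_loss = CrossEntropyLoss()(logits[:, :-1].reshape(-1, vocab), labels[:, 1:].reshape(-1))
mlm_labels = labels.clone()
mlm_labels[:, ::2] = -100                                 # positions labeled -100 contribute no loss
mlm_loss = CrossEntropyLoss()(logits.reshape(-1, vocab), mlm_labels.reshape(-1))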
if labels is not None: + # we are doing mask prediction; no shift is needed, but do not compute the loss on the cls_token + # bs, seq, vocab + prediction_scores = prediction_scores[:, 1:, :].contiguous() + # bs, seq + labels = labels[:, 1:].contiguous() + # print('prediction_scores:', prediction_scores.size()) + # print('labels:', labels.size()) + # print('max_labels:', torch.max(labels, dim=1)) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + mlm_loss = loss_fct( + prediction_scores.view(-1, self.config.vocab_size), + labels.view(-1), + ) + if reduction == "none": + mlm_loss = mlm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((mlm_loss,) + output) if mlm_loss is not None else output + + if is_decoder: + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + else: + return MaskedLMOutput( + loss=mlm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs + ): + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_ids.shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += ( + tuple( + past_state.index_select(0, beam_idx) for past_state in layer_past + ), + ) + return reordered_past + + +@dataclass +class LyricsOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + + loss_itc: Optional[torch.FloatTensor] = None + + loss_itm: Optional[torch.FloatTensor] = None + + loss_lm: Optional[torch.FloatTensor] = None + + loss_mlm: Optional[torch.FloatTensor] = None + +# Used to wrap the output of BLIPQFormerWithLMHead for conditional generation +class LyricsQFormerForConditionalGeneration(Blip2PreTrainedModel): + config_class = LyricsConfig + main_input_name = "pixel_values" + + def __init__(self, config: LyricsConfig): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros( + 1, config.num_query_tokens, config.qformer_config.hidden_size)) + + self.qformer = LyricsQFormerWithLMHead(config.qformer_config) + + self.decoder_input_ids = config.qformer_config.bos_token_id + self.decoder_pad_token_id = config.qformer_config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + + return_dict = return_dict if return_dict is not None else 
self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(pixel_values.device) + + query_outputs = self.qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=True, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + lm_output = self.qformer( + input_ids, + attention_mask=attention_mask, + past_key_values=query_outputs.past_key_values, + return_dict=return_dict, + labels=labels, + ) + + if not return_dict: + # assemble a flat tuple from the LM-head outputs followed by the vision features + outputs = lm_output + (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return Blip2ForConditionalGenerationModelOutput( + loss=lm_output.loss, + decoder_logits=lm_output.logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=lm_output, + ) + +class LyricsLMForConditionalGeneration(Blip2PreTrainedModel): + config_class = LyricsConfig + main_input_name = "pixel_values" + + def __init__(self, config: LyricsConfig): + super().__init__(config) + + # token ids are passed in directly, so no tokenizer is needed here + self.vision_model = Blip2VisionModel(config.vision_config) + self.ram = RAM(config.ram_config) + self.grounding_dino = GroundingDINO(config.detection_config) + + self.query_tokens = nn.Parameter(torch.zeros( + 1, config.num_query_tokens, config.qformer_config.hidden_size)) + + # only the encoder part of the Q-Former is used here, not the variant with the cls/LM head + self.qformer = LyricsQFormerModel(config.qformer_config) + + self.language_projection = nn.Linear( + config.qformer_config.hidden_size, config.text_config.hidden_size) + if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config(config.text_config) + else: + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def _preprocess_accelerate(self): + r""" + Some pre-processing hacks to make the model `accelerate` compatible. Check + https://github.com/huggingface/transformers/pull/21707 for more details. 
+ """ + hf_device_map = self.hf_device_map + + if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1: + # warn users about unexpected behavior when using multi-GPU + BLIP-2 + `accelerate`. + logger.warning( + "The `language_model` is not in the `hf_device_map` dictionary and you are running your script" + " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`." + " Please pass a `device_map` that contains `language_model` to remove this warning." + " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for", + " more details on creating a `device_map` for large models.", + ) + + if hasattr(self.language_model, "_hf_hook"): + self.language_model._hf_hook.io_same_device = True # For `generate` compatibility + + def forward( + self, + pixel_values: torch.FloatTensor, + ram_pixel_values: torch.FloatTensor, + grounding_pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: torch.FloatTensor, + labels: torch.FloatTensor = None, + # 因为label不会出现在image之前,所以这里不需要labels_before_image, 按照input_ids_before_image补-100就可以了 + qformer_input_ids: torch.FloatTensor = None, + qformer_attention_mask: torch.FloatTensor = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + image_embeds = vision_outputs[0] + + tags_english, tags_chinese = self.ram.generate_tag(ram_pixel_values) + + input_tags = [tag.replace(' |', ',').lower().strip() + "." 
if not tag.endswith(".") else tag.replace(' |', ',').lower().strip() for tag in tags_english] + + # outputs = self.grounding_dino(grounding_image[None], captions=input_tags) + grounding_outputs = self.grounding_dino(grounding_pixel_values, captions=input_tags) + + detection_image_embeds = grounding_outputs["hidden_state"] # (bs, nq, 256) + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones( + image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + detection_image_attention_mask = torch.ones(detection_image_embeds.size()[:-1], dtype=torch.long).to(pixel_values.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image_embeds.device) + + if qformer_input_ids is None: + # print('no_hava_instruct') + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + else: + # print('hava_instruct') + text_qformer_atts = qformer_attention_mask + qformer_atts = torch.cat([query_atts, text_qformer_atts], dim=1) + query_outputs = self.qformer( + qformer_input_ids, + query_embeds=query_tokens, + attention_mask=qformer_atts, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + query_output = query_outputs[0][:, :query_tokens.size(1), :] + + # print(query_output.size()) + # step 2.5 generate the lm input by prompt and output + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + # make sure language_model_inputs and input_ids agree on the batch size + assert language_model_inputs.shape[0] == input_ids.shape[0] + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + + inputs_embeds = torch.cat( + [ + language_model_inputs, + inputs_embeds.to(language_model_inputs.device) + ], dim=1) + + attention_mask = torch.cat( + [ + language_model_attention_mask, + attention_mask.to(language_model_attention_mask.device) + ], dim=1 + ) + + # labels need the same treatment: prepend -100 for the query positions at the front + if labels is not None: + labels = torch.cat( + [ + torch.tensor([-100]).expand(query_tokens.shape[:-1] + ).to(language_model_inputs.device), + labels, + ], dim=1 + ) + + # step 3: use the language model + + if self.config.use_decoder_only_language_model: + # print('model is a use_decoder_only_language_model') + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + # labels=labels, + ) + + logits = outputs.logits if return_dict else outputs[0] + loss = None + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 
1:].contiguous().to(logits.device) + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + + + else: + raise Exception("not impl") + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + ram_pixel_values: torch.FloatTensor, + grounding_pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: torch.FloatTensor, + qformer_input_ids: torch.FloatTensor = None, + qformer_attention_mask: torch.FloatTensor = None, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + if hasattr(self, "hf_device_map"): + # preprocess for `accelerate` + self._preprocess_accelerate() + # print('data type: ', pixel_values.dtype) + batch_size = pixel_values.shape[0] + image_embeds = self.vision_model(pixel_values, return_dict=True).last_hidden_state + + tags_english, tags_chinese = self.ram.generate_tag(ram_pixel_values) + + input_tags = [tag.replace(' |', ',').lower().strip() + "." 
if not tag.endswith(".") else tag.replace(' |', ',').lower().strip() for tag in tags_english] + + # outputs = self.grounding_dino(grounding_image[None], captions=input_tags) + grounding_outputs = self.grounding_dino(grounding_pixel_values, captions=input_tags) + + detection_image_embeds = grounding_outputs["hidden_state"] # (bs, nq, 256) + + image_attention_mask = torch.ones( + image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + detection_image_attention_mask = torch.ones(detection_image_embeds.size()[:-1], dtype=torch.long).to(pixel_values.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image_embeds.device) + + if qformer_input_ids == None: + # print('no_hava_instruct') + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + ) + query_output = query_outputs[0] + else: + # print('hava_instruct') + if qformer_attention_mask == None: + qformer_attention_mask = torch.ones(qformer_input_ids.size(), dtype=torch.long).to(image_embeds.device) + qformer_atts = torch.cat([query_atts, qformer_attention_mask],dim=1) + query_outputs = self.qformer( + qformer_input_ids, + query_embeds=query_tokens, + attention_mask=qformer_atts, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_attention_mask, + ) + query_output = query_outputs[0][:,:query_tokens.size(1),:] + # print('query_output:', query_output) + # print('query_output_size:', query_output.size()) + + language_model_inputs = self.language_projection(query_output) + language_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + # print('language_model_inputs:', language_model_inputs) + # print('language_model_inputs_size:', language_model_inputs.size()) + + + if attention_mask == None: + assert batch_size == 1 , print('If you do not pass in llm_instruct_atts, you can only be generated in a single sentence.') + attention_mask = torch.ones_like(input_ids) + attention_mask = torch.cat([language_attention_mask, attention_mask], dim=1) + + inputs_embeds = self.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds], dim=1) + # print('inputs_embeds:', inputs_embeds) + # print('attention_mask:', attention_mask) + language_outputs = self.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + **generate_kwargs, + ) + + # outputs = [] + # for index, output in enumerate(language_outputs): + # output = output[inputs_embeds[index].size(0):] + # outputs.append(output) + + # return outputs + return language_outputs + + +class LyricsQFromerForPretrain(Blip2PreTrainedModel): + config_class = LyricsConfig + + def __init__(self, config: LyricsConfig): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + self.ram = RAM(config.ram_config) + self.grounding_dino = GroundingDINO(config.detection_config) + + self.query_tokens = nn.Parameter(torch.zeros( + 1, config.num_query_tokens, config.qformer_config.hidden_size)) + + # 同一个LMhead,不同的任务,加一个参数。或者拼起来。估计要加一个linear + # 
图片256,目标检测900,语义分割4096,怎么可以赋予不同的权重,时间不是问题,权重是问题 + self.qformer = LyricsQFormerWithLMHead(config.qformer_config) + + self.vision_proj = nn.Linear(self.qformer.config.hidden_size, config.image_text_hidden_size) + self.text_proj = nn.Linear(self.qformer.config.hidden_size, config.image_text_hidden_size) + + self.itm_head = nn.Linear(self.qformer.config.hidden_size, 2) + + self.temp = nn.Parameter(0.07 * torch.ones([])) + + self.max_txt_len = 512 # 512-96 = 416 + self.max_input_len = 600 + + # Initialize weights and apply final processing + self.post_init() + + def generate_bbox_caption(self, logits, boxes, english_tags, chinese_tags, language): + # filter output + # 0最大值, 1索引 + bbox_caption = [] + bbox_caption_tokens = [] + bbox_caption_tokens_with_mask = [] + bbox_caption_labels_with_mask = [] + english_tags_list = [[tag.strip() for tag in sentence.split(' |')] for sentence in english_tags] + chinese_tags_list = [[tag.strip() for tag in sentence.split(' |')] for sentence in chinese_tags] + + for ind in range(logits.size(0)): + single_filt_mask = logits[ind].max(dim=1)[0] > self.box_threshold + single_logits_filt = logits[ind][single_filt_mask] # num_filt, 256 + single_boxes_filt = boxes[ind][single_filt_mask] # num_filt, 4 + + if len(single_filt_mask) == 0: + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + + single_image_bbox_caption = '' + single_image_bbox_caption_tokens = [] + single_image_bbox_caption_tokens_with_mask = [] + single_image_bbox_caption_labels_with_mask = [] + # get phrase + tokenized = self.grounding_dino.tokenizer(english_tags[ind]) + # build pred + pred_phrases = [] + single_image_boxes = [] + single_image_scores = [] + for logit, box in zip(single_logits_filt, single_boxes_filt): + posmap = logit > self.text_threshold + assert isinstance(posmap, torch.Tensor), "posmap must be torch.Tensor" + if posmap.dim() == 1: + non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist() + # max_idx = posmap.max[1] + token_ids = [tokenized["input_ids"][i] for i in non_zero_idx] + # token_ids = [tokenized["input_ids"][i] for i in non_zero_idx if i == max_idx] + pred_phrase = self.grounding_dino.tokenizer.decode(token_ids) + else: + raise NotImplementedError("posmap must be 1-dim") + pred_phrases.append(pred_phrase + f"({str(logit.max().item())[:4]})") + single_image_scores.append(logit.max().item()) + # box = box * torch.Tensor([self.config.image_size, self.config.image_size, self.config.image_size, self.config.image_size]) + box[:2] -= box[2:] / 2 + box[2:] += box[:2] + single_image_boxes.append(box) + # print("single_image_boxes:", len(single_image_boxes)) + if len(single_image_boxes) == 0: + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + + single_image_boxes = torch.stack(single_image_boxes) + single_image_scores = torch.Tensor(single_image_scores).to("cuda") + # nms_idx = torchvision.ops.nms(single_image_boxes, single_image_scores, self.iou_threshold).to('cpu').numpy().tolist() + nms_idx = torchvision.ops.nms(single_image_boxes, single_image_scores, self.iou_threshold) + single_image_boxes_filt = single_image_boxes[nms_idx] + pred_phrases = [pred_phrases[idx] for idx in nms_idx] + + # print("single_image_boxes_filt:", single_image_boxes_filt.size()) + # print("pred_phrases:", 
pred_phrases) + if single_image_boxes_filt.size(0) == 0: + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + # 处理一条数据的多个框 + for i in range(single_image_boxes_filt.size(0)): + # ori_box = single_image_boxes_filt[i] / torch.Tensor([self.config.image_size, self.config.image_size, self.config.image_size, self.config.image_size]) + ori_box = single_image_boxes_filt[i] + # ori_box = torch.Tensor([round(coordinate, 3) for coordinate in single_image_boxes_filt[i]]) + name, _ = pred_phrases[i].split('(') + name = name.replace('|', '').strip() + name = re.sub(r'\s-\s', '-', name) + input_name = None + # print('english_tags_list:', english_tags_list[ind]) + # print('name:', name) + if language == 'zh': + for tags_ind in range(len(english_tags_list[ind])): + if name == english_tags_list[ind][tags_ind]: + input_name = chinese_tags_list[ind][tags_ind] + break + if input_name == None: + flag = 0 + for tags_ind in range(len(english_tags_list[ind])): + for name_ind in range(len(name.split()),0,-1): + if ' '.join(name.split()[:name_ind]) == english_tags_list[ind][tags_ind]: + input_name = chinese_tags_list[ind][tags_ind] + flag = 1 + break + for name_ind in range(len(name.split())): + if ' '.join(name.split()[name_ind:]) == english_tags_list[ind][tags_ind]: + input_name = chinese_tags_list[ind][tags_ind] + flag = 1 + break + if flag == 1: + break + if input_name == None: + continue + else: + for tags_ind in range(len(english_tags_list[ind])): + if name == english_tags_list[ind][tags_ind]: + input_name = name + break + if input_name == None: + flag = 0 + for tags_ind in range(len(english_tags_list[ind])): + for name_ind in range(len(name.split()),0,-1): + if ' '.join(name.split()[:name_ind]) == english_tags_list[ind][tags_ind]: + input_name = english_tags_list[ind][tags_ind] + flag = 1 + break + for name_ind in range(len(name.split())): + if ' '.join(name.split()[name_ind:]) == english_tags_list[ind][tags_ind]: + input_name = english_tags_list[ind][tags_ind] + flag = 1 + break + if flag == 1: + break + if input_name == None: + # print('name:', name) + # print('input_name:', input_name) + # print('english_tags_list:', english_tags_list[ind]) + # print('input_name is none') + input_name = name + # if input_name == None: + # bbox_caption.append('') + # bbox_caption_tokens.append(torch.Tensor([])) + # bbox_caption_tokens_with_mask.append(torch.Tensor([])) + # bbox_caption_labels_with_mask.append(torch.Tensor([])) + # continue + + # print('input_name:', input_name) + single_bbox_caption = input_name + ': [' + ', '.join([str(round(coordinate.item(), 3)) for coordinate in ori_box]) + ']' + single_image_bbox_caption = single_image_bbox_caption + ' ' + single_bbox_caption + name_and_bbox_tokens = [] + name_and_bbox_tokens.append(torch.tensor(self.tokenizer(input_name, add_special_tokens=False).input_ids)) + name_and_bbox_tokens.append(torch.tensor(self.tokenizer('[', add_special_tokens=False).input_ids)) + for coordinate in ori_box: + name_and_bbox_tokens.append(torch.tensor(self.tokenizer(str(round(coordinate.item(), 3)), add_special_tokens=False).input_ids)) + name_and_bbox_tokens.append(torch.tensor(self.tokenizer(']', add_special_tokens=False).input_ids)) + + for name_and_bbox_tokens_ind in range(len(name_and_bbox_tokens)): + if name_and_bbox_tokens_ind == 1 or name_and_bbox_tokens_ind == 5: + 
single_image_bbox_caption_tokens_with_mask.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + single_image_bbox_caption_labels_with_mask.append(torch.full_like(name_and_bbox_tokens[name_and_bbox_tokens_ind], -100)) + else: + if random.random() <= 0.15: + single_image_bbox_caption_tokens_with_mask.append(torch.full_like(name_and_bbox_tokens[name_and_bbox_tokens_ind], self.tokenizer.mask_token_id)) + single_image_bbox_caption_labels_with_mask.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + else: + single_image_bbox_caption_tokens_with_mask.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + single_image_bbox_caption_labels_with_mask.append(torch.full_like(name_and_bbox_tokens[name_and_bbox_tokens_ind], -100)) + single_image_bbox_caption_tokens.append(name_and_bbox_tokens[name_and_bbox_tokens_ind]) + + single_image_bbox_caption_tokens.append(torch.tensor([self.tokenizer.sep_token_id])) + single_image_bbox_caption_tokens_with_mask.append(torch.tensor([self.tokenizer.sep_token_id])) + single_image_bbox_caption_labels_with_mask.append(torch.tensor([-100])) + # try: + if single_image_bbox_caption_tokens == '': + bbox_caption.append('') + bbox_caption_tokens.append(torch.Tensor([])) + bbox_caption_tokens_with_mask.append(torch.Tensor([])) + bbox_caption_labels_with_mask.append(torch.Tensor([])) + continue + single_image_bbox_caption_tokens = torch.cat(single_image_bbox_caption_tokens, dim = -1) + # except: + # print('single_bbox_caption:', single_bbox_caption) + # print('single_image_bbox_caption:', single_image_bbox_caption) + # print('name_and_bbox_tokens:', name_and_bbox_tokens) + # print('single_image_boxes:', single_image_boxes) + # print('single_image_boxes_filt:', single_image_boxes_filt) + # print('single_image_bbox_caption_tokens:', single_image_bbox_caption_tokens) + # print('single_image_boxes_filt:', single_image_boxes_filt.size(1)) + single_image_bbox_caption_tokens_with_mask = torch.cat(single_image_bbox_caption_tokens_with_mask, dim = -1) + single_image_bbox_caption_labels_with_mask = torch.cat(single_image_bbox_caption_labels_with_mask, dim = -1) + + bbox_caption.append(single_image_bbox_caption) + bbox_caption_tokens.append(single_image_bbox_caption_tokens) + bbox_caption_tokens_with_mask.append(single_image_bbox_caption_tokens_with_mask) + bbox_caption_labels_with_mask.append(single_image_bbox_caption_labels_with_mask) + # if torch.distributed.get_rank() == 0: + # print(bbox_caption[0]) + # print(bbox_caption_tokens[0]) + # print(self.tokenizer.decode(bbox_caption_tokens[0])) + # exit() + outputs = {'bbox_caption': bbox_caption, + 'bbox_caption_tokens': bbox_caption_tokens, + 'bbox_caption_tokens_with_mask': bbox_caption_tokens_with_mask, + 'bbox_caption_labels_with_mask': bbox_caption_labels_with_mask, + } + + return outputs + + def prepare_inputs_for_pretrain(self, captions, bbox_caption_tokens, bbox_caption_tokens_with_mask, bbox_caption_labels_with_mask): + text_input_tokens_ids = [] + text_input_tokens_ids_with_mask = [] + text_input_labels = [] + text_input_labels_with_mask = [] + text_input_attentions = [] + text_input_attentions_with_mask = [] + text_input_position_ids = [] + text_input_position_ids_with_mask = [] + batch_size = len(captions) + for ind, caption in enumerate(captions): + if len(caption)>500: + print(caption) + single_caption_tokens = torch.tensor(self.tokenizer(caption, add_special_tokens=False, truncation=True, max_length=87).input_ids) + + # 单纯caption的label + single_caption_labels = torch.tensor(single_caption_tokens) + 
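# Simplified standalone sketch of the random-masking scheme used in generate_bbox_caption above
# when building the *_with_mask token and label streams: each maskable chunk is replaced by the
# mask token id with probability 0.15, and the labels keep the original ids only at the masked
# positions (-100 everywhere else, so the MLM loss ignores them). In the loop above the chunks at
# indices 1 and 5 are never masked; mask_token_id=103 is just a placeholder, not the model's value.
import random
import torch

def mask_chunks(chunks, mask_token_id=103, p=0.15):
    tokens_with_mask, labels_with_mask = [], []
    for chunk in chunks:                      # each chunk is a 1-D LongTensor of token ids
        if random.random() <= p:
            tokens_with_mask.append(torch.full_like(chunk, mask_token_id))
            labels_with_mask.append(chunk)    # supervise the original ids at the masked slots
        else:
            tokens_with_mask.append(chunk)
            labels_with_mask.append(torch.full_like(chunk, -100))
    return torch.cat(tokens_with_mask), torch.cat(labels_with_mask)

toy_chunks = [torch.tensor([7, 8]), torch.tensor([9]), torch.tensor([10, 11, 12])]
masked_tokens, masked_labels = mask_chunks(toy_chunks)
print(masked_tokens.tolist(), masked_labels.tolist())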
single_caption_labels_with_mask = torch.full_like(single_caption_tokens, -100) + + # 无框 + if bbox_caption_tokens[ind].size(0) == 0: + single_text_input_tokens_ids = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), single_caption_tokens]) + single_text_input_tokens_ids_with_mask = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), single_caption_tokens]) + text_input_tokens_ids.append(single_text_input_tokens_ids) + text_input_tokens_ids_with_mask.append(single_text_input_tokens_ids_with_mask) + text_input_labels.append(torch.cat([torch.tensor([self.tokenizer.bos_token_id]), single_caption_labels])) + text_input_labels_with_mask.append(torch.cat([torch.tensor([-100]), single_caption_labels_with_mask])) + text_input_attentions.append(torch.ones_like(single_text_input_tokens_ids)) + text_input_attentions_with_mask.append(torch.ones_like(single_text_input_tokens_ids_with_mask)) + text_input_position_ids.append(torch.cat([torch.Tensor([0]), torch.arange(1, len(single_caption_tokens)+1)])) + text_input_position_ids_with_mask.append(torch.cat([torch.Tensor([0]), torch.arange(1, len(single_caption_tokens)+1)])) + continue + # 拼接bbox的token和text的token和label + if len(bbox_caption_tokens[ind]) > self.max_txt_len - 1: + bbox_caption_tokens[ind] = bbox_caption_tokens[ind][:self.max_txt_len-1] + bbox_caption_tokens_with_mask[ind] = bbox_caption_tokens_with_mask[ind][:self.max_txt_len-1] + bbox_caption_labels_with_mask[ind] = bbox_caption_labels_with_mask[ind][:self.max_txt_len-1] + + # LM任务的label + single_bbox_caption_labels = torch.full_like(bbox_caption_tokens[ind], -100) + + single_text_input_tokens_ids = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), bbox_caption_tokens[ind], single_caption_tokens]) + single_text_input_tokens_ids_with_mask = torch.cat([torch.tensor([self.tokenizer.cls_token_id]), bbox_caption_tokens_with_mask[ind], single_caption_tokens]) + + #这里也要改 cls 0 其他初始位置1 + single_text_input_position_ids = torch.cat([torch.Tensor([0]), torch.arange(1, len(bbox_caption_tokens[ind])+1), torch.arange(1, len(single_caption_tokens)+1)]) + single_text_input_position_ids_with_mask = torch.cat([torch.Tensor([0]), torch.arange(1, len(bbox_caption_tokens_with_mask[ind])+1), torch.arange(1, len(single_caption_tokens)+1)]) + + single_text_input_labels = torch.cat([torch.tensor([self.tokenizer.bos_token_id]), single_bbox_caption_labels, single_caption_labels]) + single_text_input_labels_with_mask = torch.cat([torch.tensor([-100]), bbox_caption_labels_with_mask[ind], single_caption_labels_with_mask]) + + # mask与pad不一样,还是要做注意力 + single_text_input_attentions = torch.ones_like(single_text_input_tokens_ids) + single_text_input_attentions_with_mask = torch.ones_like(single_text_input_tokens_ids_with_mask) + # position_ids + # if single_text_input_tokens_ids.size(-1) > self.max_txt_len: + # single_text_input_tokens_ids = single_text_input_tokens_ids[:self.max_txt_len] + # single_text_input_tokens_ids_with_mask = single_text_input_tokens_ids_with_mask[:self.max_txt_len] + # single_text_input_labels = single_text_input_labels[:self.max_txt_len] + # single_text_input_labels_with_mask = single_text_input_labels_with_mask[:self.max_txt_len] + # single_text_input_attentions = single_text_input_attentions[:self.max_txt_len] + # single_text_input_attentions_with_mask = single_text_input_attentions_with_mask[:self.max_txt_len] + text_input_tokens_ids.append(single_text_input_tokens_ids) + text_input_tokens_ids_with_mask.append(single_text_input_tokens_ids_with_mask) + 
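# Standalone sketch of the padding trick applied after this loop: a dummy row of length
# max_input_len is appended before pad_sequence so that every real sequence is padded up to
# max_input_len, and the dummy row is then dropped again by slicing back to batch_size.
# All values here are toy numbers.
import torch
from torch.nn.utils.rnn import pad_sequence

max_input_len, pad_id = 8, 0
seqs = [torch.tensor([5, 6, 7]), torch.tensor([8, 9])]    # real batch (batch_size = 2)
batch_size = len(seqs)

seqs.append(torch.ones(max_input_len, dtype=torch.long))  # dummy row forces the target width
padded = pad_sequence(seqs, batch_first=True, padding_value=pad_id)[:batch_size, :max_input_len]
print(padded.shape)     # torch.Size([2, 8])
print(padded.tolist())  # [[5, 6, 7, 0, 0, 0, 0, 0], [8, 9, 0, 0, 0, 0, 0, 0]]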
text_input_labels.append(single_text_input_labels) + text_input_labels_with_mask.append(single_text_input_labels_with_mask) + text_input_attentions.append(single_text_input_attentions) + text_input_attentions_with_mask.append(single_text_input_attentions_with_mask) + text_input_position_ids.append(single_text_input_position_ids) + text_input_position_ids_with_mask.append(single_text_input_position_ids_with_mask) + + # 添加一个长度为max_length的tensor + pad_tensor = torch.ones(self.max_input_len) + text_input_tokens_ids.append(pad_tensor) + text_input_tokens_ids_with_mask.append(pad_tensor) + text_input_labels.append(pad_tensor) + text_input_labels_with_mask.append(pad_tensor) + text_input_attentions.append(pad_tensor) + text_input_attentions_with_mask.append(pad_tensor) + text_input_position_ids.append(pad_tensor) + text_input_position_ids_with_mask.append(pad_tensor) + + text_input_tokens_ids = pad_sequence(text_input_tokens_ids, batch_first = True, padding_value = self.tokenizer.pad_token_id)[:batch_size, :self.max_input_len] + text_input_tokens_ids_with_mask = pad_sequence(text_input_tokens_ids_with_mask, batch_first = True, padding_value = self.tokenizer.pad_token_id)[:batch_size, :self.max_input_len] + text_input_labels = pad_sequence(text_input_labels, batch_first = True, padding_value = -100)[:batch_size, :self.max_input_len] + text_input_labels_with_mask = pad_sequence(text_input_labels_with_mask, batch_first = True, padding_value = -100)[:batch_size, :self.max_input_len] + text_input_attentions = pad_sequence(text_input_attentions, batch_first = True, padding_value = 0)[:batch_size, :self.max_input_len] + text_input_attentions_with_mask = pad_sequence(text_input_attentions_with_mask, batch_first = True, padding_value = 0)[:batch_size, :self.max_input_len] + text_input_position_ids = pad_sequence(text_input_position_ids, batch_first = True, padding_value = 0).long()[:batch_size, :self.max_input_len] + text_input_position_ids_with_mask = pad_sequence(text_input_position_ids_with_mask, batch_first = True, padding_value = 0).long()[:batch_size, :self.max_input_len] + + outputs = {"text_input_tokens_ids": text_input_tokens_ids, + "text_input_tokens_ids_with_mask": text_input_tokens_ids_with_mask, + "text_input_labels": text_input_labels, + "text_input_labels_with_mask": text_input_labels_with_mask, + "text_input_attentions": text_input_attentions, + "text_input_attentions_with_mask": text_input_attentions_with_mask, + "text_input_position_ids": text_input_position_ids, + "text_input_position_ids_with_mask": text_input_position_ids_with_mask, + } + return outputs + + def forward(self, image, grounding_image, ram_image, caption, language): + image = image + + image_embeds = self.vision_model(image)[0] + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + tags_english, tags_chinese = self.ram.generate_tag(ram_image) + + input_tags = [tag.replace(' |', ',').lower().strip() + "." 
if not tag.endswith(".") else tag.replace(' |', ',').lower().strip() for tag in tags_english] + + # outputs = self.grounding_dino(grounding_image[None], captions=input_tags) + outputs = self.grounding_dino(grounding_image, captions=input_tags) + logits = outputs["pred_logits"].sigmoid() # (bs, nq, 256) + boxes = outputs["pred_boxes"] # (bs, nq, 4) + detection_image_embeds = outputs["hidden_state"] # (bs, nq, 256) + detection_image_atts = torch.ones(detection_image_embeds.size()[:-1], dtype=torch.long).to(image.device) + + bbox_outputs = self.generate_bbox_caption(logits, boxes, tags_english, tags_chinese, language) + + text_inputs_for_pretrain = self.prepare_inputs_for_pretrain(caption, bbox_outputs['bbox_caption_tokens'], bbox_outputs['bbox_caption_tokens_with_mask'], bbox_outputs['bbox_caption_labels_with_mask']) + + # if torch.distributed.get_rank() == 0: + # print(self.tokenizer.decode(text_inputs_for_pretrain["text_input_tokens_ids"][0])) + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_output = self.qformer.bert( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + detection_encoder_hidden_states=detection_image_embeds, + detection_encoder_attention_mask=detection_image_atts, + use_cache=True, + return_dict=True, + ) + + image_feats = F.normalize( + self.vision_proj(query_output.last_hidden_state), dim=-1 + ) + + text_output = self.qformer.bert( + input_ids=text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda'), + position_ids=text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), + attention_mask=text_inputs_for_pretrain['text_input_attentions'].to('cuda'), + return_dict=True, + ) + text_feat = F.normalize( + self.text_proj(text_output.last_hidden_state[:, 0, :]), dim=-1 + ) + + ###============== Image-text Contrastive ===================### + # print(image_feats.size()) + # print(text_feat.size()) + image_feats_all = concat_all_gather( + image_feats + ) # [batch_size*num_gpu, num_query_tokens, embed_dim] + text_feat_all = concat_all_gather(text_feat) # [batch_size*num_gpu, embed_dim] + + sim_q2t = torch.matmul( + image_feats.unsqueeze(1), text_feat_all.unsqueeze(-1) + ).squeeze() + # [batch_size, batch_size*num_gpu, num_query_tokens] + + # image-text similarity: aggregate across all query tokens + sim_i2t, _ = sim_q2t.max(-1) + sim_i2t = sim_i2t / self.temp + + # text-query similarity: [batch_size, batch_size*num_gpu, num_query_tokens] + sim_t2q = torch.matmul( + text_feat.unsqueeze(1).unsqueeze(1), image_feats_all.permute(0, 2, 1) + ).squeeze() + + # text-image similarity: aggregate across all query tokens + sim_t2i, _ = sim_t2q.max(-1) + sim_t2i = sim_t2i / self.temp # [batch_size, batch_size*num_gpu] + + rank = dist.get_rank() + bs = image.size(0) + targets = torch.linspace(rank * bs, rank * bs + bs - 1, bs, dtype=int).to( + image.device + ) + + loss_itc = ( + F.cross_entropy(sim_i2t, targets, label_smoothing=0.1) + + F.cross_entropy(sim_t2i, targets, label_smoothing=0.1) + ) / 2 + + ###============== Image-text Matching ===================### + # print(text_inputs_for_pretrain['text_input_tokens_ids'].size()) + # print(text_inputs_for_pretrain['text_input_attentions'].size()) + # print(text_inputs_for_pretrain['text_input_position_ids'].size()) + # print(image_embeds) + # print(detection_image_embeds) + text_input_ids_world = concat_all_gather(text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda')) + text_attention_mask_world = 
concat_all_gather(text_inputs_for_pretrain['text_input_attentions'].to('cuda')) + text_position_ids_world = concat_all_gather(text_inputs_for_pretrain['text_input_position_ids'].to('cuda')) + image_embeds_world = all_gather_with_grad(image_embeds) + detection_image_embeds_world = all_gather_with_grad(detection_image_embeds) + with torch.no_grad(): + weights_t2i = F.softmax(sim_t2i, dim=1) + 1e-4 + weights_t2i[:, rank * bs: rank * bs + bs].fill_diagonal_(0) + weights_i2t = F.softmax(sim_i2t, dim=1) + 1e-4 + weights_i2t[:, rank * bs: rank * bs + bs].fill_diagonal_(0) + + # select a negative image for each text + image_embeds_neg = [] + detection_image_embeds_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_t2i[b], 1).item() + image_embeds_neg.append(image_embeds_world[neg_idx]) + detection_image_embeds_neg.append(detection_image_embeds_world[neg_idx]) + image_embeds_neg = torch.stack(image_embeds_neg, dim=0) + detection_image_embeds_neg = torch.stack(detection_image_embeds_neg, dim=0) + + # select a negative text for each image + text_ids_neg = [] + text_atts_neg = [] + text_position_neg = [] + for b in range(bs): + neg_idx = torch.multinomial(weights_i2t[b], 1).item() + text_ids_neg.append(text_input_ids_world[neg_idx]) + text_atts_neg.append(text_attention_mask_world[neg_idx]) + text_position_neg.append(text_position_ids_world[neg_idx]) + + text_ids_neg = torch.stack(text_ids_neg, dim=0) + text_atts_neg = torch.stack(text_atts_neg, dim=0) + text_position_neg = torch.stack(text_position_neg, dim=0) + + text_ids_all = torch.cat( + [text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda'), text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda'), text_ids_neg], dim=0 + ) # pos, pos, neg + text_atts_all = torch.cat( + [text_inputs_for_pretrain['text_input_attentions'].to('cuda'), text_inputs_for_pretrain['text_input_attentions'].to('cuda'), text_atts_neg], + dim=0, + ) + position_ids_all = torch.cat( + [text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), text_position_neg], + dim=0, + ) + + query_tokens_itm = self.query_tokens.expand(text_ids_all.shape[0], -1, -1) + query_atts_itm = torch.ones(query_tokens_itm.size()[:-1], dtype=torch.long).to( + image.device + ) + attention_mask_all = torch.cat([query_atts_itm, text_atts_all], dim=1) + + image_embeds_all = torch.cat( + [image_embeds, image_embeds_neg, image_embeds], dim=0 + ) # pos, neg, pos + image_atts_all = torch.ones(image_embeds_all.size()[:-1], dtype=torch.long).to( + image.device + ) + + detection_image_embeds_all = torch.cat( + [detection_image_embeds, detection_image_embeds_neg, detection_image_embeds], dim=0 + ) # pos, neg, pos + detection_image_atts_all = torch.ones(detection_image_embeds_all.size()[:-1], dtype=torch.long).to( + image.device + ) + + output_itm = self.qformer.bert( + text_ids_all, + query_embeds=query_tokens_itm, + position_ids=position_ids_all, + attention_mask=attention_mask_all, + encoder_hidden_states=image_embeds_all, + encoder_attention_mask=image_atts_all, + detection_encoder_hidden_states=detection_image_embeds_all, + detection_encoder_attention_mask=detection_image_atts_all, + return_dict=True, + ) + + vl_embeddings = output_itm.last_hidden_state[:, : query_tokens_itm.size(1), :] + vl_output = self.itm_head(vl_embeddings) + logits = vl_output.mean(dim=1) + + itm_labels = torch.cat( + [torch.ones(bs, dtype=torch.long), torch.zeros(2 * bs, dtype=torch.long)], + dim=0, + ).to(image.device) + loss_itm = 
F.cross_entropy(logits, itm_labels) + + ##================= Image Captioning ========================## + decoder_input_ids = text_inputs_for_pretrain['text_input_tokens_ids'].to('cuda').clone() + decoder_input_ids[:, 0] = self.tokenizer.bos_token_id + # labels = decoder_input_ids.masked_fill( + # decoder_input_ids == self.tokenizer.pad_token_id, -100 + # ) + print('text_input_tokens_ids:', text_inputs_for_pretrain['text_input_tokens_ids'][0]) + print('text_input_tokens:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_tokens_ids'][0])) + print('text_input_labels_ids:', text_inputs_for_pretrain['text_input_labels'][0]) + print('text_input_labels:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_labels'][0].masked_fill(text_inputs_for_pretrain['text_input_labels'][0]==-100, torch.tensor(0)))) + + decoder_labels = text_inputs_for_pretrain['text_input_labels'].to('cuda') + + decoder_query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( + image.device + ) + decoder_attention_mask = torch.cat([decoder_query_atts, text_inputs_for_pretrain['text_input_attentions'].to('cuda')], dim=1) + # print('decoder_input_ids:', decoder_input_ids.size()) + # print('decoder_labels:', decoder_labels.size()) + lm_output = self.qformer( + decoder_input_ids, + position_ids=text_inputs_for_pretrain["text_input_position_ids"].to('cuda'), + attention_mask=decoder_attention_mask, + past_key_values=query_output.past_key_values, + return_dict=True, + labels=decoder_labels, + is_decoder=True, + ) + + loss_lm = lm_output.loss + + ##================= Mask Language Model ========================## + # encoder_input_ids = text_inputs_for_pretrain['text_input_tokens_ids_with_mask'].clone() + encoder_input_ids = text_inputs_for_pretrain['text_input_tokens_ids_with_mask'].to("cuda") + # encoder_input_ids[:, 0] = self.tokenizer.bos_token_id + print('text_input_tokens_ids_with_mask:', text_inputs_for_pretrain['text_input_tokens_ids_with_mask'][0]) + print('text_input_tokens_with_mask:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_tokens_ids_with_mask'][0])) + print('text_input_labels_ids_with_mask:', text_inputs_for_pretrain['text_input_labels_with_mask'][0]) + print('text_input_labels_with_mask:', self.tokenizer.decode(text_inputs_for_pretrain['text_input_labels_with_mask'][0].masked_fill(text_inputs_for_pretrain['text_input_labels_with_mask'][0]==-100, torch.tensor(0)))) + + encoder_labels = text_inputs_for_pretrain['text_input_labels_with_mask'].to("cuda") + + encoder_query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to( + image.device + ) + encoder_attention_mask = torch.cat([encoder_query_atts, text_inputs_for_pretrain['text_input_attentions_with_mask'].to("cuda")], dim=1) + mlm_output = self.qformer( + encoder_input_ids, + position_ids=text_inputs_for_pretrain["text_input_position_ids_with_mask"].to('cuda'), + attention_mask=encoder_attention_mask, + past_key_values=query_output.past_key_values, + return_dict=True, + labels=encoder_labels, + is_decoder=False, + ) + + loss_mlm = mlm_output.loss + # print(loss_itc) + # print(loss_itm) + # print(loss_lm) + # print(loss_mlm) + # print('mlm_label:', torch.sum(torch.where(text_inputs_for_pretrain['text_input_labels_with_mask']!=-100,1,0),dim=1)) + # print('lm_label:', torch.sum(torch.where(text_inputs_for_pretrain['text_input_labels']!=-100,1,0),dim=1)) + + return LyricsOutput( + loss=loss_itc + loss_itm + loss_lm + loss_mlm, + loss_itc=loss_itc, + loss_itm=loss_itm, + loss_lm=loss_lm, + loss_mlm=loss_mlm, 
+ ) + + +@torch.no_grad() +def concat_all_gather(tensor): + """ + Performs all_gather operation on the provided tensors. + *** Warning ***: torch.distributed.all_gather has no gradient. + """ + tensors_gather = [torch.ones_like(tensor) + for _ in range(torch.distributed.get_world_size())] + torch.distributed.all_gather(tensors_gather, tensor, async_op=False) + + output = torch.cat(tensors_gather, dim=0) + return output + + +class GatherLayer(torch.autograd.Function): + """ + Gather tensors from all workers with support for backward propagation: + This implementation does not cut the gradients as torch.distributed.all_gather does. + """ + + @staticmethod + def forward(ctx, x): + output = [ + torch.zeros_like(x) for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(output, x) + return tuple(output) + + @staticmethod + def backward(ctx, *grads): + all_gradients = torch.stack(grads) + torch.distributed.all_reduce(all_gradients) + return all_gradients[torch.distributed.get_rank()] + + +def all_gather_with_grad(tensors): + """ + Performs all_gather operation on the provided tensors. + Graph remains connected for backward grad computation. + """ + # Queue the gathered tensors + world_size = torch.distributed.get_world_size() + # There is no need for reduction in the single-proc case + if world_size == 1: + return tensors + + # tensor_all = GatherLayer.apply(tensors) + tensor_all = GatherLayer.apply(tensors) + + return torch.cat(tensor_all, dim=0) diff --git a/fengshen/models/Lyrics/ram/configs/med_config.json b/fengshen/models/Lyrics/ram/configs/med_config.json new file mode 100644 index 0000000..49d64f8 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/med_config.json @@ -0,0 +1,21 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30524, + "encoder_width": 768, + "add_cross_attention": true + } \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/configs/q2l_config.json b/fengshen/models/Lyrics/ram/configs/q2l_config.json new file mode 100644 index 0000000..a8eba56 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/q2l_config.json @@ -0,0 +1,22 @@ +{ + "architectures": [ + "BertModel" + ], + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "initializer_range": 0.02, + "intermediate_size": 3072, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 4, + "num_hidden_layers": 2, + "pad_token_id": 0, + "type_vocab_size": 2, + "vocab_size": 30522, + "encoder_width": 768, + "add_cross_attention": true, + "add_tag_cross_attention": false + } \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/configs/swin/config_swinB_384.json b/fengshen/models/Lyrics/ram/configs/swin/config_swinB_384.json new file mode 100644 index 0000000..d2f3e07 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/swin/config_swinB_384.json @@ -0,0 +1,9 @@ +{ + "ckpt": "pretrain_model/swin_base_patch4_window7_224_22k.pth", + "vision_width": 1024, + "image_res": 384, + "window_size": 12, + "embed_dim": 128, + "depths": [ 2, 2, 18, 2 ], + "num_heads": [ 4, 8, 16, 32 ] + } \ No 
newline at end of file diff --git a/fengshen/models/Lyrics/ram/configs/swin/config_swinL_384.json b/fengshen/models/Lyrics/ram/configs/swin/config_swinL_384.json new file mode 100644 index 0000000..e6443a2 --- /dev/null +++ b/fengshen/models/Lyrics/ram/configs/swin/config_swinL_384.json @@ -0,0 +1,9 @@ +{ + "ckpt": "pretrain_model/swin_large_patch4_window12_384_22k.pth", + "vision_width": 1536, + "image_res": 384, + "window_size": 12, + "embed_dim": 192, + "depths": [ 2, 2, 18, 2 ], + "num_heads": [ 6, 12, 24, 48 ] + } \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/data/ram_tag_list.txt b/fengshen/models/Lyrics/ram/data/ram_tag_list.txt new file mode 100644 index 0000000..49c840b --- /dev/null +++ b/fengshen/models/Lyrics/ram/data/ram_tag_list.txt @@ -0,0 +1,4585 @@ +3D CG rendering +3D glasses +abacus +abalone +monastery +belly +academy +accessory +accident +accordion +acorn +acrylic paint +act +action +action film +activity +actor +adaptation +add +adhesive tape +adjust +adult +adventure +advertisement +antenna +aerobics +spray can +afro +agriculture +aid +air conditioner +air conditioning +air sock +aircraft cabin +aircraft model +air field +air line +airliner +airman +plane +airplane window +airport +airport runway +airport terminal +airship +airshow +aisle +alarm +alarm clock +mollymawk +album +album cover +alcohol +alcove +algae +alley +almond +aloe vera +alp +alpaca +alphabet +german shepherd +altar +amber +ambulance +bald eagle +American shorthair +amethyst +amphitheater +amplifier +amusement park +amusement ride +anchor +ancient +anemone +angel +angle +animal +animal sculpture +animal shelter +animation +animation film +animator +anime +ankle +anklet +anniversary +trench coat +ant +antelope +antique +antler +anvil +apartment +ape +app +app icon +appear +appearance +appetizer +applause +apple +apple juice +apple pie +apple tree +applesauce +appliance +appointment +approach +apricot +apron +aqua +aquarium +aquarium fish +aqueduct +arcade +arcade machine +arch +arch bridge +archaelogical excavation +archery +archipelago +architect +architecture +archive +archway +area +arena +argument +arm +armadillo +armband +armchair +armoire +armor +army +army base +army tank +array +arrest +arrow +art +art exhibition +art gallery +art print +art school +art studio +art vector illustration +artichoke +article +artifact +artist +artists loft +ash +ashtray +asia temple +asparagus +asphalt road +assemble +assembly +assembly line +association +astronaut +astronomer +athlete +athletic +atlas +atm +atmosphere +atrium +attach +fighter jet +attend +attraction +atv +eggplant +auction +audi +audio +auditorium +aurora +author +auto factory +auto mechanic +auto part +auto show +auto showroom +car battery +automobile make +automobile model +motor vehicle +autumn +autumn forest +autumn leave +autumn park +autumn tree +avatar +avenue +aviator sunglasses +avocado +award +award ceremony +award winner +shed +ax +azalea +baboon +baby +baby bottle +baby carriage +baby clothe +baby elephant +baby food +baby seat +baby shower +back +backdrop +backlight +backpack +backyard +bacon +badge +badger +badlands +badminton +badminton racket +bag +bagel +bagpipe +baguette +bait +baked goods +baker +bakery +baking +baking sheet +balance +balance car +balcony +ball +ball pit +ballerina +ballet +ballet dancer +ballet skirt +balloon +balloon arch +baseball player +ballroom +bamboo +bamboo forest +banana +banana bread +banana leaf +banana tree +band +band aid +bandage +headscarf +bandeau +bangs +bracelet 
+balustrade +banjo +bank +bank card +bank vault +banknote +banner +banquet +banquet hall +banyan tree +baozi +baptism +bar +bar code +bar stool +barbecue +barbecue grill +barbell +barber +barber shop +barbie +barge +barista +bark +barley +barn +barn owl +barn door +barrel +barricade +barrier +handcart +bartender +baseball +baseball base +baseball bat +baseball hat +baseball stadium +baseball game +baseball glove +baseball pitcher +baseball team +baseball uniform +basement +basil +basin +basket +basket container +basketball +basketball backboard +basketball coach +basketball court +basketball game +basketball hoop +basketball player +basketball stadium +basketball team +bass +bass guitar +bass horn +bassist +bat +bath +bath heater +bath mat +bath towel +swimwear +bathrobe +bathroom +bathroom accessory +bathroom cabinet +bathroom door +bathroom mirror +bathroom sink +toilet paper +bathroom window +batman +wand +batter +battery +battle +battle rope +battleship +bay +bay bridge +bay window +bayberry +bazaar +beach +beach ball +beach chair +beach house +beach hut +beach towel +beach volleyball +lighthouse +bead +beagle +beak +beaker +beam +bean +bean bag chair +beanbag +bear +bear cub +beard +beast +beat +beautiful +beauty +beauty salon +beaver +bed +bedcover +bed frame +bedroom +bedding +bedpan +bedroom window +bedside lamp +bee +beech tree +beef +beekeeper +beeper +beer +beer bottle +beer can +beer garden +beer glass +beer hall +beet +beetle +beige +clock +bell pepper +bell tower +belt +belt buckle +bench +bend +bengal tiger +bento +beret +berry +berth +beverage +bib +bibimbap +bible +bichon +bicycle +bicycle helmet +bicycle wheel +biker +bidet +big ben +bike lane +bike path +bike racing +bike ride +bikini +bikini top +bill +billard +billboard +billiard table +bin +binder +binocular +biology laboratory +biplane +birch +birch tree +bird +bird bath +bird feeder +bird house +bird nest +birdbath +bird cage +birth +birthday +birthday cake +birthday candle +birthday card +birthday party +biscuit +bishop +bison +bit +bite +black +black sheep +blackberry +blackbird +blackboard +blacksmith +blade +blanket +sports coat +bleacher +blender +blessing +blind +eye mask +flasher +snowstorm +block +blog +blood +bloom +blossom +blouse +blow +hair drier +blowfish +blue +blue artist +blue jay +blue sky +blueberry +bluebird +pig +board +board eraser +board game +boardwalk +boat +boat deck +boat house +paddle +boat ride +bobfloat +bobcat +body +bodyboard +bodybuilder +boiled egg +boiler +bolo tie +bolt +bomb +bomber +bonasa umbellu +bone +bonfire +bonnet +bonsai +book +book cover +bookcase +folder +bookmark +bookshelf +bookstore +boom microphone +boost +boot +border +Border collie +botanical garden +bottle +bottle cap +bottle opener +bottle screw +bougainvillea +boulder +bouquet +boutique +boutique hotel +bow +bow tie +bow window +bowl +bowling +bowling alley +bowling ball +bowling equipment +box +box girder bridge +box turtle +boxer +underdrawers +boxing +boxing glove +boxing ring +boy +brace +bracket +braid +brain +brake +brake light +branch +brand +brandy +brass +brass plaque +bread +breadbox +break +breakfast +seawall +chest +brewery +brick +brick building +wall +brickwork +wedding dress +bride +groom +bridesmaid +bridge +bridle +briefcase +bright +brim +broach +broadcasting +broccoli +bronze +bronze medal +bronze sculpture +bronze statue +brooch +creek +broom +broth +brown +brown bear +brownie +brunch +brunette +brush +coyote +brussels sprout +bubble +bubble gum +bubble tea +bucket cabinet +shield +bud 
+buddha +buffalo +buffet +bug +build +builder +building +building block +building facade +building material +lamp +bull +bulldog +bullet +bullet train +bulletin board +bulletproof vest +bullfighting +megaphone +bullring +bumblebee +bumper +roll +bundle +bungee +bunk bed +bunker +bunny +buoy +bureau +burial chamber +burn +burrito +bus +bus driver +bus interior +bus station +bus stop +bus window +bush +business +business card +business executive +business suit +business team +business woman +businessman +bust +butcher +butchers shop +butte +butter +cream +butterfly +butterfly house +button +buttonwood +buy +taxi +cabana +cabbage +cabin +cabin car +cabinet +cabinetry +cable +cable car +cactus +cafe +canteen +cage +cake +cake stand +calculator +caldron +calendar +calf +call +phone box +calligraphy +calm +camcorder +camel +camera +camera lens +camouflage +camp +camper +campfire +camping +campsite +campus +can +can opener +canal +canary +cancer +candle +candle holder +candy +candy bar +candy cane +candy store +cane +jar +cannon +canopy +canopy bed +cantaloupe +cantilever bridge +canvas +canyon +cap +cape +cape cod +cappuccino +capsule +captain +capture +car +car dealership +car door +car interior +car logo +car mirror +parking lot +car seat +car show +car wash +car window +caramel +card +card game +cardboard +cardboard box +cardigan +cardinal +cargo +cargo aircraft +cargo ship +caribbean +carnation +carnival +carnivore +carousel +carp +carpenter +carpet +slipper +house finch +coach +dalmatian +aircraft carrier +carrot +carrot cake +carry +cart +carton +cartoon +cartoon character +cartoon illustration +cartoon style +carve +case +cash +cashew +casino +casserole +cassette +cassette deck +plaster bandage +casting +castle +cat +cat bed +cat food +cat furniture +cat tree +catacomb +catamaran +catamount +catch +catcher +caterpillar +catfish +cathedral +cattle +catwalk +catwalk show +cauliflower +cave +caviar +CD +CD player +cedar +ceiling +ceiling fan +celebrate +celebration +celebrity +celery +cello +smartphone +cement +graveyard +centerpiece +centipede +ceramic +ceramic tile +cereal +ceremony +certificate +chain +chain saw +chair +chairlift +daybed +chalet +chalice +chalk +chamber +chameleon +champagne +champagne flute +champion +championship +chandelier +changing table +channel +chap +chapel +character sculpture +charcoal +charge +charger +chariot +charity +charity event +charm +graph +chase +chassis +check +checkbook +chessboard +checklist +cheer +cheerlead +cheese +cheeseburger +cheesecake +cheetah +chef +chemical compound +chemist +chemistry +chemistry lab +cheongsam +cherry +cherry blossom +cherry tomato +cherry tree +chess +chestnut +chicken +chicken breast +chicken coop +chicken salad +chicken wing +garbanzo +chiffonier +chihuahua +child +child actor +childs room +chile +chili dog +chimney +chimpanzee +chinaware +chinese cabbage +chinese garden +chinese knot +chinese rose +chinese tower +chip +chipmunk +chisel +chocolate +chocolate bar +chocolate cake +chocolate chip +chocolate chip cookie +chocolate milk +chocolate mousse +truffle +choir +kitchen knife +cutting board +chopstick +christmas +christmas ball +christmas card +christmas decoration +christmas dinner +christmas eve +christmas hat +christmas light +christmas market +christmas ornament +christmas tree +chrysanthemum +church +church tower +cider +cigar +cigar box +cigarette +cigarette case +waistband +cinema +photographer +cinnamon +circle +circuit +circuit board +circus +water tank +citrus fruit +city +city bus +city hall +city 
nightview +city park +city skyline +city square +city street +city wall +city view +clam +clarinet +clasp +class +classic +classroom +clavicle +claw +clay +pottery +clean +clean room +cleaner +cleaning product +clear +cleat +clementine +client +cliff +climb +climb mountain +climber +clinic +clip +clip art +clipboard +clipper +clivia +cloak +clogs +close-up +closet +cloth +clothe +clothing +clothespin +clothesline +clothing store +cloud +cloud forest +cloudy +clover +joker +clown fish +club +clutch +clutch bag +coal +coast +coat +coatrack +cob +cock +cockatoo +cocker +cockpit +roach +cocktail +cocktail dress +cocktail shaker +cocktail table +cocoa +coconut +coconut tree +coffee +coffee bean +coffee cup +coffee machine +coffee shop +coffeepot +coffin +cognac +spiral +coin +coke +colander +cold +slaw +collaboration +collage +collection +college student +sheepdog +crash +color +coloring book +coloring material +pony +pillar +comb +combination lock +comic +comedy +comedy film +comet +comfort +comfort food +comic book +comic book character +comic strip +commander +commentator +community +commuter +company +compass +compete +contest +competitor +composer +composition +compost +computer +computer box +computer chair +computer desk +keyboard +computer monitor +computer room +computer screen +computer tower +concept car +concert +concert hall +conch +concrete +condiment +condom +condominium +conductor +cone +meeting +conference center +conference hall +meeting room +confetti +conflict +confluence +connect +connector +conservatory +constellation +construction site +construction worker +contain +container +container ship +continent +profile +contract +control +control tower +convenience store +convention +conversation +converter +convertible +transporter +cook +cooking +cooking spray +cooker +cool +cooler +copper +copy +coral +coral reef +rope +corded phone +liquor +corgi +cork +corkboard +cormorant +corn +corn field +cornbread +corner +trumpet +cornice +cornmeal +corral +corridor +corset +cosmetic +cosmetics brush +cosmetics mirror +cosplay +costume +costumer film designer +infant bed +cottage +cotton +cotton candy +couch +countdown +counter +counter top +country artist +country house +country lane +country pop artist +countryside +coupe +couple +couple photo +courgette +course +court +courthouse +courtyard +cousin +coverall +cow +cowbell +cowboy +cowboy boot +cowboy hat +crab +crabmeat +crack +cradle +craft +craftsman +cranberry +crane +crape +crapper +crate +crater lake +lobster +crayon +cream cheese +cream pitcher +create +creature +credit card +crescent +croissant +crest +crew +cricket +cricket ball +cricket team +cricketer +crochet +crock pot +crocodile +crop +crop top +cross +crossbar +crossroad +crosstalk +crosswalk +crouton +crow +crowbar +crowd +crowded +crown +crt screen +crucifix +cruise +cruise ship +cruiser +crumb +crush +crutch +crystal +cub +cube +cucumber +cue +cuff +cufflink +cuisine +farmland +cup +cupcake +cupid +curb +curl +hair roller +currant +currency +curry +curtain +curve +pad +customer +cut +cutlery +cycle +cycling +cyclone +cylinder +cymbal +cypress +cypress tree +dachshund +daffodil +dagger +dahlia +daikon +dairy +daisy +dam +damage +damp +dance +dance floor +dance room +dancer +dandelion +dark +darkness +dart +dartboard +dashboard +date +daughter +dawn +day bed +daylight +deadbolt +death +debate +debris +decanter +deck +decker bus +decor +decorate +decorative picture +deer +defender +deity +delicatessen +deliver +demolition +monster +demonstration +den +denim jacket 
+dentist +department store +depression +derby +dermopathy +desert +desert road +design +designer +table +table lamp +desktop +desktop computer +dessert +destruction +detective +detergent +dew +dial +diamond +diaper +diaper bag +journal +die +diet +excavator +number +digital clock +dill +dinner +rowboat +dining room +dinner party +dinning table +dinosaur +dip +diploma +direct +director +dirt +dirt bike +dirt field +dirt road +dirt track +disaster +disciple +disco +disco ball +discotheque +disease +plate +dish antenna +dish washer +dishrag +dishes +dishsoap +Disneyland +dispenser +display +display window +trench +dive +diver +diving board +paper cup +dj +doberman +dock +doctor +document +documentary +dog +dog bed +dog breed +dog collar +dog food +dog house +doll +dollar +dollhouse +dolly +dolphin +dome +domicile +domino +donkey +donut +doodle +door +door handle +doormat +doorplate +doorway +dormitory +dough +downtown +dozer +drag +dragon +dragonfly +drain +drama +drama film +draw +drawer +drawing +drawing pin +pigtail +dress +dress hat +dress shirt +dress shoe +dress suit +dresser +dressing room +dribble +drift +driftwood +drill +drink +drinking water +drive +driver +driveway +drone +drop +droplight +dropper +drought +medicine +pharmacy +drum +drummer +drumstick +dry +duchess +duck +duckbill +duckling +duct tape +dude +duet +duffel +canoe +dumbbell +dumpling +dune +dunk +durian +dusk +dust +garbage truck +dustpan +duvet +DVD +dye +eagle +ear +earmuff +earphone +earplug +earring +earthquake +easel +easter +easter bunny +easter egg +eat +restaurant +eclair +eclipse +ecosystem +edit +education +educator +eel +egg +egg roll +egg tart +eggbeater +egret +Eiffel tower +elastic band +senior +electric chair +electric drill +electrician +electricity +electron +electronic +elephant +elevation map +elevator +elevator car +elevator door +elevator lobby +elevator shaft +embankment +embassy +embellishment +ember +emblem +embroidery +emerald +emergency +emergency service +emergency vehicle +emotion +Empire State Building +enamel +enclosure +side table +energy +engagement +engagement ring +engine +engine room +engineer +engineering +english shorthair +ensemble +enter +entertainer +entertainment +entertainment center +entrance +entrance hall +envelope +equestrian +equipment +eraser +erhu +erosion +escalator +escargot +espresso +estate +estuary +eucalyptus tree +evening +evening dress +evening light +evening sky +evening sun +event +evergreen +ewe +excavation +exercise +exhaust hood +exhibition +exit +explorer +explosion +extension cord +extinguisher +extractor +extrude +eye +eye shadow +eyebrow +eyeliner +fabric +fabric store +facade +face +face close-up +face powder +face towel +facial tissue holder +facility +factory +factory workshop +fair +fairground +fairy +falcon +fall +family +family car +family photo +family room +fan +fang +farm +farmer +farmer market +farmhouse +fashion +fashion accessory +fashion designer +fashion girl +fashion illustration +fashion look +fashion model +fashion show +fast food +fastfood restaurant +father +faucet +fault +fauna +fawn +fax +feast +feather +fedora +feed +feedbag +feeding +feeding chair +feline +mountain lion +fence +fender +fern +ferret +ferris wheel +ferry +fertilizer +festival +fiber +fiction +fiction book +field +field road +fig +fight +figure skater +figurine +file +file photo +file cabinet +fill +film camera +film director +film format +film premiere +film producer +filming +filter +fin +hand +finish line +fir +fir tree +fire +fire alarm +fire department +fire 
truck +fire escape +fire hose +fire pit +fire station +firecracker +fireman +fireplace +firework +firework display +first-aid kit +fish +fish boat +fish market +fish pond +fishbowl +fisherman +fishing +fishing boat +fishing net +fishing pole +fishing village +fitness +fitness course +five +fixture +fjord +flag +flag pole +flake +flame +flamingo +flannel +flap +flare +flash +flask +flat +flatfish +flavor +flea +flea market +fleet +flight +flight attendant +flip +flip-flop +flipchart +float +flock +flood +floor +floor fan +floor mat +floor plan +floor window +floral arrangement +florist +floss +flour +flow +flower +flower basket +flower bed +flower box +flower field +flower girl +flower market +fluid +flush +flute +fly +fly fishing +flyer +horse +foam +fog +foggy +foie gra +foil +folding chair +leaf +folk artist +folk dance +folk rock artist +fondant +hotpot +font +food +food coloring +food court +food processor +food stand +food truck +foosball +foot +foot bridge +football +football coach +football college game +football match +football field +football game +football helmet +football player +football stadium +football team +path +footprint +footrest +footstall +footwear +forbidden city +ford +forehead +forest +forest fire +forest floor +forest path +forest road +forge +fork +forklift +form +formal garden +formation +formula 1 +fort +fortification +forward +fossil +foundation +fountain +fountain pen +fox +frame +freckle +highway +lorry +French +French bulldog +French fries +French toast +freshener +fridge +fried chicken +fried egg +fried rice +friendship +frisbee +frog +frost +frosting +frosty +frozen +fruit +fruit cake +fruit dish +fruit market +fruit salad +fruit stand +fruit tree +fruits shop +fry +frying pan +fudge +fuel +fume hood +fun +funeral +fungi +funnel +fur +fur coat +furniture +futon +gadget +muzzle +galaxy +gallery +game +game board +game controller +ham +gang +garage +garage door +garage kit +garbage +garden +garden asparagus +garden hose +garden spider +gardener +gardening +garfield +gargoyle +wreath +garlic +garment +gas +gas station +gas stove +gasmask +collect +gathering +gauge +gazebo +gear +gecko +geisha +gel +general store +generator +geranium +ghost +gift +gift bag +gift basket +gift box +gift card +gift shop +gift wrap +gig +gin +ginger +gingerbread +gingerbread house +ginkgo tree +giraffe +girl +give +glacier +gladiator +glass bead +glass bottle +glass bowl +glass box +glass building +glass door +glass floor +glass house +glass jar +glass plate +glass table +glass vase +glass wall +glass window +glasses +glaze +glider +earth +glove +glow +glue pudding +go +go for +goal +goalkeeper +goat +goat cheese +gobi +goggles +gold +gold medal +Golden Gate Bridge +golden retriever +goldfish +golf +golf cap +golf cart +golf club +golf course +golfer +goose +gorilla +gothic +gourd +government +government agency +gown +graduate +graduation +grain +grampus +grand prix +grandfather +grandmother +grandparent +granite +granola +grape +grapefruit +wine +grass +grasshopper +grassland +grassy +grater +grave +gravel +gravestone +gravy +gravy boat +gray +graze +grazing +green +greenery +greet +greeting +greeting card +greyhound +grid +griddle +grill +grille +grilled eel +grind +grinder +grits +grocery bag +grotto +ground squirrel +group +group photo +grove +grow +guacamole +guard +guard dog +guest house +guest room +guide +guinea pig +guitar +guitarist +gulf +gull +gun +gundam +gurdwara +guzheng +gym +gymnast +habitat +hacker +hail +hair +hair color +hair spray +hairbrush +haircut +hairgrip 
+hairnet +hairpin +hairstyle +half +hall +halloween +halloween costume +halloween pumpkin +halter top +hamburg +hamburger +hami melon +hammer +hammock +hamper +hamster +hand dryer +hand glass +hand towel +handbag +handball +handcuff +handgun +handkerchief +handle +handsaw +handshake +handstand +handwriting +hanfu +hang +hangar +hanger +happiness +harbor +harbor seal +hard rock artist +hardback book +safety helmet +hardware +hardware store +hardwood +hardwood floor +mouth organ +pipe organ +harpsichord +harvest +harvester +hassock +hat +hatbox +hautboy +hawthorn +hay +hayfield +hazelnut +head +head coach +headlight +headboard +headdress +headland +headquarter +hearing +heart +heart shape +heat +heater +heather +hedge +hedgehog +heel +helicopter +heliport +helmet +help +hen +henna +herb +herd +hermit crab +hero +heron +hibiscus +hibiscus flower +hide +high bar +high heel +highland +highlight +hike +hiker +hiking boot +hiking equipment +hill +hill country +hill station +hillside +hindu temple +hinge +hip +hip hop artist +hippo +historian +historic +history +hockey +hockey arena +hockey game +hockey player +hockey stick +hoe +hole +vacation +holly +holothurian +home +home appliance +home base +home decor +home interior +home office +home theater +homework +hummus +honey +beehive +honeymoon +hood +hoodie +hook +jump +horizon +hornbill +horned cow +hornet +horror +horror film +horse blanket +horse cart +horse farm +horse ride +horseback +horseshoe +hose +hospital +hospital bed +hospital room +host +inn +hot +hot air balloon +hot dog +hot sauce +hot spring +hotel +hotel lobby +hotel room +hotplate +hourglass +house +house exterior +houseplant +hoverboard +howler +huddle +hug +hula hoop +person +humidifier +hummingbird +humpback whale +hunt +hunting lodge +hurdle +hurricane +husky +hut +hyaena +hybrid +hydrangea +hydrant +seaplane +ice +ice bag +polar bear +ice cave +icecream +ice cream cone +ice cream parlor +ice cube +ice floe +ice hockey player +ice hockey team +lollipop +ice maker +rink +ice sculpture +ice shelf +skate +ice skating +iceberg +icicle +icing +icon +id photo +identity card +igloo +light +iguana +illuminate +illustration +image +impala +incense +independence day +individual +indoor +indoor rower +induction cooker +industrial area +industry +infantry +inflatable boat +information desk +infrastructure +ingredient +inhalator +injection +injury +ink +inking pad +inlet +inscription +insect +install +instrument +insulated cup +interaction +interior design +website +intersection +interview +invertebrate +invitation +ipad +iphone +ipod +iris +iron +ironing board +irrigation system +island +islet +isopod +ivory +ivy +izakaya +jack +jackcrab +jacket +jacuzzi +jade +jaguar +jail cell +jam +japanese garden +jasmine +jaw +jay +jazz +jazz artist +jazz fusion artist +jeans +jeep +jelly +jelly bean +jellyfish +jet +motorboat +jewel +jewellery +jewelry shop +jigsaw puzzle +rickshaw +jockey +jockey cap +jog +joint +journalist +joystick +judge +jug +juggle +juice +juicer +jujube +jump rope +jumpsuit +jungle +junkyard +kale +kaleidoscope +kangaroo +karaoke +karate +karting +kasbah +kayak +kebab +key +keycard +khaki +kick +kilt +kimono +kindergarden classroom +kindergarten +king +king crab +kiss +kit +kitchen +kitchen cabinet +kitchen counter +kitchen floor +kitchen hood +kitchen island +kitchen sink +kitchen table +kitchen utensil +kitchen window +kitchenware +kite +kiwi +knee pad +kneel +knife +rider +knit +knitting needle +knob +knocker +knot +koala +koi +ktv +laboratory +lab coat +label +labrador 
+maze +lace +lace dress +ladder +ladle +ladybird +lagoon +lake +lake district +lake house +lakeshore +lamb +lamb chop +lamp post +lamp shade +spear +land +land vehicle +landfill +landing +landing deck +landmark +landscape +landslide +lanyard +lantern +lap +laptop +laptop keyboard +larva +lasagne +laser +lash +lasso +latch +latex +latte +laugh +launch +launch event +launch party +laundromat +laundry +laundry basket +laundry room +lava +lavender +lawn +lawn wedding +lawyer +lay +lead +lead singer +lead to +leader +leak +lean +learn +leash +leather +leather jacket +leather shoe +speech +lecture hall +lecture room +ledge +leftover +leg +legend +legging +legislative chamber +lego +legume +lemon +lemon juice +lemonade +lemur +lens +lens flare +lentil +leopard +leotard +tights +leprechaun +lesson +letter +mailbox +letter logo +lettering +lettuce +level +library +license +license plate +lichen +lick +lid +lie +life belt +life jacket +lifeboat +lifeguard +lift +light fixture +light show +light switch +lighting +lightning +lightning rod +lilac +lily +limb +lime +limestone +limo +line +line art +line up +linen +liner +lion +lip balm +lipstick +liquid +liquor store +list +litchi +live +livestock +living room +living space +lizard +load +loading dock +loafer +hallway +locate +lock +lock chamber +locker +loft +log +log cabin +logo +loki +long hair +longboard +loom +loop +lose +lottery +lotus +love +loveseat +luggage +lumber +lumberjack +lunch +lunch box +lush +luxury +luxury yacht +mac +macadamia +macaque +macaroni +macaw +machete +machine +machine gun +magazine +magic +magician +magnet +magnifying glass +magnolia +magpie +mahjong +mahout +maid +chain mail +mail slot +make +makeover +makeup artist +makeup tool +mallard +mallard duck +mallet +mammal +mammoth +man +management +manager +manatee +mandala +mandarin orange +mandarine +mane +manga +manger +mango +mangosteen +mangrove +manhattan +manhole +manhole cover +manicure +mannequin +manor house +mansion +mantid +mantle +manufactured home +manufacturing +manuscript +map +maple +maple leaf +maple syrup +maraca +marathon +marble +march +marching band +mare +marigold +marine +marine invertebrate +marine mammal +puppet +mark +market +market square +market stall +marriage +martial +martial artist +martial arts gym +martini +martini glass +mascara +mascot +mashed potato +masher +mask +massage +mast +mat +matador +match +matchbox +material +mattress +mausoleum +maxi dress +meal +measuring cup +measuring tape +meat +meatball +mechanic +mechanical fan +medal +media +medical equipment +medical image +medical staff +medicine cabinet +medieval +medina +meditation +meerkat +meet +melon +monument +menu +mermaid +net +mess +messenger bag +metal +metal artist +metal detector +meter +mezzanine +microphone +microscope +microwave +midnight +milestone +military uniform +milk +milk can +milk tea +milkshake +mill +mine +miner +mineral +mineral water +miniskirt +miniature +minibus +minister +minivan +mint +mint candy +mirror +miss +missile +mission +mistletoe +mix +mixer +mixing bowl +mixture +moat +mobility scooter +model +model car +modern +modern tower +moisture +mold +molding +mole +monarch +money +monitor +monk +monkey +monkey wrench +monochrome +monocycle +monster truck +moon +moon cake +moonlight +moor +moose +swab +moped +morning +morning fog +morning light +morning sun +mortar +mosaic +mosque +mosquito +moss +motel +moth +mother +motherboard +motif +sport +motor +motorbike +motorcycle +motorcycle helmet +motorcycle racer +motorcyclist +motorsport +mound +mountain 
+mountain bike +mountain biker +mountain biking +mountain gorilla +mountain lake +mountain landscape +mountain pass +mountain path +mountain range +mountain river +mountain snowy +mountain stream +mountain view +mountain village +mountaineer +mountaineering bag +mouse +mousepad +mousetrap +mouth +mouthwash +move +movie poster +movie ticket +mower +mp3 player +mr +mud +muffin +mug +mulberry +mulch +mule +municipality +mural +muscle +muscle car +museum +mushroom +music +music festival +music stool +music studio +music video performer +musical keyboard +musician +mussel +mustard +mythology +nacho +nail polish +nailfile +nanny +napkin +narrow +national flag +nativity scene +natural history museum +nature +nature reserve +navigation +navratri +navy +nebula +neck +neckband +necklace +neckline +nectar +nectarine +needle +neighbor +neighbourhood +neon +neon light +nerve +nest +new year +newborn +newfoundland +newlywed +news +news conference +newsstand +night +night market +night sky +night view +nightclub +nightstand +noodle +nose +noseband +note +notebook +notepad +notepaper +notice +number icon +nun +nurse +nursery +nursing home +nut +nutcracker +oak +oak tree +oar +oasis +oast house +oatmeal +oats +obelisk +observation tower +observatory +obstacle course +sea +octopus +offer +office +office building +office chair +office cubicle +office desk +office supply +office window +officer +official +oil +oil lamp +oil painting +oilrig +okra +old photo +olive +olive oil +olive tree +omelet +onion +onion ring +opal +open +opening +opening ceremony +opera +opera house +operate +operating room +operation +optical shop +orangutan +orange +orange juice +orange tree +orangery +orbit +orchard +orchestra pit +orchid +order +organization +origami +ornament +osprey +ostrich +otter +out +outcrop +outdoor +outhouse +electric outlet +outline +oval +oven +overall +overcoat +overpass +owl +oyster +teething ring +pack +package +paddock +police van +padlock +paella +pagoda +pain +paint brush +painter +paisley bandanna +palace +palette +paling +pall +palm tree +pan +pancake +panda +panel +panorama +pansy +pant +pantry +pants +pantyhose +papaya +paper +paper bag +paper cutter +paper lantern +paper plate +paper towel +paperback book +paperweight +parachute +parade +paradise +parrot +paramedic +paraquet +parasail +paratrooper +parchment +parish +park +park bench +parking +parking garage +parking meter +parking sign +parliament +parsley +participant +partner +partridge +party +party hat +pass +passage +passbook +passenger +passenger ship +passenger train +passion fruit +passport +pasta +paste +pastry +pasture +patch +patient +pattern +pavement +pavilion +paw +pay +payphone +pea +peace +peach +peacock +peak +peanut +peanut butter +pear +pearl +pebble +pecan +pedestrian +pedestrian bridge +pedestrian street +peel +peeler +pegboard +pegleg +pelican +pen +penalty kick +pencil +pencil case +pencil sharpener +pencil skirt +pendant +pendulum +penguin +peninsula +pennant +penny +piggy bank +peony +pepper +pepper grinder +peppercorn +pepperoni +perch +perform +performance +performance arena +perfume +pergola +persian cat +persimmon +personal care +personal flotation device +pest +pet +pet shop +pet store +petal +petunia +church bench +pheasant +phenomenon +philosopher +phone +phonebook +record player +photo +photo booth +photo frame +photography +physicist +physics laboratory +pianist +piano +plectrum +pick up +pickle +picnic +picnic area +picnic basket +picnic table +picture +picture frame +pie +pigeon +pilgrim +tablet +pillow +pilot 
+pilot boat +pin +pine +pine cone +pine forest +pine nut +pineapple +table tennis table +table tennis +pink +pint +pipa +pipe +pipe bowl +pirate +pirate flag +pirate ship +pistachio +ski slope +pocket bread +pitaya +pitbull +pitch +pitcher +pitcher plant +pitchfork +pizza +pizza cutter +pizza pan +pizzeria +placard +place +place mat +plaid +plain +plan +planet +planet earth +plank +plant +plantation +planting +plaque +plaster +plastic +plasticine +plateau +platform +platinum +platter +play +play badminton +play baseball +play basketball +play billiard +play football +play pong +play tennis +play volleyball +player +playground +playhouse +playing card +playing chess +playing golf +playing mahjong +playingfield +playpen +playroom +plaza +plier +plot +plow +plug +plug hat +plum +plumber +plumbing fixture +plume +plywood +pocket +pocket watch +pocketknife +pod +podium +poetry +poinsettia +point +pointer +poker card +poker chip +poker table +pole +polecat +police +police car +police dog +police station +politician +polka dot +pollen +pollution +polo +polo neck +polo shirt +pomegranate +pomeranian +poncho +pond +ponytail +poodle +pool +pop +pop artist +popcorn +pope +poppy +porcelain +porch +pork +porridge +portable battery +portal +portfolio +porthole +portrait +portrait session +pose +possum +post +post office +stamp +postcard +poster +poster page +pot +potato +potato chip +potato salad +potholder +potty +pouch +poultry +pound +pour +powder +power line +power plugs and sockets +power see +power station +practice +Prague Castle +prayer +preacher +premiere +prescription +show +presentation +president +press room +pressure cooker +pretzel +prince +princess +print +printed page +printer +printing +prison +produce +product +profession +professional +professor +project picture +projection screen +projector +prom +promenade +propeller +prophet +proposal +protective suit +protest +protester +publication +publicity portrait +ice hockey +pudding +puddle +puff +puffin +pug +pull +pulpit +pulse +pump +pumpkin +pumpkin pie +pumpkin seed +punch bag +punch +student +purple +push +putt +puzzle +tower +pyramid +python +qr code +quail +quarry +quarter +quartz +queen +quesadilla +queue +quiche +quilt +quilting +quote +rabbit +raccoon +race +race track +raceway +race car +racket +radar +radiator +radio +raft +rag doll +rail +railcar +railroad +railroad bridge +railway line +railway station +rain +rain boot +rainbow +rainbow trout +raincoat +rainforest +rainy +raisin +rake +ram +ramp +rapeseed +rapid +rapper +raspberry +rat +ratchet +raven +ravine +ray +razor +razor blade +read +reading +reamer +rear +rear light +rear view +rearview mirror +receipt +receive +reception +recipe +record +record producer +recorder +recording studio +recreation room +recreational vehicle +rectangle +recycling +recycling bin +red +red carpet +red flag +red panda +red wine +redwood +reed +reef +reel +referee +reflect +reflection +reflector +register +rein +reindeer +relax +release +relief +religion +religious +relish +remain +remodel +remote +remove +repair +repair shop +reptile +rescue +rescuer +research +researcher +reservoir +residence +residential neighborhood +resin +resort +resort town +restaurant kitchen +restaurant patio +restroom +retail +retriever +retro +reveal +rhinoceros +rhododendron +rib +ribbon +rice +rice cooker +rice field +ride +ridge +riding +rifle +rim +ring +riot +ripple +rise +rise building +river +river bank +river boat +river valley +riverbed +road +road sign +road trip +roadside +roast chicken +robe +robin 
+robot +stone +rock arch +rock artist +rock band +rock climber +rock climbing +rock concert +rock face +rock formation +rocker +rocket +rocking chair +rocky +rodent +rodeo +rodeo arena +roe +roe deer +roller +coaster +roller skate +roller skates +rolling pin +romance +romantic +roof +roof garden +room +room divider +root +root beer +rope bridge +rosary +rose +rosemary +rosy cloud +rottweiler +round table +router +row +rowan +royal +rubber stamp +rubble +rubik's cube +ruby +ruffle +rugby +rugby ball +rugby player +ruins +ruler +rum +run +runner +running shoe +rural +rust +rustic +rye +sack +saddle +saddlebag +safari +safe +safety vest +sage +sail +sailboat +sailing +sailor +squirrel monkey +sake +salad +salad bowl +salamander +salami +sale +salmon +salon +salsa +salt +salt and pepper shakers +salt lake +salt marsh +salt shaker +salute +samoyed +samurai +sand +sand bar +sand box +sand castle +sand sculpture +sandal +sandwich +sanitary napkin +santa claus +sapphire +sardine +sari +sashimi +satay +satchel +satellite +satin +sauce +saucer +sauna +sausage +savanna +saw +sawbuck +sax +saxophonist +scaffold +scale +scale model +scallop +scar +strawman +scarf +scene +scenery +schnauzer +school +school bus +school uniform +schoolhouse +schooner +science +science fiction film +science museum +scientist +scissors +wall lamp +scone +scoop +scooter +score +scoreboard +scorpion +scout +scrambled egg +scrap +scraper +scratch +screen +screen door +screenshot +screw +screwdriver +scroll +scrub +scrubbing brush +sculptor +sculpture +sea cave +sea ice +sea lion +sea turtle +sea urchin +seabass +seabed +seabird +seafood +seahorse +seal +sea view +seashell +seaside resort +season +seat +seat belt +seaweed +secretary +security +sedan +see +seed +seesaw +segway +selfie +sell +seminar +sense +sensor +server +server room +service +set +sewing machine +shadow +shake +shaker +shampoo +shape +share +shark +sharpener +sharpie +shaver +shaving cream +shawl +shear +shears +sheep +sheet +sheet music +shelf +shell +shellfish +shelter +shelve +shepherd +sherbert +shiba inu +shine +shipping +shipping container +shipwreck +shipyard +shirt +shirtless +shoal +shoe +shoe box +shoe shop +shoe tree +shoot +shooting basketball guard +shop window +shopfront +shopper +shopping +shopping bag +shopping basket +shopping cart +mall +shopping street +shore +shoreline +short +short hair +shorts +shot glass +shotgun +shoulder +shoulder bag +shovel +showcase +shower +shower cap +shower curtain +shower door +shower head +shredder +shrew +shrimp +shrine +shrub +shutter +siamese +siberia +sibling +side +side cabinet +side dish +sidecar +sideline +siding +sign +signage +signal +signature +silk +silk stocking +silo +silver +silver medal +silverware +sing +singe +singer +sink +sip +sit +sitting +skate park +skateboard +skateboarder +skater +skating rink +skeleton +sketch +skewer +ski +ski boot +ski equipment +ski jacket +ski lift +ski pole +ski resort +snowboard +skier +skiing shoes +skin +skull +skullcap +sky +sky tower +skylight +skyline +skyscraper +slalom +slate +sleigh +sleep +sleeping bag +sleepwear +sleeve +slice +slide +slider +sling +slope +slot +slot machine +sloth +slow cooker +slug +slum +smell +smile +smoke +snack +snail +snake +snapper +snapshot +snorkel +snout +snow +snow leopard +snow mountain +snowball +snowboarder +snowfield +snowflake +snowman +snowmobile +snowplow +snowshoe +snowy +soap +soap bubble +soap dispenser +soccer goalkeeper +socialite +sock +socket +soda +softball +software +solar battery +soldier +solo +solution 
+sombrero +song +sound +soup +soup bowl +soupspoon +sour cream +souvenir +soybean milk +spa +space +space shuttle +space station +spacecraft +spaghetti +span +wrench +spark +sparkle +sparkler +sparkling wine +sparrow +spatula +speaker +spectator +speech bubble +speed limit +speed limit sign +speedboat +speedometer +sphere +spice +spice rack +spider +spider web +spike +spin +spinach +spire +splash +sponge +spoon +sport association +sport equipment +sport team +sports ball +sports equipment +sports meet +sportswear +dot +spray +spread +spring +spring roll +sprinkle +sprinkler +sprout +spruce +spruce forest +squad +square +squash +squat +squeeze +squid +squirrel +water gun +stab +stable +stack +stadium +staff +stage +stage light +stagecoach +stain +stainless steel +stair +stairs +stairwell +stall +stallion +stand +standing +staple +stapler +star +stare +starfish +starfruit +starling +state park +state school +station +stationary bicycle +stationery +statue +steak +steak knife +steam +steam engine +steam locomotive +steam train +steamed bread +steel +steering wheel +stem +stencil +step stool +stereo +stethoscope +stew +stick +stick insect +sticker +still life +stilt +stingray +stir +stirrer +stirrup +sew +stock +stocking +stomach +stone building +stone carving +stone house +stone mill +stool +stop +stop at +stop light +stop sign +stop watch +traffic light +storage box +storage room +tank +store +storefront +stork +storm +storm cloud +stormy +stove +poker +straddle +strainer +strait +strap +straw +straw hat +strawberry +stream +street art +street artist +street corner +street dog +street food +street light +street market +street photography +street scene +street sign +street vendor +stretch +stretcher +strike +striker +string +string cheese +strip +stripe +stroll +structure +studio +studio shot +stuff +stuffed animal +stuffed toy +stuffing +stump +stunning +stunt +stupa +style +stylus +submarine +submarine sandwich +submarine water +suburb +subway +subway station +subwoofer +succulent +suede +sugar +sugar bowl +sugar cane +sugar cube +suit +suite +summer +summer evening +summit +sun +sun hat +sunbathe +sunday +sundial +sunflower +sunflower field +sunflower seed +sunglasses +sunny +sunrise +sunset +sunshade +sunshine +super bowl +sports car +superhero +supermarket +supermarket shelf +supermodel +supporter +surf +surface +surfboard +surfer +surgeon +surgery +surround +sushi +sushi bar +suspenders +suspension +suspension bridge +suv +swallow +swallowtail butterfly +swamp +swan +swan boat +sweat pant +sweatband +sweater +sweatshirt +sweet +sweet potato +swim +swim cap +swimmer +swimming hole +swimming pool +swing +swing bridge +swinge +swirl +switch +swivel chair +sword +swordfish +symbol +symmetry +synagogue +syringe +syrup +system +t shirt +t-shirt +tabasco sauce +tabby +table tennis racket +table top +tablecloth +tablet computer +tableware +tachometer +tackle +taco +tae kwon do +tai chi +tail +tailor +take +takeoff +talk +tambourine +tan +tangerine +tape +tapestry +tarmac +taro +tarp +tart +tassel +taste +tatami +tattoo +tattoo artist +tavern +tea +tea bag +tea party +tea plantation +tea pot +tea set +teach +teacher +teacup +teal +team photo +team presentation +tear +technician +technology +teddy +tee +teenager +telegraph pole +zoom lens +telescope +television +television camera +television room +television studio +temperature +temple +tempura +tennis +tennis court +tennis match +tennis net +tennis player +tennis racket +tent +tequila +terminal +terrace +terrain +terrarium +territory +test +test 
match +test tube +text +text message +textile +texture +thanksgiving +thanksgiving dinner +theater +theatre actor +therapy +thermometer +thermos +thermos bottle +thermostat +thicket +thimble +thing +thinking +thistle +throne +throne room +throw +throw pillow +thunder +thunderstorm +thyme +tiara +tick +ticket +ticket booth +tide pool +tie +tiger +tight +tile +tile flooring +tile roof +tile wall +tin +tinfoil +tinsel +tiramisu +tire +tissue +toast +toaster +tobacco +tobacco pipe +toddler +toe +tofu +toilet bowl +toilet seat +toiletry +tokyo tower +tomato +tomato sauce +tomato soup +tomb +tong +tongs +tool +toolbox +toothbrush +toothpaste +toothpick +topiary garden +topping +torch +tornado +tortilla +tortoise +tote bag +totem pole +totoro +toucan +touch +touchdown +tour +tour bus +tour guide +tourist +tourist attraction +tournament +tow truck +towel +towel bar +tower block +tower bridge +town +town square +toy +toy car +toy gun +toyshop +track +tractor +trade +tradition +traditional +traffic +traffic cone +traffic congestion +traffic jam +traffic sign +trail +trailer +trailer truck +train +train bridge +train car +train interior +train track +train window +trainer +training +training bench +training ground +trolley +trampoline +transformer +transparency +travel +tray +treadmill +treat +tree +tree branch +tree farm +tree frog +tree house +tree root +tree trunk +trial +triangle +triathlon +tribe +tributary +trick +tricycle +trim +trio +tripod +trombone +troop +trophy +trophy cup +tropic +trout +truck +truck driver +tub +tube +tugboat +tulip +tuna +tundra +tunnel +turbine +turkey +turn +turnip +turquoise +turret +turtle +tusk +tv actor +tv cabinet +tv drama +tv genre +tv personality +tv show +tv sitcom +tv tower +twig +twilight +twin +twine +twist +type +type on +typewriter +ukulele +ultraman +umbrella +underclothes +underwater +unicorn +uniform +universe +university +up +urban +urinal +urn +use +utensil +utility room +vacuum +valley +valve +vampire +van +vanilla +vanity +variety +vase +vault +vector cartoon illustration +vector icon +vegetable +vegetable garden +vegetable market +vegetation +vehicle +veil +vein +velvet +vending machine +vendor +vent +vespa +vessel +vest +vet +veteran +veterinarians office +viaduct +video +video camera +video game +videotape +view mirror +vigil +villa +village +vine +vinegar +vineyard +violence +violet +violin +violinist +violist +vision +visor +vodka +volcano +volleyball +volleyball court +volleyball player +volunteer +voyage +vulture +waffle +waffle iron +wagon +wagon wheel +waist +waiter +waiting hall +waiting room +walk +walking +walking cane +wall clock +wallpaper +walnut +walrus +war +warehouse +warm +warning sign +warrior +warship +warthog +wash +washer +washing +washing machine +wasp +waste +waste container +watch +water +water bird +water buffalo +water cooler +water drop +water feature +water heater +water level +water lily +water park +water pipe +water purifier +water ski +water sport +water surface +water tower +watercolor +watercolor illustration +watercolor painting +waterfall +watering can +watermark overlay stamp +watermelon +waterproof jacket +waterway +wave +wax +weapon +wear +weather +vane +web +webcam +wedding +wedding ring +wedding bouquet +wedding cake +wedding couple +wedding invitation +wedding party +wedding photo +wedding photographer +wedding photography +wedding reception +wedge +weed +weight +weight scale +welder +well +western food +western restaurant +wet +wet bar +wet suit +wetland +wetsuit +whale +whale shark +wheat +wheat field 
+wheel +wheelchair +wheelie +whipped cream +whisk +whisker +whiskey +whistle +white +white house +white wine +whiteboard +wicket +wide +wield +wig +Wii +Wii controller +wild +wildebeest +wildfire +wildflower +wildlife +willow +wind +wind chime +wind farm +wind turbine +windmill +window +window box +window display +window frame +window screen +window seat +window sill +wiper +windshield +windy +wine bottle +wine cooler +wine cabinet +wine cellar +wine glass +wine rack +wine tasting +winery +wing +winter +winter melon +winter morning +winter scene +winter sport +winter storm +wire +wisteria +witch +witch hat +wok +wolf +woman +wood +wood duck +wood floor +wood wall +wood-burning stove +wooden spoon +woodland +woodpecker +woodworking plane +wool +job +work card +workbench +worker +workplace +workshop +world +worm +worship +wound +wrap +wrap dress +wrapping paper +wrestle +wrestler +wrinkle +wristband +write +writer +writing +writing brush +writing desk +yacht +yak +yard +yellow +yoga +yoga mat +yoghurt +yoke +yolk +youth +youth hostel +yurt +zebra +zebra crossing +zen garden +zip +zipper +zombie +zongzi +zoo \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/data/ram_tag_list_chinese.txt b/fengshen/models/Lyrics/ram/data/ram_tag_list_chinese.txt new file mode 100644 index 0000000..b21cc5f --- /dev/null +++ b/fengshen/models/Lyrics/ram/data/ram_tag_list_chinese.txt @@ -0,0 +1,4585 @@ +三维CG渲染 +3d眼镜 +算盘 +鲍鱼 +修道院 +肚子 +学院 +附件 +事故 +手风琴 +橡子 +丙烯颜料 +表演 +行动 +动作电影 +活动 +演员 +改编本 +添加 +胶带 +调整 +成人 +冒险 +广告 +天线 +有氧运动 +喷雾罐 +爆炸头 +农业 +帮助 +空调 +空调系统 +风向标 +飞机客舱 +飞机模型 +机场 +航线 +客机 +飞行员 +飞机 +飞机窗口 +机场 +机场跑道 +航站楼 +飞艇 +航展 +过道 +警报 +闹钟 +信天翁 +唱片 +唱片封面 +酒精 +壁龛 +水藻 +胡同/球道 +杏仁 +芦荟 +高山 +羊驼 +字母表 +德国牧羊犬 +圣坛 +琥珀 +救护车 +秃鹰 +美国短毛猫 +紫水晶 +圆形剧场 +扩音器 +游乐园 +游乐设施 +锚 +古老的 +海葵 +天使 +角 +动物 +动物雕塑 +动物收容所 +动画片 +动画电影 +动画师 +动漫 +脚踝 +短袜 +周年庆 +风衣 +蚂蚁 +羚羊 +古董 +鹿角 +铁砧 +公寓 +猿 +应用程序 +应用图标 +出现 +外观 +开胃菜 +掌声 +苹果 +苹果汁 +苹果派 +苹果树 +苹果酱 +设备 +约定 +通道 +杏子 +围裙 +浅绿色 +水族馆 +观赏鱼 +渡槽 +游乐中心 +商场游戏机 +拱门 +拱桥 +考古现场 +射箭 +群岛 +建筑师 +建筑设计 +档案 +拱门 +地区 +竞技场 +争论 +手臂 +穿山甲 +臂章 +扶手椅 +衣柜 +盔甲 +军队 +军事基地 +坦克 +阵列 +逮捕 +箭头 +艺术 +艺术展 +美术馆 +艺术印刷品 +艺术学校 +艺术工作室 +艺术矢量插图 +洋蓟 +文章 +手工艺品 +艺术家 +艺术阁楼 +灰 +烟灰缸 +亚洲寺庙 +芦笋 +沥青道路 +组装 +集会 +生产流水线 +协会 +宇航员 +天文学家 +运动员 +运动 +地图集 +自助取款机 +大气层 +中庭 +连接 +战斗机 +参加 +吸引力 +全地形车 +茄子 +拍卖 +奥迪汽车 +音频 +礼堂 +极光 +作者 +汽车厂 +汽车修理工 +汽车零件 +车展 +汽车展厅 +汽车电池 +汽车制造 +汽车模型 +汽车 +秋天 +秋天的森林 +秋天的叶子 +秋天的公园 +秋天的树 +阿凡达 +林荫大道 +飞行员太阳镜 +牛油果 +奖品 +颁奖典礼 +获奖者 +棚 +斧头 +杜鹃花 +狒狒 +婴儿 +奶瓶 +婴儿车 +婴儿衣服 +小象 +婴儿食品 +婴儿座椅 +迎婴派对 +背后/后面 +背景 +背光 +背包 +后院 +培根 +徽章 +獾 +荒地 +羽毛球运动 +羽毛球拍 +袋子 +面包圈 +风笛 +法棍 +诱饵 +焙烤食品 +面包师 +面包店 +烘焙 +烤盘 +平衡 +平衡车 +阳台 +球 +球池 +芭蕾舞女演员 +芭蕾舞 +芭蕾舞演员 +芭蕾舞裙 +气球 +气球拱门 +棒球手 +舞厅 +竹子 +竹林 +香蕉 +香蕉面包 +香蕉叶子 +香蕉树 +乐队 +创可贴 +绷带 +头巾 +束发带 +刘海 +手镯 +栏杆 +五弦琴 +银行 +银行卡 +银行金库 +纸币 +横幅/旗帜 +宴会 +宴会厅 +榕树 +包子 +洗礼 +酒吧 +条形码 +高脚凳 +烧烤 +烧烤架 +杠铃 +理发师 +理发店 +芭比娃娃 +驳船 +咖啡师 +树皮 +大麦 +谷仓 +仓鸮 +挡光板 +桶 +路障 +屏障 +手推车 +酒保 +棒球 +棒球基地 +棒球棒 +棒球帽 +棒球场 +棒球比赛 +棒球手套 +棒球投手 +棒球队 +棒球制服 +地下室 +罗勒 +水盆 +篮子 +篮子 +篮球 +篮球篮板 +篮球教练 +篮球场 +篮球比赛 +篮球框 +篮球运动员 +篮球馆 +篮球队 +贝斯 +低音吉他 +低音喇叭 +贝斯手 +球棒/球拍 +浴室 +水浴加热器 +浴垫 +浴巾 +泳装 +浴袍 +浴室 +浴室配件 +浴室柜 +浴室门 +浴室镜子 +浴室水槽 +卫生纸 +浴室窗户 +蝙蝠侠 +棒子 +接连猛打/击球员 +电池 +战斗 +战绳 +战舰 +海湾 +海湾大桥 +凸窗 +杨梅 +集市 +海滩 +沙滩球 +沙滩椅 +海滨别墅 +海滩小屋 +沙滩毛巾 +沙滩排球 +灯塔 +珠子 +比格犬 +鸟嘴 +烧杯 +横梁 +豆子 +豆袋椅 +豆袋 +熊 +幼熊 +胡子 +野兽 +击打/击败 +美丽的 +美丽 +美容院 +海狸 +床 +床单 +床架 +卧室 +床上用品 +便盆 +卧室窗户 +床头灯 +蜜蜂 +山毛榉 +牛肉 +养蜂人 +蜂鸣器 +啤酒 +啤酒瓶 +啤酒罐 +啤酒花园 +啤酒杯 +啤酒馆 +甜菜 +甲虫 +米色 +时钟 +甜椒 +钟楼 +皮带 +皮带扣 +长凳 +弯曲 +孟加拉虎 +盒饭 +贝雷帽 +浆果 +停泊位 +饮料 +围嘴 +拌饭 +圣经 +比熊 +自行车 +自行车头盔 +自行车车轮 +自行车骑士 +坐浴盆 +大本钟 +自行车道 +自行车道 +自行车赛 +骑车 +比基尼 +比基尼上衣 +账单 +台球 +广告牌 +台球台 +垃圾箱 +活页夹 +双筒望远镜 +生物学实验室 +双翼飞机 +桦木 +桦树 +鸟 +鸟池 +喂鸟器 +鸟舍 +鸟巢 +鸟池 +鸟笼 +出生 +生日 
+生日蛋糕 +生日蜡烛 +生日贺卡 +生日聚会 +饼干 +主教 +野牛 +钻头 +咬 +黑色 +黑山羊 +黑莓 +乌鸦 +黑板 +铁匠 +叶片/刀片 +毯子/覆盖层 +运动外套 +看台 +搅拌机 +祝福 +窗帘 +眼罩 +闪光 +暴风雪 +块 +博客 +血 +开花 +花 +女装衬衫 +吹 +吹风机 +河豚 +蓝色 +蓝色艺术家 +蓝松鸦 +蓝天 +蓝莓 +蓝知更鸟 +猪 +板子 +板擦 +棋盘游戏 +木板路 +船 +船甲板 +船屋 +桨 +乘船 +浮标 +山猫 +躯干 +身体冲浪板 +健美运动员 +水煮鸡蛋 +锅炉 +饰扣式领带 +门闩 +炸弹 +轰炸机 +披肩榛鸡 +骨骼 +篝火 +阀盖 +盆景 +书 +书籍封面 +书柜 +文件夹 +书签 +书架 +书店 +远程拾音器 +推动 +靴子 +边界 +边境牧羊犬 +植物园 +瓶 +瓶盖 +开瓶器 +螺旋开瓶器 +三角梅 +巨石 +花束 +时装店 +精品酒店 +鞠躬/蝴蝶结 +领结 +弓形窗 +碗 +保龄球运动 +保龄球馆 +保龄球 +保龄球设备 +盒子 +箱形梁桥 +箱龟 +拳击手 +内裤 +拳击 +拳击手套 +拳击台 +男孩 +支撑物 +支架 +辫子 +大脑 +刹车 +刹车灯 +树枝 +商标 +白兰地 +黄铜 +黄铜牌匾 +面包 +面包箱 +休息 +早餐 +防浪堤 +胸部 +啤酒厂 +砖块 +砖建筑物 +墙 +砖块 +婚纱 +新娘 +新郎 +伴娘 +桥 +缰绳 +公文包 +明亮的 +边沿 +钻头 +广播 +西兰花 +青铜 +铜牌 +青铜雕塑 +青铜雕像 +胸针 +小溪 +扫帚 +肉汤 +棕色 +棕熊 +巧克力蛋糕 +早午餐 +浅黑肤色的女人 +刷子 +郊狼 +包菜 +气泡 +泡泡糖 +珍珠奶茶 +斗柜 +盾牌 +芽 +佛 +水牛 +自助餐 +昆虫 +建造 +建造者 +建筑 +积木 +建筑立面 +建筑材料 +灯 +牛 +斗牛犬 +子弹 +动车 +公告栏 +防弹背心 +斗牛 +扩音器 +斗牛场 +大黄蜂 +保险杠 +卷/地形起伏 +捆 +蹦极 +双层床 +地堡/击球 +兔子 +浮标 +书桌 +墓室 +燃烧 +玉米煎饼 +公交车 +公交车司机 +公交车内部 +公交车站 +公交车站 +公交车窗户 +灌木 +商业 +名片 +业务主管 +商务西装 +业务团队 +女商人 +商人 +半身像 +屠夫 +肉铺 +孤峰 +黄油 +奶油 +蝴蝶 +蝴蝶馆 +按钮 +梧桐树 +购买 +出租车 +小屋 +卷心菜 +小屋 +守车 +储藏柜 +橱柜 +电缆 +缆车 +仙人掌 +咖啡馆 +食堂 +笼子 +蛋糕 +蛋糕台 +计算器 +大锅 +日历 +小腿 +通话 +电话亭 +书法 +平静的 +摄像机 +骆驼 +相机 +相机镜头 +迷彩 +露营 +露营者 +篝火 +露营 +营地 +校园 +罐 +开罐器 +运河 +金丝雀 +癌症 +蜡烛 +烛台 +糖果 +块状糖 +柺杖糖 +糖果店 +拐杖 +罐子 +大炮 +树冠/顶棚 +四柱床 +香瓜 +悬臂桥 +帆布 +峡谷 +帽子 +斗篷 +科德角 +卡布奇诺 +胶囊 +队长 +捕获 +车 +汽车经销商 +车门 +汽车内饰 +车标 +后视镜 +停车场 +汽车座椅 +车展 +洗车 +车窗 +焦糖 +卡片 +纸牌游戏 +纸板 +纸板盒 +羊毛衫 +红衣凤头鸟 +货物 +货运飞机 +货船 +加勒比 +康乃馨 +狂欢节 +食肉动物 +旋转木马 +鲤鱼 +木匠 +地毯 +拖鞋 +红雀 +长途客车 +斑点狗 +航空母舰 +胡萝卜 +胡萝卜蛋糕 +携带 +手推车 +纸箱/纸盒 +卡通 +卡通人物 +卡通插图 +卡通风格 +雕刻 +容器 +现金 +腰果 +赌场 +砂锅 +磁带 +盒式录音机 +石膏绷带 +铸造 +城堡 +猫 +猫窝 +猫粮 +猫器具 +猫架 +地下墓穴 +双体船 +美洲狮 +握着/抓着 +捕手 +毛毛虫 +鲶鱼 +教堂 +牛 +猫步 +走秀 +菜花 +洞穴 +鱼子酱 +光盘 +CD播放器 +雪松 +天花板 +吊扇 +庆祝 +庆典 +名人 +芹菜 +大提琴 +手机 +水泥 +墓地 +中心装饰品 +蜈蚣 +陶瓷 +瓷砖 +麦片 +仪式 +证书 +链条 +链锯 +椅子 +升降椅 +躺椅 +木屋 +圣杯 +粉笔 +房间 +变色龙 +香槟酒 +香槟杯 +冠军 +锦标赛 +吊灯 +婴儿换尿布台 +通道 +皴裂处 +小教堂 +人物雕塑 +木炭 +充电 +充电器 +战车 +慈善机构 +慈善活动 +魅力 +图表 +追逐 +底盘 +检查/支票 +支票簿 +棋盘 +检查表 +欢呼声 +鼓励/啦啦队 +奶酪 +奶酪汉堡 +奶酪蛋糕 +猎豹 +厨师 +化合物 +化学家 +化学 +化学实验室 +旗袍 +樱桃 +樱花 +樱桃番茄 +樱桃树 +国际象棋 +栗子 +鸡 +鸡胸肉 +鸡笼 +鸡肉沙拉 +鸡翅 +鹰嘴豆 +小衣橱 +吉娃娃 +孩子 +童星 +孩子的房间 +红番椒 +辣热狗 +烟囱 +黑猩猩 +瓷器 +白菜 +中国园林 +中国结 +月季 +中国塔 +炸薯条/炸薯条 +花栗鼠 +凿子 +巧克力 +巧克力棒 +巧克力蛋糕 +巧克力碎片 +巧克力饼干 +巧克力牛奶 +巧克力慕斯 +松露 +唱诗班 +厨房刀 +砧板 +筷子 +圣诞节 +圣诞球 +圣诞贺卡 +圣诞装饰 +圣诞晚宴 +平安夜 +圣诞帽 +圣诞灯 +圣诞市场 +圣诞装饰 +圣诞树 +菊花 +教堂 +教堂塔 +苹果酒 +雪茄 +雪茄盒 +香烟 +烟盒 +腰带 +电影院 +摄影师 +肉桂 +圆 +电路 +电路板 +马戏团 +水箱 +柑橘类水果 +城市 +城市公交 +市政厅 +城市夜景 +城市公园 +城市天际线 +城市广场 +城市街道 +城墙 +城市景观 +蛤蜊 +单簧管 +扣子 +班级 +经典 +教室 +锁骨 +爪子 +黏土 +陶器 +清洁 +洁净室 +清洁工人 +清洁用品 +清晰的 +栓 +克莱门氏小柑橘 +客户端 +悬崖 +爬 +爬山 +登山者 +诊所 +夹子 +剪贴画 +剪贴板 +快速帆船 +君子兰 +斗篷 +木底鞋 +特写 +壁橱 +布 +穿衣 +衣服 +晒衣夹 +晒衣绳 +服装店 +云 +云雾森林 +多云 +三叶草 +小丑 +小丑鱼 +俱乐部 +离合器 +手拿包 +煤炭 +海岸 +外套 +衣帽架 +玉米 +公鸡 +凤头鹦鹉 +可卡犬 +驾驶 +蟑螂 +鸡尾酒 +小礼服 +鸡尾酒调制器 +鸡尾酒桌 +可可 +椰子 +椰子树 +咖啡 +咖啡豆 +咖啡杯 +咖啡机 +咖啡店 +咖啡壶 +棺材 +法国白兰地 +螺旋 +硬币 +可口可乐 +滤器 +冷的 +卷心菜沙拉 +合作 +拼贴画 +收藏品 +大学生 +牧羊犬 +碰撞 +颜色 +涂色书 +染色材料 +矮种马 +柱子 +梳子 +密码锁 +喜剧演员 +喜剧 +喜剧电影 +彗星 +舒服 +安慰食物 +漫画书 +漫画人物 +连环画 +指挥官 +评论员 +社区 +通勤 +公司 +指南针 +比赛 +比赛 +竞争者 +作曲家 +作文 +堆肥 +电脑 +电脑机箱 +电脑椅 +电脑桌 +键盘 +计算机显示器 +计算机房 +电脑屏幕 +机箱 +概念车 +音乐会 +音乐厅 +贝壳 +混凝土 +调味品 +避孕套 +独立产权的公寓 +指挥 +锥形物 +会议 +会议中心 +会议厅 +会议室 +五彩纸屑 +冲突 +合流 +连接 +连接器 +温室 +星座 +建筑工地 +建筑工人 +包含 +容器 +集装箱船 +大陆 +轮廓 +合同 +控制 +控制塔 +便利店 +集会 +交谈 +转换器 +可转换的 +输送机 +厨师/烹饪 +烹饪 +烹饪喷雾剂 +炊具 +凉的 +冷却器 +铜 +一本/一册 +珊瑚 +珊瑚礁 +粗绳 +有线电话 +酒 +威尔士矮脚狗 +瓶塞 +软木板 +鸬鹚 +玉米 +玉米田 +玉米面包 +角落 +小号 +飞檐 +燕麦片 +围栏 +走廊 +紧身衣 +化妆品 +化妆刷 +化妆镜 +角色扮演 +服装 +服装电影设计师 +婴儿床 +小屋 +棉花 +棉花糖 +沙发 +倒计时 +柜台 +台面 +最佳乡村歌手 +乡村别墅 +乡村公路 +乡村流行歌手 +农村 +双门小轿车 +夫妇/两人/几个 +情侣写真 +小胡瓜 +课程 +球场 +法院 +院子 +堂兄弟 +工作服 +奶牛 +母牛的颈铃 +牛仔 +牛仔靴 +牛仔帽 +螃蟹 +蟹肉 +裂纹 +摇篮 +工艺 +工匠 +蔓越莓 +起重机 +黑纱 +厕所 +板条箱 +火山口湖 +龙虾 +蜡笔 +奶油乳酪 +奶油罐 +创建 +生物 +信用卡 +新月形 +新月形面包 +山顶 +全体船员 +蟋蟀 +板球用球 +板球队 +板球队员 +钩边 
+克罗克电锅 +鳄鱼 +庄稼 +露脐上衣 +交叉 +横木 +十字路口 +相声 +人行横道 +油煎面包块 +乌鸦 +撬棍 +人群 +拥挤的 +皇冠 +阴极射线管屏幕 +耶稣受难像 +巡游 +游轮 +巡洋艇 +面包屑 +压坏 +拐杖 +水晶 +幼兽 +立方体 +黄瓜 +球杆 +袖口 +袖扣 +烹饪 +农田 +杯子 +纸杯蛋糕 +丘比特 +马路牙子 +旋度 +卷发器 +无籽葡萄干 +货币 +咖喱 +窗帘 +曲线 +软垫 +顾客 +切 +餐具 +自行车 +骑自行车 +龙卷风 +汽缸 +铙钹 +柏树 +柏树 +达克斯猎狗 +水仙花 +匕首 +大丽花 +萝卜 +乳制品 +雏菊 +大坝 +损害 +潮湿的 +跳舞 +舞池 +舞蹈室 +舞者 +蒲公英 +黑暗 +黑暗 +飞镖 +圆靶 +指示板 +日期 +女儿 +黎明 +天床上 +日光 +门栓 +死亡 +辩论 +碎片 +玻璃水瓶 +甲板 +双层巴士 +装饰 +装修/装饰 +装饰画 +鹿 +后卫 +神 +熟食 +投递 +拆迁 +怪兽 +演示 +兽窝/休闲室 +牛仔夹克 +牙医 +百货商店 +抑郁症 +德比 +皮肤病 +沙漠 +沙漠公路 +设计 +设计师 +桌子/表格 +台灯 +桌面 +台式电脑 +甜点 +破坏 +侦探 +洗涤剂 +露水 +仪表盘 +钻石 +尿布 +尿布包 +杂志 +死 +饮食 +挖掘机 +数字 +数字时钟 +莳萝 +晚餐 +小船 +餐厅 +晚宴 +餐桌 +恐龙 +浸 +文凭 +指引 +导演 +尘埃 +越野摩托车 +泥土地 +泥土路 +泥路/土路 +灾难 +信徒 +迪斯科舞厅 +迪斯科灯秋 +迪斯科舞厅 +疾病 +盘子 +碟形天线 +洗碗机 +抹布 +菜肴 +洗碗液 +迪斯尼乐园 +自动售货机 +展示 +陈列窗 +壕沟 +潜水 +潜水员 +跳水板 +纸杯 +流行音乐播音员 +杜宾犬 +码头 +医生 +文件 +纪录片 +狗 +狗窝 +犬种 +狗项圈 +狗粮 +狗窝 +洋娃娃 +美元 +玩偶之家 +洋娃娃 +海豚 +穹顶 +住宅 +多米诺骨牌 +驴 +甜甜圈 +涂鸦 +门 +门把手 +受气包 +门牌 +门口 +宿舍 +面团 +市中心 +推土机 +拖 +龙 +蜻蜓 +排水沟 +剧本 +戏剧电影 +画 +抽屉里 +图画/画画 +图钉 +辫子 +连衣裙/特定场合的服装 +礼帽 +正装衬衫 +皮鞋 +大礼服 +梳妆台 +更衣室 +运球 +漂移 +浮木 +钻 +饮品/喝 +饮用水 +开车 +司机 +车道 +无人机 +水滴/下降 +吊灯 +滴管 +干旱 +药物 +药店 +鼓 +鼓手 +鸡腿 +干的 +公爵夫人 +鸭子 +鸭嘴兽 +小鸭子 +布基胶带 +伙计 +二重唱 +粗呢 +独木舟 +哑铃 +饺子 +沙丘 +扣篮 +榴莲 +黄昏 +灰尘 +垃圾车 +簸箕 +羽绒被 +DVD +染料 +鹰 +耳朵 +御寒耳罩 +耳机 +耳塞 +耳环 +地震 +画架 +复活节 +复活节兔子 +复活节彩蛋 +吃 +餐厅 +泡芙 +日食 +生态系统 +编辑 +教育 +教育家 +鳗鱼 +蛋 +蛋卷 +蛋挞 +打蛋器 +白鹭 +埃菲尔铁塔 +橡皮筋 +上级 +电椅 +电钻 +电工 +电 +电子 +电子器件 +大象 +高度图 +电梯 +电梯轿厢 +电梯门 +电梯大堂 +电梯井 +路堤 +大使馆 +装饰 +灰烬 +会徽 +刺绣 +翡翠 +紧急 +紧急服务 +紧急车辆 +情感 +帝国大厦 +搪瓷 +外壳/围墙 +茶几 +能源 +订婚 +订婚戒指 +引擎 +机舱 +工程师 +工程 +英国短毛猫 +乐团 +回车键 +演艺人员 +娱乐 +娱乐中心 +入口 +入口大厅 +信封 +马术 +设备 +橡皮擦 +二胡 +侵蚀 +自动扶梯 +食用蜗牛 +浓缩咖啡 +房地产 +河口 +桉树 +晚上 +晚礼服 +夜光 +傍晚天空 +晚上的太阳 +事件 +常绿的 +母羊 +挖掘 +运动 +排气罩 +展览 +出口 +探险者 +爆炸 +延长线 +灭火器 +排气扇 +挤压 +眼睛 +眼影 +眉 +眼线笔 +布料 +纺织品商店 +外观 +脸 +脸部特写 +蜜粉 +毛巾 +面巾纸架 +设施 +工厂 +工厂车间 +集市 +露天市场 +仙女 +猎鹰 +秋天 +家庭 +家庭轿车 +全家福 +家庭房 +风扇/扇子 +尖牙 +农场 +农民 +农民市场 +农舍 +时尚 +时尚配饰 +时装设计师 +时尚的女孩 +时装插图 +时装大片 +时装模特 +时装表演 +快餐 +西式快餐 +父亲 +水龙头 +故障 +动物 +小鹿 +传真 +宴会 +羽毛 +软呢帽 +饲料 +一餐 +饲养 +喂养的椅子 +猫科 +美洲狮 +栅栏 +芬达 +蕨类植物 +雪貂 +摩天轮 +渡船 +肥料 +节日 +纤维 +小说 +小说书 +田野/场地/野外 +田间道路 +无花果 +打架 +花样滑冰运动员 +小雕像 +文件 +档案照片 +文件柜 +填满 +胶片相机 +电影导演 +电影格式 +电影首映礼 +电影制片人 +拍摄 +过滤器 +鳍 +手 +终点线 +冷杉 +冷杉树 +火 +火灾报警 +消防部门 +消防车 +消防通道 +消防水带 +火坑 +消防站 +爆竹 +消防队员 +壁炉 +烟花 +烟花表演 +急救箱 +鱼 +鱼船 +海鲜市场 +鱼塘 +鱼缸 +渔夫 +钓鱼 +渔船 +渔网 +钓鱼 +渔村 +健身 +健身课程 +五个 +固定装置 +峡湾 +国旗 +旗杆 +小薄片 +火焰 +火烈鸟 +法兰绒 +拍打 +耀斑 +闪光 +烧瓶 +平 +比目鱼 +风味 +跳蚤 +跳蚤市场 +舰队 +飞行 +空中乘务员 +翻转 +触发器 +翻转图 +浮动 +群 +洪水 +地板/地面 +落地扇 +脚垫 +楼层平面图 +落地窗 +插花艺术 +花店 +牙线 +面粉 +流动 +花 +花篮 +花坛 +花箱 +花田 +花童 +花卉市场 +流体 +冲洗 +长笛 +飞 +飞行钓鱼 +传单 +马 +泡沫 +雾 +多雾的 +鹅肝酱 +箔纸 +折椅 +树叶 +民间艺术家 +民间舞蹈 +民间摇滚艺术家 +方旦糖 +火锅 +圣洗池 +食物 +食用色素 +美食广场 +食品加工机 +小吃摊 +快餐车 +桌上足球 +脚 +人行桥 +足球 +足球教练 +大学橄榄球赛 +足球比赛 +足球场 +足球比赛 +橄榄球头盔 +足球运动员 +足球场 +足球队 +小路 +脚印 +脚踏板 +台座 +鞋子 +故宫 +浅滩 +额头 +森林 +森林大火 +森林地面 +森林小路 +森林公路 +锻造 +餐叉 +叉车 +表格 +园林 +队列/形成物 +F1方程式赛车 +堡垒 +碉堡 +追逐 +化石 +粉底 +喷泉 +钢笔 +狐狸 +框架 +雀斑 +高速公路 +卡车 +法国 +法国斗牛犬 +薯条 +法式吐司 +化妆水 +冰箱 +炸鸡 +煎蛋 +炒饭 +友谊 +飞盘 +青蛙 +霜 +结霜 +严寒 +结冰 +水果 +水果蛋糕 +水果盘 +水果市场 +水果沙拉 +水果摊 +果树 +水果商店 +油炸食品 +煎锅 +软糖 +燃料 +吸烟罩 +有趣的 +葬礼 +真菌 +漏斗 +毛皮衣服 +毛皮大衣 +家具 +蒲团 +小工具 +枪口 +星云/星系 +美术馆 +游戏 +游戏棋盘 +游戏手柄 +火腿 +团伙 +车库 +车库门 +手工模型 +垃圾 +花园 +花园芦笋 +橡胶软管 +花园蜘蛛 +园丁 +园艺 +加菲猫 +滴水嘴 +花环 +大蒜 +衣服 +气体 +加油站 +煤气炉 +防毒面具 +收集 +聚集 +测量仪器 +露台 +齿轮 +壁虎 +艺妓 +凝胶 +百货商店 +发电机 +天竺葵 +幽灵 +礼物 +礼品袋 +礼品篮 +礼物盒 +礼品卡 +礼品商店 +礼物包装 +演唱会 +杜松子酒 +姜 +姜饼 +姜饼屋 +银杏树 +长颈鹿 +女孩 +给 +冰川 +角斗士 +玻璃珠 +玻璃瓶 +玻璃碗 +玻璃箱 +玻璃建筑 +玻璃门 +玻璃地板 +玻璃屋 +玻璃罐 +玻璃板 +玻璃桌子 +玻璃花瓶 +玻璃墙 +玻璃窗 +眼镜 +光滑面 +滑翔机 +地球 +手套 +发光 +汤圆 +去 +袭击 +球门 +守门员 +山羊 +羊奶酪 +戈壁 +护目镜/墨镜 +黄金 +金牌 +金门大桥 +金毛猎犬 +金鱼 +高尔夫运动 +高尔夫球帽 +高尔夫球车 +高尔夫球杆 +高尔夫球场 +高尔夫球手 +鹅 +大猩猩 +哥特式 +葫芦 +政府 +政府机构 +礼服 +毕业生 +毕业典礼 +谷物 +逆戟鲸 +大奖赛 +祖父 +祖母 +祖父母 +花岗岩 +格兰诺拉麦片 +葡萄 +西柚 +葡萄酒 +草 +蚱蜢 +草原 +长满草的 +擦菜器 +坟墓 +碎石 +墓碑 +肉汁 +调味汁瓶 +灰色 +吃草 +放牧 
+绿色 +绿色植物 +欢迎 +问候 +贺卡 +灰狗 +网格 +筛子 +烧烤架 +格栅 +烤鳗鱼 +磨 +研磨机 +粗燕麦粉 +杂货袋 +洞穴 +地松鼠 +群体 +合影 +小树林 +生长 +牛油果酱 +警卫 +看门狗 +宾馆 +客房 +指南 +豚鼠 +吉他 +吉他手 +海湾 +海鸥 +枪 +高达 +谒师所 +古筝 +健身房 +体操运动员 +栖息地 +黑客 +冰雹 +头发 +头发颜色 +发胶 +毛刷 +发型 +发夹 +发网 +发夹 +发型 +一半 +礼堂 +万圣节 +万圣节服装 +万圣节南瓜 +露背装 +汉堡 +汉堡包 +哈密瓜 +锤子 +吊床 +阻碍 +仓鼠 +烘手机 +放大镜 +擦手巾 +手提包 +手球 +手铐 +手枪 +手帕 +把手 +手锯 +握手 +倒立 +手写 +汉服 +悬挂 +飞机库 +衣架 +幸福 +海港 +斑海豹 +硬摇滚艺术家 +精装书 +建筑工人 +硬件 +五金店 +硬木 +硬木地板 +口琴 +管风琴 +羽管键琴 +收获 +收割机 +坐垫/搁脚凳/草丛 +帽子 +帽盒 +双簧管 +山楂 +干草 +干草地 +榛子 +头 +主教练 +大灯 +床头板 +头饰 +海岬 +总部 +听力 +心脏 +心形 +热能 +加热器 +帚石楠 +树篱 +刺猬 +脚后跟 +直升机 +直升机机场 +头盔 +帮助 +母鸡 +指甲花 +药草 +兽群 +寄居蟹 +英雄 +苍鹭 +芙蓉花 +芙蓉花 +隐藏/隐蔽处 +高杠 +高跟鞋 +高地 +突出 +徒步旅行 +徒步旅行者 +徒步靴 +登山设备 +山丘 +丘陵地 +别墅 +山坡 +印度教寺庙 +铰链 +臀部 +嘻哈艺人 +河马 +历史学家 +历史遗迹 +历史 +曲棍球 +冰球馆 +曲棍球比赛 +曲棍球运动员 +曲棍球棒 +锄头 +洞 +假日 +冬青树 +海参 +家/住宅 +家用电器 +基地 +家居装饰 +室内设计 +内政部 +家庭影院 +家庭作业 +鹰嘴豆泥 +蜂蜜 +蜂窝 +蜜月 +风帽 +连帽衫 +挂钩/勾住 +跳 +地平线 +犀鸟 +长角牛 +大黄蜂 +震惊 +恐怖电影 +马鞍褥 +马车 +马场 +骑马 +马背 +马蹄铁 +软管 +医院 +医院病床 +病房 +主持人 +小旅馆 +热 +热气球 +热狗 +辣椒酱 +温泉 +旅馆 +酒店大堂 +酒店房间 +电炉 +沙漏 +房子 +房子外部 +室内植物 +悬滑板 +吼 +蜷缩 +拥抱 +呼啦圈 +人 +增湿器 +蜂鸟 +座头鲸 +打猎 +狩猎小屋 +障碍 +飓风 +哈士奇 +小屋 +鬣狗 +混合物 +绣球花 +消火栓 +水上飞机 +冰 +冰袋 +北极熊 +冰洞 +冰淇淋 +冰淇淋蛋卷 +冰淇淋商店 +冰块 +浮冰 +冰球运动员 +冰球队 +棒棒糖 +制冰机 +溜冰场 +冰雕 +冰架 +溜冰鞋 +滑冰 +冰山 +冰柱 +结冰 +图标 +身份证照片 +身份证 +冰屋 +光/灯光/光线 +鬣蜥蜴 +照亮 +插图 +形象 +黑斑羚 +熏香 +独立日 +个人 +室内 +划船器 +电磁炉 +工业区 +工业 +步兵 +充气艇 +服务台 +基础设施 +成分 +吸入器 +注射 +受伤 +墨水 +印泥 +小湖湾 +题词 +昆虫 +安装 +乐器/器械 +绝缘杯 +互动 +室内设计 +网站 +十字路口 +面试 +无脊椎动物 +邀请 +平板电脑 +苹果手机 +苹果音乐播放器 +虹膜 +铁 +熨衣板 +灌溉系统 +岛 +小岛 +等足类动物 +象牙 +常青藤 +居酒屋 +千斤顶 +帝王蟹/蟹 +夹克衫 +按摩浴缸 +玉 +美洲虎 +监狱牢房 +果酱 +日式花园 +茉莉花 +下巴 +松鸦 +爵士乐 +爵士乐艺术家 +爵士融合艺术家 +牛仔裤 +吉普车 +果冻 +果冻豆 +水母 +喷气式飞机 +摩托艇 +珠宝 +珠宝 +珠宝店 +拼图游戏 +人力车 +赛马骑师 +赛马帽 +慢跑 +联合的 +记者 +操纵杆 +法官 +水壶 +玩杂耍 +果汁 +榨汁器 +枣子 +跳绳 +连身裤 +丛林 +废品堆放场 +羽衣甘蓝 +万花筒 +袋鼠 +卡拉ok +空手道 +卡丁车运动 +旧城区 +皮船 +烤肉串 +按键/钥匙 +门卡 +卡其色 +踢 +苏格兰裙 +和服 +幼儿园教室 +幼儿园 +国王 +帝王蟹 +亲吻 +工具包 +厨房 +厨房橱柜 +厨房台面 +厨房地板 +厨房抽油烟机 +厨房岛 +厨房水槽 +厨房桌子 +厨房用具 +厨房窗户 +厨房用具 +风筝 +猕猴桃 +护膝 +跪下 +餐刀 +骑手 +编织 +编织针 +球形把手 +门环 +结 +考拉 +锦鲤 +ktv +实验室 +实验室外套 +标签 +拉布拉多 +迷宫 +网眼织物 +蕾丝连衣裙 +梯子 +长柄杓 +瓢虫 +环礁湖 +湖泊 +湖区 +湖边小屋 +湖岸 +羊肉 +羊排 +灯柱 +灯罩 +矛 +土地 +陆地车辆 +废物填埋 +着陆 +降落甲板 +地标 +风景 +山崩 +挂带 +灯笼 +腿/大腿 +笔记本电脑 +笔记本键盘 +幼体 +烤宽面条 +激光 +睫毛 +套索 +门闩 +乳胶 +拿铁咖啡 +笑 +发射 +发布会 +举办会议 +自助洗衣店 +洗衣房 +洗衣篮 +洗衣房 +熔岩 +薰衣草 +草坪 +草坪婚礼 +律师 +躺 +引领 +主唱 +通向 +领袖 +泄漏 +倾斜/倚靠 +学习 +皮带 +皮革 +皮夹克 +皮鞋 +演讲 +演讲厅 +教学室 +窗台 +剩饭 +腿 +传说 +紧身裤/秋裤 +立法院 +乐高 +豆类 +柠檬 +柠檬汁 +柠檬水 +狐猴 +镜头 +眩光 +扁豆 +豹 +紧身连衣裤 +紧身裤袜 +小妖精 +功课 +信函 +信箱 +信的标志 +刻字 +生菜 +水平 +图书馆 +许可证 +车牌 +地衣 +舔 +盖子 +躺着 +安全带 +救生衣 +救生艇 +救生员 +提起 +灯具 +灯光秀 +电灯开关 +照明/照明设备 +闪电 +避雷针 +淡紫色 +百合 +肢体 +石灰 +石灰石 +豪华轿车 +线条 +艺术线条 +排队 +亚麻 +邮轮 +狮子 +润唇膏 +口红 +液体 +酒类商店 +列表 +荔枝 +生活 +家畜 +客厅 +生活空间 +蜥蜴 +负载 +装卸码头 +游手好闲的人 +走廊 +定位 +锁 +闸室 +储物柜 +阁楼 +原木 +小木屋 +标志 +洛基 +长头发 +冲浪板 +隐约显现/织布机 +环状 +遗失 +彩票 +莲花 +爱 +双人沙发 +行李 +木材 +伐木工人 +午餐 +午餐盒 +郁郁葱葱的 +奢侈品 +豪华游艇 +雨衣 +澳洲胡桃 +短尾猿 +通心粉 +金刚鹦鹉 +弯刀 +机器 +机枪 +杂志 +魔法 +魔术师 +磁铁 +放大镜 +木兰花 +喜鹊 +麻将 +象夫 +女仆 +邮件 +邮件槽 +制作 +改造 +化妆师 +化妆工具 +野鸭 +野鸭 +槌棒 +哺乳动物 +猛犸象 +男人 +管理 +经理 +海牛 +曼荼罗 +橘子 +普通话 +鬃毛 +漫画 +食槽 +芒果 +山竹果 +红树林 +曼哈顿 +检修孔 +井盖 +修指甲 +人体模型 +庄园主宅 +大厦 +螳螂 +地幔 +活动房层 +制造业 +手稿 +地图 +枫木 +枫叶 +枫糖浆 +沙球 +马拉松 +大理石 +行进 +行进乐队 +母马 +金盏花 +水兵 +海洋无脊椎动物 +海洋哺乳动物 +木偶 +标志 +集市 +市场广场 +市场摊位 +结婚 +武术 +武术家 +武术馆 +马提尼 +马丁尼酒杯 +睫毛膏 +吉祥物 +土豆泥 +搅碎机 +面具/口罩 +按摩 +桅杆 +地垫 +斗牛士 +比赛 +火柴盒 +衣料 +床垫 +陵墓 +长裙 +一餐 +量杯 +卷尺 +肉类 +肉丸 +机械师 +机械风扇 +奖牌 +媒体 +医疗设备 +医学图像 +医务人员 +医药箱 +中世纪的 +麦地那市 +冥想 +猫鼬 +赛事 +香瓜 +纪念碑 +菜单 +美人鱼 +网 +肮脏 +信使袋 +金属 +金属艺术家 +金属探测器 +计量器 +中层楼 +麦克风 +显微镜 +微波炉 +午夜 +里程碑 +军装 +牛奶 +牛奶罐 +奶茶 +奶昔 +磨坊 +矿井 +矿工 +矿物质 +矿泉水 +迷你 +微缩模型 +面包车 +部长 +小型货车 +薄荷 +薄荷糖 +镜子 +小姐 +投掷物 +任务 +槲寄生 +混合 +搅拌机 +搅拌碗 +混合物 +护城河 +电动踏板车 +模型/模特 +汽车模型 +现代 +现代大厦 +潮湿 +模具 +模具 +鼹鼠 +君主 +钱 +监控器 +和尚 +猴子 +活动扳手 +黑白照片 +独轮脚踏车 +怪物卡车 +月亮 +月饼 +月光 +沼泽 +驼鹿 +拭子 +助力车 +早晨 +晨雾 +晨光 +朝阳 +砂浆 +马赛克 +清真寺 +蚊子 +藓类植物 +汽车旅馆 
+蛾 +母亲 +主板 +主题 +动作 +电动机 +摩托车 +摩托车 +摩托车头盔 +摩托车赛车手 +骑摩托车的人 +赛车运动 +土堆 +山 +山地自行车 +山地自行车员 +山地自行车运动 +山地大猩猩 +山湖 +山景观 +山口 +山路 +山脉 +山区河流 +山雪 +山间溪流 +山景城 +山村 +登山者 +登山包 +鼠标/鼠 +鼠标垫 +捕鼠器 +嘴 +漱口水 +移动 +电影海报 +电影票 +割草机 +mp3播放器 +先生 +泥 +松饼 +马克杯 +桑树 +覆盖物 +骡子 +直辖市 +壁画 +肌肉 +肌肉车 +博物馆 +蘑菇 +音乐 +音乐节 +音乐凳子 +音乐工作室 +音乐录影带表演者 +音乐键盘 +音乐家 +贻贝 +芥末 +神话 +烤干酪辣味玉米片 +指甲油 +指甲锉 +保姆 +餐巾 +狭窄的 +国旗 +基督诞生的场景 +自然历史博物馆 +自然 +自然保护区 +导航 +九夜节 +海军 +星云 +脖子 +围颈带/领口 +项链 +领口 +花蜜 +油桃 +针状物 +邻居 +与某处邻近的地区 +霓虹灯 +霓虹灯 +神经 +巢 +新年 +新生的 +纽芬兰 +新婚 +新闻 +记者招待会 +报摊 +晚上 +夜市 +夜空 +夜景 +夜总会 +床头柜 +面条 +鼻子 +鼻羁 +注解 +笔记本 +记事本 +信纸 +公告 +数字图标 +修女 +护士 +托儿所 +养老院 +螺母 +胡桃夹子 +橡木 +橡树 +桨 +绿洲 +烘干室 +燕麦片 +燕麦 +方尖塔 +观察塔 +天文台 +超越障碍训练场 +海洋 +章鱼 +提供 +办公室 +办公大楼 +办公椅 +办公室隔间 +办公桌 +办公用品 +办公室的窗户 +军官 +行政官员 +石油 +油灯 +油画 +石油钻台 +秋葵 +老照片 +橄榄 +橄榄油 +橄榄树 +煎蛋卷 +洋葱 +洋葱圈 +蛋白石 +开阔的/张开 +开始 +开幕式 +歌剧 +歌剧院 +操作 +手术室 +操作 +眼镜店 +猩猩 +橙子/橙色 +橙汁 +橙树 +橘园 +轨道 +果园 +乐池 +兰花 +订单 +组织 +折纸 +点缀 +鱼鹰 +鸵鸟 +水獭 +外面的 +露头 +户外 +厕所 +电源插头 +大纲 +椭圆形 +烤箱 +整体 +大衣 +天桥 +猫头鹰 +牡蛎 +橡皮环 +包裹 +包/包装/包裹 +围场 +警车 +挂锁 +肉菜饭 +宝塔 +疼痛 +油漆刷 +画家 +佩斯利印花大手帕 +宫殿 +调色板 +打桩 +棺罩 +棕榈树 +平底锅 +煎饼 +熊猫 +面板 +全景 +三色堇 +喘息 +储藏室 +裤子 +连裤袜 +木瓜 +纸 +纸袋 +切纸机 +纸灯笼 +纸盘子 +纸巾 +平装书 +压纸器 +降落伞 +游行 +天堂 +鹦鹉 +护理人员 +长尾小鹦鹉 +滑翔伞 +伞兵 +羊皮纸 +教区 +公园 +公园长椅 +停车 +停车场 +停车费 +停车标志 +议会 +欧芹/香菜 +参与者 +合作伙伴 +帕特里奇 +聚会 +派对帽 +通过 +通道 +存折 +乘客 +客船 +旅客列车 +百香果 +护照 +面食 +粘贴 +糕点 +牧场 +补丁 +病人 +图案/款式 +人行道/硬路面 +大帐篷 +爪子 +支付 +付费电话 +豌豆 +和平 +桃子 +孔雀 +山峰/尖顶 +花生 +花生酱 +梨 +珍珠 +卵石 +山核桃 +行人 +人行天桥 +步行街 +果皮 +削皮器 +小钉板 +木质腿 +鹈鹕 +笔/围栏 +点球 +铅笔 +铅笔盒 +卷笔刀 +铅笔裙 +吊坠 +钟摆 +企鹅 +半岛 +锦标旗 +便士 +储蓄罐 +牡丹 +胡椒/辣椒 +胡椒研磨机 +胡椒子 +意大利辣香肠 +栖息/鲈鱼 +表演 +性能 +表演舞台 +香水 +绿廊 +波斯猫 +柿子 +个人护理 +个人漂浮装置 +害虫 +宠物 +宠物店 +宠物店 +花瓣 +佩妮 +教堂的长椅 +野鸡 +现象 +哲学家 +电话 +电话簿 +留声机 +照片 +照相亭 +相框 +摄影 +物理学家 +物理实验室 +钢琴家 +钢琴 +选择 +捡起 +泡菜 +野餐 +野餐区 +野餐篮 +野餐桌 +图片 +相框 +馅饼 +鸽子 +朝圣者 +药片 +枕头 +飞行员 +领航艇 +别针 +松树 +松果 +松林 +松子 +菠萝 +乒乓球桌 +乒乓球 +粉色 +一品脱的量 +琵琶 +管子 +管碗 +海盗 +海盗旗 +海盗船 +阿月浑子 +滑雪场 +口袋里的面包 +火龙果 +斗牛犬 +球场 +投手 +猪笼草 +干草叉 +披萨 +披萨刀 +比萨锅 +披萨店 +招牌 +地方 +餐具垫 +格子 +平原 +示意图 +行星 +行星地球 +厚木板 +植物 +种植园 +种植 +匾额 +石膏 +塑料 +橡皮泥 +高原 +平台 +白金 +大浅盘 +玩/演奏/运动 +打羽毛球 +打棒球 +打篮球 +玩台球 +踢足球 +玩乒乓球 +打网球 +打排球 +选手/运动员 +操场 +剧场 +扑克牌 +下棋 +打高尔夫球 +打麻将 +运动场 +护栏 +游戏室 +广场 +钳子 +故事情节 +犁 +插头 +插头帽 +李子 +水管工 +卫生洁具 +羽毛 +夹板 +口袋 +怀表 +随身小折刀 +圆荚体 +乐队指挥台 +诗歌 +一品红 +指/朝向 +指针 +扑克卡 +筹码 +扑克表 +杆/柱 +臭猫 +警察 +警车 +警犬 +警察局 +政治家 +圆点 +花粉 +污染 +马球 +马球领 +马球衬衫 +石榴 +波美拉尼亚的 +雨披 +池塘 +马尾辫 +贵宾犬 +池 +流行 +流行艺术家 +爆米花 +教皇 +罂粟 +瓷 +玄关 +猪肉 +粥 +便携式电池 +门户网站 +投资组合 +汽门 +肖像 +肖像会话 +摆姿势拍照 +负鼠 +帖子 +邮局 +邮票 +明信片 +海报 +海报页 +锅/罐/陶盆 +土豆 +土豆片 +土豆沙拉 +布垫子 +便壶 +袋 +家禽 +英镑 +倾泻 +粉末 +电源线 +电源插头及插座 +权力看 +电站 +练习 +布拉格城堡 +祈祷 +牧师 +首映 +处方 +显示 +演讲 +总统 +新闻发布室 +高压锅 +椒盐卷饼 +王子 +公主 +打印 +打印页面 +打印机 +印刷 +监狱 +农产品/生产 +产品 +职业 +专业的 +教授 +项目图片 +投影屏幕 +投影仪 +毕业舞会 +散步 +螺旋桨 +先知 +建议 +防护服 +抗议 +抗议者 +出版 +宣传画像 +冰上曲棍球 +布丁 +水坑 +泡芙 +角嘴海雀 +哈巴狗 +拉 +讲坛 +脉冲 +泵 +南瓜 +南瓜饼 +南瓜种子 +拳击吊袋 +拳头猛击/穿孔 +学生 +紫色 +推 +轻轻一击 +谜题 +塔 +金字塔 +大蟒 +二维码 +鹌鹑 +采石场 +季度 +石英 +女王 +油炸玉米粉饼 +队列 +乳蛋饼 +被子 +绗缝 +引用 +兔子 +浣熊 +比赛 +赛道 +水沟/跑道 +赛车 +球拍 +雷达 +散热器 +广播 +木筏/橡皮艇 +布娃娃 +栏杆/铁轨 +轨道车 +铁道 +铁路桥梁 +轨道线 +火车站 +雨 +雨靴 +彩虹 +虹鳟鱼 +雨衣 +热带雨林 +多雨的 +葡萄干 +耙子 +公羊 +斜坡 +油菜籽 +快速 +说唱歌手 +树莓 +老鼠 +棘轮 +乌鸦 +峡谷 +雷 +剃须刀 +锋利的 +阅读 +阅读材料 +钻孔器 +后面 +尾灯 +后视图 +后视镜 +收据 +收到 +接待 +配方 +记录 +唱片制作人 +录音机 +录音室 +娱乐室 +休闲车 +矩形 +回收 +回收站 +红色 +红地毯 +红旗 +红熊猫 +红酒 +红木 +芦苇 +礁石 +卷轴 +裁判 +倒影 +倒影 +反射器 +注册 +控制 +驯鹿 +放松 +释放 +救援 +宗教 +宗教的 +享受 +保持 +改造 +遥控器 +移除 +修复 +维修店 +爬行动物 +救援 +救助者 +研究 +研究员 +储层 +住宅 +居民区 +树脂 +度假胜地 +度假小镇 +餐厅的厨房 +餐厅的露台 +厕所 +零售 +寻回犬 +制动火箭 +揭示 +犀牛 +杜鹃 +肋骨 +丝带 +大米 +电饭煲 +稻田 +骑/搭乘 +脊 +骑马 +步枪 +边缘 +环/戒指 +暴乱 +涟漪 +上升 +高层建筑 +河 +河岸 +河船 +河谷 +河床 +路 +路标 +公路旅行 +路边 +烤鸡 +长袍 +罗宾 +机器人 +石头 +岩石拱 +摇滚艺术家 +摇滚乐队 +攀岩者 +攀岩 +摇滚音乐会 +岩石表面 +岩层 +摇滚歌手 +火箭 +摇椅 +岩石 +啮齿动物 +牛仔竞技表演 +竞技舞台 +罗伊 +狍子 +辊 +过山车 +轮式溜冰鞋 +溜冰鞋 +擀面杖 +浪漫 +浪漫的 +屋顶 +屋顶花园 +房间 +房间分频器 +根 +根啤酒 +绳索桥 +念珠 +玫瑰 +迷迭香 
+玫瑰色的云 +罗特韦尔犬 +圆桌 +路由器 +行 +罗文 +皇家 +橡皮图章 +废墟 +魔方 +红宝石 +莱夫 +橄榄球 +橄榄球 +橄榄球运动员 +毁坏 +尺 +朗姆酒 +跑 +跑步者 +跑步鞋 +农村的 +锈 +乡村的 +黑麦 +袋 +鞍 +鞍囊 +旅行 +安全 +安全背心 +圣人 +帆 +帆船 +航行 +水手 +松鼠猴 +缘故 +沙拉 +沙拉碗 +火蜥蜴 +意大利蒜味腊肠 +出售 +三文鱼 +沙龙 +萨尔萨舞 +盐 +盐和胡椒瓶 +盐湖 +盐沼 +盐瓶 +敬礼 +萨莫耶德人 +武士 +沙子 +沙洲 +砂箱 +沙堡 +沙雕 +凉鞋 +三明治 +卫生巾 +圣诞老人 +蓝宝石 +沙丁鱼 +莎丽 +生鱼片 +沙爹 +书包 +卫星 +缎 +酱汁 +碟子 +桑拿 +香肠 +稀树大草原 +锯 +锯木架 +萨克斯管 +萨克斯手 +脚手架 +秤/标尺 +比例模型 +扇贝 +疤痕 +稻草人 +围巾 +场景 +风景 +雪纳瑞犬 +学校 +校车 +校服 +校舍 +纵帆船 +科学 +科幻电影 +科学博物馆 +科学家 +剪刀 +壁灯 +司康饼 +勺子 +踏板车 +分数 +记分板 +蝎子 +童子军 +炒蛋 +废弃 +刮板 +刮伤 +屏幕 +纱门 +截图 +螺杆 +螺丝刀 +长卷纸/卷轴 +擦洗 +硬毛刷 +雕塑家 +雕塑 +海洞穴 +海冰 +海狮 +海龟 +海胆 +尖吻鲈 +海底 +海鸟 +海鲜 +海马 +海豹 +海景 +海贝 +海滨度假胜地 +季节 +座位 +安全带 +海藻 +秘书 +安全 +小轿车 +看到 +种子 +跷跷板 +赛格威 +自拍 +出售 +研讨会 +感觉 +传感器 +服务器 +服务器机房 +服务 +集 +缝纫机 +影子 +摇 +瓶 +洗发水 +形状 +分享 +鲨鱼 +卷笔刀 +记号笔 +剃须刀 +剃须膏 +披肩/围巾 +剪切 +剪刀 +羊 +床单 +乐谱 +架子 +贝壳 +贝类 +避难所 +搁置 +牧羊人 +果子露 +柴犬 +发光 +航运 +集装箱 +海难 +船厂 +衬衫 +赤膊的 +浅滩 +鞋 +鞋盒 +鞋店 +鞋楦 +射击 +得分篮球后卫 +商店橱窗 +门面 +购物者 +购物 +购物袋 +购物篮 +购物车 +购物中心 +购物街 +海岸 +海岸线 +短的 +短发 +短裤 +小酒杯 +散弹枪 +肩膀 +单肩包 +铲 +陈列柜 +淋浴 +浴帽 +浴帘 +淋浴门 +淋浴头 +碎纸机 +泼妇 +虾 +神社 +灌木 +快门 +暹罗猫 +西伯利亚 +兄弟姐妹 +侧面 +边柜 +配菜 +边车 +边线 +壁板 +标志 +指示牌 +信号 +签名 +丝绸 +丝袜 +筒仓 +银 +银牌 +银器 +唱歌 +烧焦 +歌手 +水槽 +啜 +坐/放置/坐落 +坐着 +滑板公园 +滑板 +滑板者 +溜冰者 +溜冰场 +骨架 +草图 +串串 +滑雪 +滑雪靴 +滑雪设备 +滑雪服 +滑雪缆车 +滑雪杖 +滑雪胜地 +滑雪板 +滑雪 +滑雪鞋 +皮肤 +头骨 +无边便帽 +天空 +天空塔 +天窗 +天际线 +摩天大楼 +激流回旋 +石板 +雪橇 +睡眠 +睡袋 +睡衣 +袖子 +片 +滑动 +滑块 +吊索 +坡 +投币口 +老虎机 +树懒 +慢炖锅 +鼻涕虫 +贫民窟 +气味 +微笑 +烟雾/抽烟 +零食 +蜗牛 +蛇 +鲷鱼 +快照 +通气管 +鼻子 +雪 +雪豹 +雪山 +雪球 +单板滑雪者 +雪原 +雪花 +雪人 +雪地摩托 +雪犁 +雪鞋 +雪 +肥皂 +肥皂泡 +给皂器 +足球守门员 +社会名流 +短袜 +插座 +苏打水 +垒球 +软件 +太阳能电池阵列 +士兵 +独奏 +解决方案 +宽边帽 +歌曲 +声音 +汤 +汤碗 +汤匙 +酸奶油 +纪念品 +豆浆 +水疗中心 +空间 +航天飞机 +空间站 +宇宙飞船 +意大利面 +横跨 +扳手 +火花 +闪耀 +烟火 +起泡葡萄酒 +麻雀 +抹刀 +扬声器 +观众 +会话框 +速度限制 +限速标志 +快艇 +车速表 +球 +香料 +调料架 +蜘蛛 +蜘蛛网 +扣球 +旋转 +菠菜 +尖塔 +飞溅 +海绵 +勺子 +体育协会 +运动器材 +运动团队 +体育球 +体育器材 +运动会 +运动服装 +点 +喷雾 +伸展 +春天 +春卷 +撒 +洒水器 +发芽 +云杉 +云杉森林 +队 +广场 +南瓜 +蹲 +挤 +鱿鱼 +松鼠 +水枪 +刺 +稳定的 +(码放整齐的)一叠 +体育场 +工作人员 +舞台 +舞台灯 +驿马车 +弄脏 +不锈钢 +楼梯 +楼梯 +楼梯间 +摊位/小隔间 +种马 +站/矗立/摊位 +站 +主食 +订书机 +星星 +盯着 +海星 +杨桃 +燕八哥 +州立公园 +公立学校 +车站 +固定自行车 +文具 +雕像 +牛排 +牛排刀 +蒸汽 +蒸汽机 +蒸汽机车 +蒸汽火车 +馒头 +钢 +方向盘 +(花草的)茎 +模版 +梯凳 +立体声 +听诊器 +炖 +戳/条状物 +竹节虫 +贴纸 +静物画 +高跷 +黄貂鱼 +搅拌 +搅拌器 +镫 +缝 +股票 +长筒袜 +腹部 +石头建筑 +石雕 +石屋 +石磨 +凳子 +停止 +停在 +红灯 +停车标志 +秒表 +红绿灯 +存储箱 +储藏室 +罐/蓄水池 +商店 +店面 +鹳 +风暴 +暴风云 +狂风暴雨的 +炉子 +扑克 +跨骑 +过滤器 +海峡 +带 +稻草/吸管 +草帽 +草莓 +溪流 +街头艺术 +街头艺术家 +街角 +流浪狗 +街头食品 +路灯 +街市场 +街头摄影 +街景 +路标 +街头小贩 +拉伸 +担架 +罢工 +前锋 +细绳 +芝士条 +带子 +条纹 +漫步 +结构 +工作室 +影棚拍摄 +材料 +填充玩具动物 +毛绒玩具 +馅 +树桩 +惊人的 +特技 +佛塔 +风格 +手写笔 +潜艇 +潜艇形大三明治 +海底水 +郊区 +地铁 +地铁站 +低音炮 +多肉 +绒面革 +糖 +糖碗 +甘蔗 +方糖 +西装 +套房 +夏天 +夏天傍晚 +峰顶 +太阳 +太阳帽 +日光浴 +周日 +日晷 +向日葵 +向日葵田 +葵花籽 +太阳镜 +晴天 +日出 +日落 +遮阳伞 +阳光 +超级碗 +跑车 +超级英雄 +超市 +超市货架 +超模 +支持者 +冲浪 +表面 +冲浪板 +冲浪者 +外科医生 +外科手术 +环绕 +寿司 +寿司吧 +背带裤 +悬架 +吊桥 +越野车 +燕子 +燕尾蝶 +沼泽 +天鹅 +天鹅游艇 +运动裤 +防汗带 +毛衣 +运动衫 +甜的 +红薯 +游泳 +泳帽 +游泳者 +游泳洞 +游泳池 +摆动 +平转桥 +秋千 +漩涡 +开关 +转椅 +剑 +旗鱼 +象征 +对称 +犹太教堂 +注射器 +糖浆 +系统 +t恤 +t恤 +塔巴斯科辣椒酱 +虎斑 +乒乓球拍 +桌面 +桌布 +平板电脑 +餐具 +转速表 +拦截 +墨西哥煎玉米卷 +跆拳道 +太极 +尾巴 +裁缝 +拍/拿 +起飞 +说话/交谈/演讲 +手鼓 +棕褐色 +橘子 +胶带/磁带/终点线 +挂毯 +沥青碎石路面 +芋头 +篷布 +果馅饼 +流苏 +味道 +榻榻米 +纹身 +纹身艺术家 +酒馆 +茶 +茶包 +茶话会 +茶园 +茶壶 +茶具 +教 +老师 +茶杯 +水鸭 +团队合影 +团队介绍 +眼泪 +技术员 +技术 +泰迪熊 +T字形物 +青少年 +电线杆 +变焦镜头 +望远镜 +电视 +电视摄像机 +电视室 +电视演播室 +温度 +寺庙 +天妇罗 +网球 +网球场 +网球比赛 +网球网 +网球运动员 +网球拍 +帐篷 +龙舌兰酒 +终端/航站楼 +阳台 +地形 +玻璃容器 +领土 +测试 +测试赛 +试管 +文本 +短信 +纺织 +纹理 +感恩节 +感恩节晚餐 +剧院 +戏剧演员 +治疗 +温度计 +热水瓶 +暖瓶 +恒温器 +灌木丛 +顶针 +东西 +思考 +蓟 +宝座 +金銮殿 +扔 +抱枕 +雷 +雷雨 +百里香 +皇冠 +记号 +票 +售票亭 +潮池 +领带 +老虎 +紧 +瓦 +瓷砖地板 +瓦屋顶 +瓷砖墙 +锡 +锡纸 +箔 +提拉米苏 +轮胎 +纸巾 +烤面包 +烤面包机 +烟草 +烟斗 +学步的小孩 +脚趾 +豆腐 +马桶 +马桶座圈 +化妆包 +东京铁塔 +番茄 +番茄酱 +番茄汤 +墓 +钳子 +钳子 +工具 +工具箱 +牙刷 +牙膏 +牙签 +修剪成形的花园 +配料 +火炬/光源 +龙卷风 +玉米粉圆饼 +乌龟 +大手提袋 +图腾柱 +龙猫 +巨嘴鸟 +触摸 +触地 +旅行 +旅游巴士 +导游 +游客 +旅游景点 +锦标赛 +拖车 +毛巾 +毛巾杆 +大厦 +塔桥 +小镇 +城镇广场 +玩具 +玩具车 
+玩具枪 +玩具店 +跑道 +拖拉机 +贸易 +传统 +传统的 +交通 +锥形交通路标 +交通拥堵 +交通堵塞 +交通标志 +小道 +预告片 +拖车 +火车 +火车桥 +火车车厢 +火车内部 +火车轨道 +火车窗口 +教练 +训练 +训练长椅 +训练场 +电车/手推车 +蹦床 +变形金刚 +透明度 +旅行 +托盘/碟子 +跑步机 +款待 +树 +树枝 +林场 +树蛙 +树屋 +树根 +树干 +试验 +三角形 +铁人三项 +部落 +支流 +戏法 +三轮车 +修剪 +三人组 +三脚架 +长号 +部队 +奖杯 +奖杯 +热带 +鳟鱼 +卡车 +卡车司机 +浴缸 +管子 +拖船 +郁金香 +金枪鱼 +苔原 +隧道 +涡轮 +火鸡 +转动 +芜菁 +绿松石 +炮塔 +乌龟 +獠牙 +电视演员 +电视柜 +电视剧 +电视节目类型 +电视名人 +电视节目 +情景喜剧 +电视塔 +枝条 +黄昏 +双胞胎 +麻线 +扭 +类型 +键入 +打字机 +尤克里里 +奥特曼 +伞 +内衣 +水下 +独角兽 +制服 +宇宙 +大学 +向上 +城市 +尿壶 +瓮 +使用 +用具 +杂物间 +吸尘器/真空 +谷 +阀门 +吸血鬼 +货车 +香草 +虚荣 +种类 +花瓶/瓶 +金库 +矢量卡通插图 +矢量图标 +蔬菜 +菜园 +蔬菜市场 +植被 +车辆 +面纱 +静脉 +天鹅绒 +自动售货机 +小贩 +通风孔 +胡蜂属 +船 +背心 +兽医 +经验丰富的 +兽医办公室 +高架桥 +视频 +摄像机 +电子游戏 +录像带 +视镜 +守夜 +别墅 +村庄 +藤蔓 +醋 +葡萄园 +暴力 +紫罗兰色 +小提琴 +小提琴家 +中提琴演奏者 +愿景 +遮阳板 +伏特加 +火山 +排球 +排球场 +排球运动员 +志愿者 +航行 +秃鹰 +华夫饼干 +华夫饼机 +货车 +马车车轮 +腰 +服务员 +候机室 +等候室 +走 +步行 +手杖 +挂钟 +壁纸 +核桃 +海象 +战争 +仓库 +温暖的 +警告标志 +战士 +军舰 +疣猪 +洗 +洗衣机/垫圈 +洗 +洗衣机 +黄蜂 +浪费 +废物容器 +手表 +水 +水鸟 +水牛 +水冷却器 +水滴 +水景 +热水器 +水位 +荷花 +水上乐园 +水管 +净水器 +滑水板 +水上运动 +水面 +水塔 +水彩 +水彩插图 +水彩画 +瀑布 +喷壶 +水印叠加图章 +西瓜 +防水外套 +水路 +波浪 +蜡 +武器 +穿着 +天气 +叶片 +网 +摄像头 +婚礼 +结婚戒指 +婚礼花束 +结婚蛋糕 +新婚夫妇 +婚礼请柬 +婚礼派对 +婚纱照 +婚礼摄影师 +婚纱摄影 +婚宴 +楔 +杂草 +重量 +体重秤 +焊接工 +井 +西餐 +西餐厅 +湿 +吧台 +潜水衣 +湿地 +潜水服 +鲸鱼 +鲸鲨 +小麦 +麦田 +车轮 +轮椅 +后轮支撑车技 +生奶油 +搅拌器 +胡须 +威士忌 +哨子 +白色 +白宫 +白葡萄酒 +白板 +便门 +宽的 +挥动 +假发 +Wii +Wii手柄 +荒野 +角马 +野火 +野花 +野生动物 +柳树 +风 +风铃 +风电场 +风力涡轮机 +风车 +窗户 +窗台花盆箱 +橱窗展示 +窗框 +纱窗 +靠窗的座位 +窗台 +雨刮器 +挡风玻璃 +有风的 +酒瓶 +冷酒器 +酒柜 +酒窖 +酒杯 +酒架 +品酒 +酒庄 +翅膀 +冬天 +冬瓜 +冬天的早晨 +冬季场景 +冬季运动 +冬季风暴 +电线 +紫藤 +巫婆 +女巫帽子 +炒锅 +狼 +女人 +木头 +林鸳鸯 +木地板 +木墙 +烧木炉 +木匙 +林地 +啄木鸟 +木工刨 +羊毛 +工作 +练习卡 +工作台 +工人 +工作场所 +车间 +世界 +蠕虫 +敬拜 +伤口 +包 +裹身裙 +包装纸 +搏斗 +摔跤手 +皱纹 +腕带 +写 +作家 +手写/字迹 +毛笔 +写字桌 +游艇 +牦牛 +院子 +黄色 +瑜伽 +瑜伽垫 +酸奶 +轭 +蛋黄 +青年 +青年旅馆 +蒙古包 +斑马 +斑马线 +禅意花园 +拉链 +拉链 +僵尸 +粽子 +动物园 diff --git a/fengshen/models/Lyrics/ram/data/ram_tag_list_threshold.py b/fengshen/models/Lyrics/ram/data/ram_tag_list_threshold.py new file mode 100644 index 0000000..1583295 --- /dev/null +++ b/fengshen/models/Lyrics/ram/data/ram_tag_list_threshold.py @@ -0,0 +1,4585 @@ +ram_class_threshold = [0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.71, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.61, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.82, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.85, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.89, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.78, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.9, +0.65, +0.83, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.79, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.79, +0.65, +0.63, +0.65, +0.87, +0.8, +0.46, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.84, +0.65, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.81, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.77, +0.87, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.68, +0.65, +0.8, +0.65, +0.65, +0.75, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.8, +0.8, +0.79, +0.65, +0.85, +0.65, +0.65, +0.65, +0.9, +0.65, +0.89, +0.8, +0.65, +0.65, +0.65, +0.76, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.89, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.8, +0.8, +0.9, +0.65, +0.85, +0.8, +0.8, +0.8, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.63, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.71, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.71, +0.65, +0.8, +0.76, +0.85, +0.8, +0.65, +0.65, +0.8, +0.65, +0.79, +0.65, +0.75, +0.65, +0.8, +0.65, +0.86, +0.65, +0.65, +0.9, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.73, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.8, +0.75, +0.65, +0.65, +0.65, +0.65, +0.8, +0.85, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.6, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.74, +0.65, 
+0.65, +0.67, +0.65, +0.65, +0.8, +0.65, +0.65, +0.85, +0.65, +0.8, +0.65, +0.65, +0.84, +0.8, +0.8, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.89, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.65, +0.65, +0.6, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.74, +0.65, +0.65, +0.66, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.8, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.9, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.75, +0.65, +0.7, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.82, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.87, +0.65, +0.66, +0.65, +0.84, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.5, +0.65, +0.64, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, 
+0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.85, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.73, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.82, +0.8, +0.65, +0.65, +0.65, +0.84, +0.9, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.64, +0.65, +0.65, +0.65, +0.8, +0.8, +0.87, +0.65, +0.65, +0.78, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.74, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.89, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.8, +0.84, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.81, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.87, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.73, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.89, +0.8, +0.65, +0.9, +0.65, +1, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.89, +0.89, +0.65, +0.65, +0.65, +0.8, +0.75, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.88, +0.65, +0.8, +0.65, +0.65, +0.8, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.9, +0.57, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.8, +0.79, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.89, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.81, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.84, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.8, +0.83, +0.65, +0.65, +0.8, +0.65, +0.65, +0.72, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.69, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.9, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.85, +0.65, +0.65, +0.8, +0.65, +0.89, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.86, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.75, +0.8, +0.65, +0.8, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.82, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.92, +0.89, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.87, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.83, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.8, +0.82, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.64, +0.65, +0.65, +0.63, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.76, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.87, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.89, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.73, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.9, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.69, +0.65, +0.65, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.72, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.9, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.45, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.51, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.66, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, 
+0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.66, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.65, +0.85, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.79, +0.75, +0.65, +0.65, +0.8, +0.65, +0.67, +0.8, +0.8, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.8, +0.65, +0.65, +0.9, +0.65, +0.79, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.74, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.6, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.89, +0.8, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.9, +0.75, +0.65, +0.65, +0.65, +0.8, +0.6, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.85, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.63, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.8, +0.65, +0.81, +0.8, +0.8, +0.8, +0.82, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.88, +0.65, +0.8, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +1, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.74, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.9, +0.86, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.64, +0.65, +0.65, +0.8, +0.8, +0.65, +0.87, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.8, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.71, +0.65, +0.65, +0.65, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.73, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.75, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.88, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.8, +0.65, +0.65, +0.9, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.7, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.57, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.76, +1, +0.8, +0.65, +0.65, +0.58, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +1, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.8, +0.9, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.68, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.99, +0.8, +0.77, +0.65, +0.9, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.8, +0.65, +0.7, +0.65, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.77, +0.65, +0.65, +0.65, +0.65, +0.79, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.52, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.86, +0.65, +0.65, +0.8, +0.56, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.72, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.8, +0.6, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.89, +0.85, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.87, +0.65, 
+0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.75, +0.65, +0.65, +0.65, +0.65, +0.54, +1, +0.65, +0.65, +0.75, +0.65, +0.75, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.9, +0.62, +0.65, +0.65, +0.65, +0.65, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.82, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.74, +0.8, +0.65, +0.8, +0.8, +0.7, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.8, +0.8, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.8, +0.8, +0.84, +0.8, +0.65, +0.65, +0.8, +0.75, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.82, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.84, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.8, +0.65, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.74, +0.65, +0.8, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.9, +0.9, +0.65, +0.65, +0.65, +0.63, +0.82, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.65, +0.74, +0.9, +0.65, +0.8, +0.65, +0.65, +0.58, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.75, +0.65, +0.65, +0.8, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.87, +0.65, +0.65, +0.65, +0.8, +0.65, +0.64, +0.65, +0.65, +0.65, +0.8, +0.87, +0.65, +0.65, +0.8, +0.9, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.83, +0.65, +0.65, +0.8, +0.65, +0.9, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.78, +0.65, +0.8, +0.65, +0.9, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.9, +0.65, +0.88, +0.8, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.77, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +1, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.85, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.88, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.65, +0.65, +0.65, +0.65, +0.68, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.9, +0.65, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.81, +0.65, +0.65, +0.65, 
+0.8, +0.85, +0.65, +0.77, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.8, +0.8, +0.9, +0.65, +0.65, +0.89, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.8, +0.65, +0.65, +0.65, +0.88, +0.8, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.82, +0.65, +0.8, +0.74, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.85, +0.65, +0.65, +0.85, +0.65, +0.65, +0.65, +0.65, +0.7, +0.7, +0.8, +0.65, +0.65, +0.65, +0.65, +0.87, +0.8, +0.65, +0.65, +0.65, +0.89, +0.85, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.7, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.9, +0.8, +0.8, +0.65, +0.66, +0.57, +0.65, +0.65, +0.65, +0.49, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.65, +0.65, +0.65, +0.8, +0.65, +0.8, +0.8, +0.86, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.89, +0.65, +0.65, +0.65, +0.65, +0.65, +0.65, +0.76] diff --git a/fengshen/models/Lyrics/ram/models/bert.py b/fengshen/models/Lyrics/ram/models/bert.py new file mode 100644 index 0000000..cb90b79 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/bert.py @@ -0,0 +1,1035 @@ +''' + * Copyright (c) 2022, salesforce.com, inc. + * All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on huggingface code base + * https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/models/bert +''' + +import math +import os +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +from torch import Tensor, device, dtype, nn +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss +import torch.nn.functional as F + +from transformers.activations import ACT2FN +from transformers.file_utils import ( + ModelOutput, +) +from transformers.modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from transformers.modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from transformers.utils import logging +from transformers.models.bert.configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + + +class BertEmbeddings_nopos(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + # self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and 
exported when serialized + # self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + # self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # if position_ids is None: + # position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + # if self.position_embedding_type == "absolute": + # position_embeddings = self.position_embeddings(position_ids) + # # print('add position_embeddings!!!!') + # embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + # print('add position_embeddings!!!!') + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_width, self.all_head_size) + 
self.value = nn.Linear(config.encoder_width, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + # print(self.key.weight.shape) + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # compatible with higher versions of transformers + if key_layer.shape[0] > query_layer.shape[0]: + key_layer = key_layer[:query_layer.shape[0], :, :, :] + attention_mask = attention_mask[:query_layer.shape[0], :, :] + value_layer = value_layer[:query_layer.shape[0], :, :, :] + + # Take the dot product between "query" and "key" to get the raw attention scores. 
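+        # (Shapes: query/key are [batch, heads, seq_len, head_dim], so the resulting scores are [batch, heads, query_len, key_len].)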
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BertSelfAttention(config, is_cross_attention) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = 
self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.layer_num = layer_num + if self.config.add_cross_attention: + self.crossattention = BertAttention(config, is_cross_attention=self.config.add_cross_attention) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + mode=None, + ): + + if mode == 'tagging': + + assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" + + cross_attention_outputs = self.crossattention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + present_key_value = cross_attention_outputs[-1] + + else: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if mode=='multimodal': + assert encoder_hidden_states is not None, "encoder_hidden_states must be given for cross-attention layers" + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config,i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + mode='multimodal', + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if 
use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + if use_cache: + logger.warn( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, past_key_value, output_attentions) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(layer_module), + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + mode=mode, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + mode=mode, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BertModel(BertPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in `Attention is + all you need <https://arxiv.org/abs/1706.03762>`__ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + To be used in a Seq2Seq model, the model needs to be initialized with both the :obj:`is_decoder` + argument and :obj:`add_cross_attention` set to :obj:`True`; an :obj:`encoder_hidden_states` is then expected as an + input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.init_weights() + + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + + def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (:obj:`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (:obj:`Tuple[int]`): + The shape of the input to the model. + device: (:obj:`torch.device`): + The device of the input to the model. + + Returns: + :obj:`torch.Tensor` The extended attention mask, with the same dtype as :obj:`attention_mask.dtype`. 
+ """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + # causal and attention masks must have same type with pytorch version < 1.3 + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + is_decoder=False, + mode='multimodal', + ): + r""" + encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. 
+ If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids` + (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)` + instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`. + use_cache (:obj:`bool`, `optional`): + If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up + decoding (see :obj:`past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
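+        # For a 2D padding mask this call produces a (batch_size, 1, 1, seq_length) additive mask
+        # (0. for visible tokens, -10000. for padding); when is_decoder=True it is further combined
+        # with a lower-triangular causal mask into (batch_size, 1, seq_length, seq_length).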
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, + device, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if type(encoder_hidden_states) == list: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if type(encoder_attention_mask) == list: + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + mode=mode, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertLMHeadModel(BertPreTrainedModel): + + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + self.init_weights() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + def forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + 
return_logits=False,
+        is_decoder=True,
+        reduction='mean',
+        mode='multimodal',
+    ):
+        r"""
+        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are
+            ignored (masked); the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
+            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
+            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
+        use_cache (:obj:`bool`, `optional`):
+            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
+            decoding (see :obj:`past_key_values`).
+ Returns: + Example:: + >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig + >>> import torch + >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased') + >>> config = BertConfig.from_pretrained("bert-base-cased") + >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config) + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + >>> prediction_logits = outputs.logits + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + mode=mode, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + # sequence_output.shape torch.Size([85, 30, 768]) + # prediction_scores.shape torch.Size([85, 30, 30524]) + # labels.shape torch.Size([85, 30]) + + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous() + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1) + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + if reduction=='none': + lm_loss = lm_loss.view(prediction_scores.size(0),-1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past is used + if past is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past, beam_idx): + reordered_past = () + for layer_past in past: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + diff --git a/fengshen/models/Lyrics/ram/models/ram.py b/fengshen/models/Lyrics/ram/models/ram.py new file mode 100644 index 0000000..dd33951 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/ram.py @@ -0,0 +1,212 @@ +''' + * The Recognize Anything Model (RAM) & Tag2Text Model + * Written by Xinyu Huang +''' +import numpy as np +import json +import torch +import 
warnings
+import sys
+from torch import nn
+from fengshen.models.groundedblip.ram.models.bert import BertConfig, BertModel, BertLMHeadModel
+from fengshen.models.groundedblip.ram.models.swin_transformer import SwinTransformer
+from fengshen.models.groundedblip.ram.data.ram_tag_list_threshold import ram_class_threshold
+
+from fengshen.models.groundedblip.ram.models.utils import *
+
+warnings.filterwarnings("ignore")
+
+
+class RAM(nn.Module):
+    def __init__(self,
+                 args):
+        r""" The Recognize Anything Model (RAM) inference module.
+        RAM is a strong image tagging model, which can recognize any common category with high accuracy.
+        Described in the paper "Recognize Anything: A Strong Image Tagging Model" https://recognize-anything.github.io/
+
+        Args:
+            med_config (str): path for the mixture of encoder-decoder model's configuration file
+            image_size (int): input image size
+            vit (str): model size of vision transformer
+            threshold (int): tagging threshold
+            delete_tag_index (list): delete some tags that may disturb captioning
+        """
+        super().__init__()
+
+        # create image encoder
+
+        if args.vit == 'swin_l':
+            assert args.image_size == args.image_res
+            # assert config['patch_size'] == 32
+            vision_width = args.vision_width
+
+            self.visual_encoder = SwinTransformer(
+                img_size=args.image_res,
+                patch_size=4,
+                in_chans=3,
+                embed_dim=args.embed_dim,
+                depths=args.depths,
+                num_heads=args.num_heads,
+                window_size=args.window_size,
+                mlp_ratio=4.,
+                qkv_bias=True,
+                drop_rate=0.0,
+                drop_path_rate=0.1,
+                ape=False,
+                patch_norm=True,
+                use_checkpoint=False)
+
+        # create tokenizer
+        self.tokenizer = init_tokenizer()
+
+        # Tag2Text employs an encoder-decoder architecture for image-tag-text generation: image-tag interaction encoder and image-tag-text decoder
+        # create image-tag interaction encoder
+        encoder_config = BertConfig.from_json_file(args.med_config)
+        encoder_config.encoder_width = 512
+        self.tag_encoder = BertModel(config=encoder_config,
+                                     add_pooling_layer=False)
+
+        # create image-tag-text decoder
+        decoder_config = BertConfig.from_json_file(args.med_config)
+        self.text_decoder = BertLMHeadModel(config=decoder_config)
+
+        self.delete_tag_index = args.delete_tag_index
+        self.prompt = args.prompt
+        self.prompt_length = len(self.tokenizer(self.prompt).input_ids) - 1
+
+        # load tag list
+        self.tag_list = self.load_tag_list(args.tag_list)
+        self.tag_list_chinese = self.load_tag_list(args.tag_list_chinese)
+
+        # create image-tag recognition decoder
+        self.threshold = args.threshold
+        self.num_class = len(self.tag_list)
+        q2l_config = BertConfig.from_json_file(args.q2l_config)
+        q2l_config.encoder_width = 512
+        self.tagging_head = BertModel(config=q2l_config,
+                                      add_pooling_layer=False)
+        self.tagging_head.resize_token_embeddings(len(self.tokenizer))
+        # note: this embedding is replaced by the nn.Parameter assignment further below
+        self.label_embed = nn.Embedding(self.num_class, q2l_config.hidden_size)
+
+        if q2l_config.hidden_size != 512:
+            self.wordvec_proj = nn.Linear(512, q2l_config.hidden_size)
+        else:
+            self.wordvec_proj = nn.Identity()
+
+        self.fc = nn.Linear(q2l_config.hidden_size, 1)
+
+        self.del_selfattention()
+
+        # share weights of the lowest 2-layer of "image-tag interaction encoder" with the "image-tag recognition decoder"
+        tie_encoder_decoder_weights(self.tag_encoder, self.tagging_head, '',
+                                    ' ')
+        self.image_proj = nn.Linear(vision_width, 512)
+        # self.label_embed = nn.Parameter(torch.load(args.textual_label_embedding_path,map_location='cpu').float())
+        self.label_embed = nn.Parameter(torch.zeros(4585, 512, dtype=torch.float))  # 4585 is the number of tag classes
+        # adjust thresholds for
some tags + self.class_threshold = torch.ones(self.num_class) * self.threshold + for key,value in enumerate(ram_class_threshold): + self.class_threshold[key] = value + + def load_tag_list(self, tag_list_file): + with open(tag_list_file, 'r', encoding="utf-8") as f: + tag_list = f.read().splitlines() + tag_list = np.array(tag_list) + return tag_list + + # delete self-attention layer of image-tag recognition decoder to reduce computation, follower Query2Label + def del_selfattention(self): + del self.tagging_head.embeddings + for layer in self.tagging_head.encoder.layer: + del layer.attention + + def generate_tag(self, + image, + threshold=0.68, + tag_input=None, + ): + + label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) + + image_embeds = self.image_proj(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], + dtype=torch.long).to(image.device) + + # recognized image tags using image-tag recogntiion decoder + image_cls_embeds = image_embeds[:, 0, :] + image_spatial_embeds = image_embeds[:, 1:, :] + + bs = image_spatial_embeds.shape[0] + label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) + tagging_embed = self.tagging_head( + encoder_embeds=label_embed, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False, + mode='tagging', + ) + + logits = self.fc(tagging_embed[0]).squeeze(-1) + + targets = torch.where( + torch.sigmoid(logits) > self.class_threshold.to(image.device), + torch.tensor(1.0).to(image.device), + torch.zeros(self.num_class).to(image.device)) + + tag = targets.cpu().numpy() + tag[:,self.delete_tag_index] = 0 + tag_output = [] + tag_output_chinese = [] + for b in range(bs): + index = np.argwhere(tag[b] == 1) + token = self.tag_list[index].squeeze(axis=1) + tag_output.append(' | '.join(token)) + token_chinese = self.tag_list_chinese[index].squeeze(axis=1) + tag_output_chinese.append(' | '.join(token_chinese)) + + + return tag_output, tag_output_chinese + + def generate_tag_zeroshot(self, + image, + threshold=0.68, + tag_input=None, + ): + + label_embed = torch.nn.functional.relu(self.wordvec_proj(self.label_embed)) + + image_embeds = self.image_proj(self.visual_encoder(image)) + image_atts = torch.ones(image_embeds.size()[:-1], + dtype=torch.long).to(image.device) + + # recognized image tags using image-tag recogntiion decoder + image_cls_embeds = image_embeds[:, 0, :] + image_spatial_embeds = image_embeds[:, 1:, :] + + bs = image_spatial_embeds.shape[0] + label_embed = label_embed.unsqueeze(0).repeat(bs, 1, 1) + tagging_embed = self.tagging_head( + encoder_embeds=label_embed, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=False, + mode='tagging', + ) + + logits = self.fc(tagging_embed[0]).squeeze(-1) + + targets = torch.where( + torch.sigmoid(logits) > self.class_threshold.to(image.device), + torch.tensor(1.0).to(image.device), + torch.zeros(self.num_class).to(image.device)) + + tag = targets.cpu().numpy() + tag[:,self.delete_tag_index] = 0 + tag_output = [] + for b in range(bs): + index = np.argwhere(tag[b] == 1) + token = self.tag_list[index].squeeze(axis=1) + tag_output.append(' | '.join(token)) + + return tag_output + diff --git a/fengshen/models/Lyrics/ram/models/swin_transformer.py b/fengshen/models/Lyrics/ram/models/swin_transformer.py new file mode 100644 index 0000000..c1affc9 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/swin_transformer.py @@ -0,0 +1,654 @@ +# -------------------------------------------------------- +# Swin 
Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- + +import numpy as np +from scipy import interpolate + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint +from timm.models.layers import DropPath, to_2tuple, trunc_normal_ + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def extra_repr(self) -> str: + return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}' + + def flops(self, N): + # calculate flops for 1 window with token length of N + flops = 0 + # qkv = self.qkv(x) + flops += N * self.dim * 3 * self.dim + # attn = (q @ k.transpose(-2, -1)) + flops += self.num_heads * N * (self.dim // self.num_heads) * N + # x = (attn @ v) + flops += self.num_heads * N * N * (self.dim // self.num_heads) + # x = self.proj(x) + flops += N * self.dim * self.dim + return flops + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. 
+ window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = 
window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \ + f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}" + + def flops(self): + flops = 0 + H, W = self.input_resolution + # norm1 + flops += self.dim * H * W + # W-MSA/SW-MSA + nW = H * W / self.window_size / self.window_size + flops += nW * self.attn.flops(self.window_size * self.window_size) + # mlp + flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio + # norm2 + flops += self.dim * H * W + return flops + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + assert H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even." + + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + def extra_repr(self) -> str: + return f"input_resolution={self.input_resolution}, dim={self.dim}" + + def flops(self): + H, W = self.input_resolution + flops = H * W * self.dim + flops += (H // 2) * (W // 2) * 4 * self.dim * 2 * self.dim + return flops + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__(self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None, use_checkpoint=False): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock(dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + def flops(self): + flops = 0 + for blk in self.blocks: + flops += blk.flops() + if self.downsample is not None: + flops += self.downsample.flops() + return flops + + +class PatchEmbed(nn.Module): + r""" Image to Patch Embedding + + Args: + img_size (int): Image size. Default: 224. + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [img_size[0] // patch_size[0], img_size[1] // patch_size[1]] + self.img_size = img_size + self.patch_size = patch_size + self.patches_resolution = patches_resolution + self.num_patches = patches_resolution[0] * patches_resolution[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).transpose(1, 2) # B Ph*Pw C + if self.norm is not None: + x = self.norm(x) + return x + + def flops(self): + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. 
Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. Default: None + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + """ + + def __init__(self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, + embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], + window_size=7, mlp_ratio=4., qkv_bias=True, qk_scale=None, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + use_checkpoint=False, **kwargs): + super().__init__() + + self.num_classes = num_classes + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + self.mlp_ratio = mlp_ratio + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # absolute position embedding + if self.ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer(dim=int(embed_dim * 2 ** i_layer), + input_resolution=(patches_resolution[0] // (2 ** i_layer), + patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.avgpool = nn.AdaptiveAvgPool1d(1) + # self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) 
+ nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'absolute_pos_embed'} + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {'relative_position_bias_table'} + + def forward(self, x, idx_to_group_img=None, image_atts=None, **kwargs): + x = self.patch_embed(x) + if self.ape: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + + x_cls = self.avgpool(x.transpose(1, 2)) # B C 1 + + if idx_to_group_img is None: + return torch.cat([x_cls.transpose(1, 2), x], dim=1) + else: + x_bs = torch.gather(x, dim=0, index=idx_to_group_img.view(-1, 1, 1).expand(-1, x.shape[1], x.shape[2])) + weights = image_atts[:, 1:].unsqueeze(2) # B L 1 + x_bs_cls = torch.sum((weights * x_bs).transpose(1, 2), dim=-1, keepdim=True) # B C 1 + x_bs_cls = x_bs_cls / torch.sum(weights.transpose(1, 2), dim=-1, keepdim=True) # avgpool + + return torch.cat([x_bs_cls.transpose(1, 2), x_bs], dim=1), \ + torch.cat([x_cls.transpose(1, 2), x], dim=1) + + def flops(self): + flops = 0 + flops += self.patch_embed.flops() + for i, layer in enumerate(self.layers): + flops += layer.flops() + flops += self.num_features * self.patches_resolution[0] * self.patches_resolution[1] // (2 ** self.num_layers) + flops += self.num_features * self.num_classes + return flops + + +def interpolate_relative_pos_embed(rel_pos_bias, dst_num_pos, param_name=''): + # from: https://github.com/microsoft/unilm/blob/8a0a1c1f4e7326938ea7580a00d56d7f17d65612/beit/run_class_finetuning.py#L348 + + # rel_pos_bias: relative_position_bias_table + src_num_pos, num_attn_heads = rel_pos_bias.size() + + num_extra_tokens = 0 + src_size = int((src_num_pos - num_extra_tokens) ** 0.5) + dst_size = int((dst_num_pos - num_extra_tokens) ** 0.5) + if src_size != dst_size: + print("Position interpolate %s from %dx%d to %dx%d" % (param_name, src_size, src_size, dst_size, dst_size)) + + # extra_tokens = rel_pos_bias[-num_extra_tokens:, :] + # rel_pos_bias = rel_pos_bias[:-num_extra_tokens, :] + + def geometric_progression(a, r, n): + return a * (1.0 - r ** n) / (1.0 - r) + + left, right = 1.01, 1.5 + while right - left > 1e-6: + q = (left + right) / 2.0 + gp = geometric_progression(1, q, src_size // 2) + if gp > dst_size // 2: + right = q + else: + left = q + + # if q > 1.090307: + # q = 1.090307 + + dis = [] + cur = 1 + for i in range(src_size // 2): + dis.append(cur) + cur += q ** (i + 1) + + r_ids = [-_ for _ in reversed(dis)] + + x = r_ids + [0] + dis + y = r_ids + [0] + dis + + t = dst_size // 2.0 + dx = np.arange(-t, t + 0.1, 1.0) + dy = np.arange(-t, t + 0.1, 1.0) + + # print("Original positions = %s" % str(x)) + # print("Target positions = %s" % str(dx)) + + all_rel_pos_bias = [] + + for i in range(num_attn_heads): + z = rel_pos_bias[:, i].view(src_size, src_size).float().numpy() + f = interpolate.interp2d(x, y, z, kind='cubic') + all_rel_pos_bias.append( + torch.Tensor(f(dx, dy)).contiguous().view(-1, 1).to(rel_pos_bias.device)) + + rel_pos_bias = torch.cat(all_rel_pos_bias, dim=-1) + + return rel_pos_bias \ No newline at end of file diff --git a/fengshen/models/Lyrics/ram/models/utils.py b/fengshen/models/Lyrics/ram/models/utils.py new file mode 100644 index 0000000..4b445d4 --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/utils.py @@ -0,0 +1,282 @@ +import os +import json +import torch +import math + +from torch import nn +from typing import List +from transformers import BertTokenizer +from urllib.parse import urlparse +from 
timm.models.hub import download_cached_file +from fengshen.models.groundedblip.ram.models.vit import interpolate_pos_embed, VisionTransformer +from fengshen.models.groundedblip.ram.models.swin_transformer import interpolate_relative_pos_embed +from pathlib import Path +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +CONFIG_PATH=(Path(__file__).resolve().parents[1]) + +def read_json(rpath): + with open(rpath, 'r') as f: + return json.load(f) + + +def tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, + base_model_prefix: str, skip_key: str): + uninitialized_encoder_weights: List[str] = [] + if decoder.__class__ != encoder.__class__: + logger.info( + f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder weights are correctly initialized." + ) + + def tie_encoder_to_decoder_recursively( + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + uninitialized_encoder_weights: List[str], + skip_key: str, + depth=0, + ): + assert isinstance(decoder_pointer, nn.Module) and isinstance( + encoder_pointer, nn.Module + ), f"{decoder_pointer} and {encoder_pointer} have to be of type torch.nn.Module" + if hasattr(decoder_pointer, "weight") and skip_key not in module_name: + assert hasattr(encoder_pointer, "weight") + encoder_pointer.weight = decoder_pointer.weight + if hasattr(decoder_pointer, "bias"): + assert hasattr(encoder_pointer, "bias") + encoder_pointer.bias = decoder_pointer.bias + print(module_name + ' is tied') + return + + encoder_modules = encoder_pointer._modules + decoder_modules = decoder_pointer._modules + if len(decoder_modules) > 0: + assert ( + len(encoder_modules) > 0 + ), f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" + + all_encoder_weights = set([ + module_name + "/" + sub_name + for sub_name in encoder_modules.keys() + ]) + encoder_layer_pos = 0 + for name, module in decoder_modules.items(): + if name.isdigit(): + encoder_name = str(int(name) + encoder_layer_pos) + decoder_name = name + if not isinstance( + decoder_modules[decoder_name], + type(encoder_modules[encoder_name])) and len( + encoder_modules) != len(decoder_modules): + # this can happen if the name corresponds to the position in a list module list of layers + # in this case the decoder has added a cross-attention that the encoder does not have + # thus skip this step and subtract one layer pos from encoder + encoder_layer_pos -= 1 + continue + elif name not in encoder_modules: + continue + elif depth > 500: + raise ValueError( + "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is a circular dependency between two or more `nn.Modules` of your model." 
+ ) + else: + decoder_name = encoder_name = name + tie_encoder_to_decoder_recursively( + decoder_modules[decoder_name], + encoder_modules[encoder_name], + module_name + "/" + name, + uninitialized_encoder_weights, + skip_key, + depth=depth + 1, + ) + all_encoder_weights.remove(module_name + "/" + encoder_name) + + uninitialized_encoder_weights += list(all_encoder_weights) + + # tie weights recursively + tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, + uninitialized_encoder_weights, skip_key) + + +class GroupWiseLinear(nn.Module): + # could be changed to: + # output = torch.einsum('ijk,zjk->ij', x, self.W) + # or output = torch.einsum('ijk,jk->ij', x, self.W[0]) + def __init__(self, num_class, hidden_dim, bias=True): + super().__init__() + self.num_class = num_class + self.hidden_dim = hidden_dim + self.bias = bias + + self.W = nn.Parameter(torch.Tensor(1, num_class, hidden_dim)) + if bias: + self.b = nn.Parameter(torch.Tensor(1, num_class)) + self.reset_parameters() + + def reset_parameters(self): + stdv = 1. / math.sqrt(self.W.size(2)) + for i in range(self.num_class): + self.W[0][i].data.uniform_(-stdv, stdv) + if self.bias: + for i in range(self.num_class): + self.b[0][i].data.uniform_(-stdv, stdv) + + def forward(self, x): + # x: B,K,d + x = (self.W * x).sum(-1) + if self.bias: + x = x + self.b + return x + + +def init_tokenizer(): + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + tokenizer.add_special_tokens({'bos_token': '[DEC]'}) + tokenizer.add_special_tokens({'additional_special_tokens': ['[ENC]']}) + tokenizer.enc_token_id = tokenizer.additional_special_tokens_ids[0] + return tokenizer + + +def create_vit(vit, + image_size, + use_grad_checkpointing=False, + ckpt_layer=0, + drop_path_rate=0): + + assert vit in ['base', 'large'], "vit parameter must be base or large" + if vit == 'base': + vision_width = 768 + visual_encoder = VisionTransformer( + img_size=image_size, + patch_size=16, + embed_dim=vision_width, + depth=12, + num_heads=12, + use_grad_checkpointing=use_grad_checkpointing, + ckpt_layer=ckpt_layer, + drop_path_rate=0 or drop_path_rate) + elif vit == 'large': + vision_width = 1024 + visual_encoder = VisionTransformer( + img_size=image_size, + patch_size=16, + embed_dim=vision_width, + depth=24, + num_heads=16, + use_grad_checkpointing=use_grad_checkpointing, + ckpt_layer=ckpt_layer, + drop_path_rate=0.1 or drop_path_rate) + return visual_encoder, vision_width + + +def is_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ("http", "https") + + +def load_checkpoint(model, url_or_filename): + if is_url(url_or_filename): + cached_file = download_cached_file(url_or_filename, + check_hash=False, + progress=True) + checkpoint = torch.load(cached_file, map_location='cpu') + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location='cpu') + else: + raise RuntimeError('checkpoint url or path is invalid') + + state_dict = checkpoint['model'] + + state_dict['visual_encoder.pos_embed'] = interpolate_pos_embed( + state_dict['visual_encoder.pos_embed'], model.visual_encoder) + if 'visual_encoder_m.pos_embed' in model.state_dict().keys(): + state_dict['visual_encoder_m.pos_embed'] = interpolate_pos_embed( + state_dict['visual_encoder_m.pos_embed'], model.visual_encoder_m) + for key in model.state_dict().keys(): + if key in state_dict.keys(): + if state_dict[key].shape != model.state_dict()[key].shape: + del state_dict[key] + + msg = model.load_state_dict(state_dict, 
strict=False) + print('load checkpoint from %s' % url_or_filename) + return model, msg + + +def load_checkpoint_swinbase(model, url_or_filename, kwargs): + if kwargs['image_size'] == 224: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_224.json' + elif kwargs['image_size'] == 384: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinB_384.json' + window_size = read_json(vision_config_path)['window_size'] + print('--------------') + print(url_or_filename) + print('--------------') + if is_url(url_or_filename): + cached_file = download_cached_file(url_or_filename, + check_hash=False, + progress=True) + checkpoint = torch.load(cached_file, map_location='cpu') + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location='cpu') + else: + raise RuntimeError('checkpoint url or path is invalid') + + state_dict = checkpoint['model'] + + for k in list(state_dict.keys()): + if 'relative_position_bias_table' in k: + dst_num_pos = (2 * window_size - 1)**2 + state_dict[k] = interpolate_relative_pos_embed(state_dict[k], + dst_num_pos, + param_name=k) + elif ('relative_position_index' in k) or ('attn_mask' in k): + del state_dict[k] + elif "vision_multi" in k: + state_dict[k.replace("vision_multi", + "tagging_head")] = state_dict.pop(k) + + msg = model.load_state_dict(state_dict, strict=False) + print('load checkpoint from %s' % url_or_filename) + return model, msg + + +def load_checkpoint_swinlarge(model, url_or_filename, kwargs): + if kwargs['image_size'] == 224: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_224.json' + elif kwargs['image_size'] == 384: + vision_config_path = f'{CONFIG_PATH}/configs/swin/config_swinL_384.json' + window_size = read_json(vision_config_path)['window_size'] + print('--------------') + print(url_or_filename) + print('--------------') + if is_url(url_or_filename): + cached_file = download_cached_file(url_or_filename, + check_hash=False, + progress=True) + checkpoint = torch.load(cached_file, map_location='cpu') + elif os.path.isfile(url_or_filename): + checkpoint = torch.load(url_or_filename, map_location='cpu') + else: + raise RuntimeError('checkpoint url or path is invalid') + + state_dict = checkpoint['model'] + + for k in list(state_dict.keys()): + if 'relative_position_bias_table' in k: + dst_num_pos = (2 * window_size - 1)**2 + state_dict[k] = interpolate_relative_pos_embed(state_dict[k], + dst_num_pos, + param_name=k) + elif ('relative_position_index' in k) or ('attn_mask' in k): + del state_dict[k] + elif "vision_multi" in k: + state_dict[k.replace("vision_multi", + "tagging_head")] = state_dict.pop(k) + + msg = model.load_state_dict(state_dict, strict=False) + print('load checkpoint from %s' % url_or_filename) + return model, msg + + diff --git a/fengshen/models/Lyrics/ram/models/vit.py b/fengshen/models/Lyrics/ram/models/vit.py new file mode 100644 index 0000000..cec3d8e --- /dev/null +++ b/fengshen/models/Lyrics/ram/models/vit.py @@ -0,0 +1,305 @@ +''' + * Copyright (c) 2022, salesforce.com, inc. + * All rights reserved. 
+ * SPDX-License-Identifier: BSD-3-Clause + * For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause + * By Junnan Li + * Based on timm code base + * https://github.com/rwightman/pytorch-image-models/tree/master/timm +''' + +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +from timm.models.vision_transformer import _cfg, PatchEmbed +from timm.models.registry import register_model +from timm.models.layers import trunc_normal_, DropPath +from timm.models.helpers import named_apply, adapt_input_conv + +from fairscale.nn.checkpoint.checkpoint_activations import checkpoint_wrapper + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.attn_gradients = None + self.attention_map = None + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def forward(self, x, register_hook=False): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + if register_hook: + self.save_attention_map(attn) + attn.register_hook(self.save_attn_gradients) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_grad_checkpointing=False): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        mlp_hidden_dim = int(dim * mlp_ratio)
+        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+        if use_grad_checkpointing:
+            self.attn = checkpoint_wrapper(self.attn)
+            self.mlp = checkpoint_wrapper(self.mlp)
+
+    def forward(self, x, register_hook=False):
+        x = x + self.drop_path(self.attn(self.norm1(x), register_hook=register_hook))
+        x = x + self.drop_path(self.mlp(self.norm2(x)))
+        return x
+
+
+class VisionTransformer(nn.Module):
+    """ Vision Transformer
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` -
+        https://arxiv.org/abs/2010.11929
+    """
+    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                 num_heads=12, mlp_ratio=4., qkv_bias=True, qk_scale=None, representation_size=None,
+                 drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=None,
+                 use_grad_checkpointing=False, ckpt_layer=0):
+        """
+        Args:
+            img_size (int, tuple): input image size
+            patch_size (int, tuple): patch size
+            in_chans (int): number of input channels
+            num_classes (int): number of classes for classification head
+            embed_dim (int): embedding dimension
+            depth (int): depth of transformer
+            num_heads (int): number of attention heads
+            mlp_ratio (int): ratio of mlp hidden dim to embedding dim
+            qkv_bias (bool): enable bias for qkv if True
+            qk_scale (float): override default qk scale of head_dim ** -0.5 if set
+            representation_size (Optional[int]): enable and set representation layer (pre-logits) to this value if set
+            drop_rate (float): dropout rate
+            attn_drop_rate (float): attention dropout rate
+            drop_path_rate (float): stochastic depth rate
+            norm_layer: (nn.Module): normalization layer
+        """
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
+
+        self.patch_embed = PatchEmbed(
+            img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+
+        num_patches = self.patch_embed.num_patches
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+        self.pos_drop = nn.Dropout(p=drop_rate)
+
+        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+        self.blocks = nn.ModuleList([
+            Block(
+                dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                use_grad_checkpointing=(use_grad_checkpointing and i>=depth-ckpt_layer)
+            )
+            for i in range(depth)])
+        self.norm = norm_layer(embed_dim)
+
+        trunc_normal_(self.pos_embed, std=.02)
+        trunc_normal_(self.cls_token, std=.02)
+        self.apply(self._init_weights)
+
+    def _init_weights(self, m):
+        if isinstance(m, nn.Linear):
+            trunc_normal_(m.weight, std=.02)
+            if isinstance(m, nn.Linear) and m.bias is not None:
+                nn.init.constant_(m.bias, 0)
+        elif isinstance(m, nn.LayerNorm):
+            nn.init.constant_(m.bias, 0)
+            nn.init.constant_(m.weight, 1.0)
+
+    @torch.jit.ignore
+    def no_weight_decay(self):
+        return {'pos_embed', 'cls_token'}
+
+    def forward(self, x, register_blk=-1):
+        B = x.shape[0]
+        x = self.patch_embed(x)
+
+        cls_tokens = self.cls_token.expand(B, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+
+        x = x + self.pos_embed[:, :x.size(1), :]
+        x = self.pos_drop(x)
+
+        for i, blk in enumerate(self.blocks):
+            x = blk(x, register_blk == i)
+        x = self.norm(x)
+
+        return x
+
+    @torch.jit.ignore()
+    def load_pretrained(self, checkpoint_path, prefix=''):
+        _load_weights(self, checkpoint_path, prefix)
+
+
+@torch.no_grad()
+def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''):
+    """ Load weights from .npz checkpoints for official Google Brain Flax implementation
+    """
+    import numpy as np
+
+    def _n2p(w, t=True):
+        if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1:
+            w = w.flatten()
+        if t:
+            if w.ndim == 4:
+                w = w.transpose([3, 2, 0, 1])
+            elif w.ndim == 3:
+                w = w.transpose([2, 0, 1])
+            elif w.ndim == 2:
+                w = w.transpose([1, 0])
+        return torch.from_numpy(w)
+
+    w = np.load(checkpoint_path)
+    if not prefix and 'opt/target/embedding/kernel' in w:
+        prefix = 'opt/target/'
+
+    if hasattr(model.patch_embed, 'backbone'):
+        # hybrid
+        backbone = model.patch_embed.backbone
+        stem_only = not hasattr(backbone, 'stem')
+        stem = backbone if stem_only else backbone.stem
+        stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel'])))
+        stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale']))
+        stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias']))
+        if not stem_only:
+            for i, stage in enumerate(backbone.stages):
+                for j, block in enumerate(stage.blocks):
+                    bp = f'{prefix}block{i + 1}/unit{j + 1}/'
+                    for r in range(3):
+                        getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel']))
+                        getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale']))
+                        getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias']))
+                    if block.downsample is not None:
+                        block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel']))
+                        block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale']))
+                        block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias']))
+        embed_conv_w = _n2p(w[f'{prefix}embedding/kernel'])
+    else:
+        embed_conv_w = adapt_input_conv(
+            model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel']))
+    model.patch_embed.proj.weight.copy_(embed_conv_w)
+    model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias']))
+    model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False))
+    pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False)
+    if pos_embed_w.shape != model.pos_embed.shape:
+        pos_embed_w = resize_pos_embed(  # resize pos embedding when different size from pretrained weights
+            pos_embed_w, model.pos_embed, getattr(model, 'num_tokens', 1), model.patch_embed.grid_size)
+    model.pos_embed.copy_(pos_embed_w)
+    model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale']))
+    model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias']))
+#     if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]:
+#         model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
+#         model.head.bias.copy_(_n2p(w[f'{prefix}head/bias']))
+#     if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w:
+#         model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel']))
+#         model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias']))
+    for i, block in enumerate(model.blocks.children()):
+        block_prefix = f'{prefix}Transformer/encoderblock_{i}/'
+        mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/'
+        block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
+        block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+        block.attn.qkv.weight.copy_(torch.cat([
+            _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
+        block.attn.qkv.bias.copy_(torch.cat([
+            _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
+        block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
+        block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
+        for r in range(2):
+            getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel']))
+            getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias']))
+        block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale']))
+        block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias']))
+
+
+def interpolate_pos_embed(pos_embed_checkpoint, visual_encoder):
+    # interpolate position embedding
+    embedding_size = pos_embed_checkpoint.shape[-1]
+    num_patches = visual_encoder.patch_embed.num_patches
+    num_extra_tokens = visual_encoder.pos_embed.shape[-2] - num_patches
+    # height (== width) for the checkpoint position embedding
+    orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+    # height (== width) for the new position embedding
+    new_size = int(num_patches ** 0.5)
+
+    if orig_size != new_size:
+        # class_token and dist_token are kept unchanged
+        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+        # only the position tokens are interpolated
+        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+        pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+        pos_tokens = torch.nn.functional.interpolate(
+            pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+        print('reshape position embedding from %d to %d' % (orig_size ** 2, new_size ** 2))
+
+        return new_pos_embed
+    else:
+        return pos_embed_checkpoint
\ No newline at end of file
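
Note on interpolate_pos_embed: it only returns a resized position-embedding tensor and does not modify the checkpoint, so the caller is expected to write the result back into the state dict before calling load_state_dict. The sketch below is a minimal illustration of that pattern and is not part of the patch; the model size, the checkpoint path, and the 'pos_embed' key are assumed placeholders.

# Usage sketch (assumed names; hypothetical checkpoint path, not part of this diff).
import torch

vit = VisionTransformer(img_size=384, patch_size=16)             # target model at the new resolution
state_dict = torch.load('vit_base_224.pth', map_location='cpu')  # checkpoint pretrained at 224x224

# Resize the 224x224 position embedding to the 384x384 patch grid,
# then load the remaining weights unchanged.
if 'pos_embed' in state_dict:
    state_dict['pos_embed'] = interpolate_pos_embed(state_dict['pos_embed'], vit)

missing, unexpected = vit.load_state_dict(state_dict, strict=False)
print('missing keys:', missing)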