5 changes: 5 additions & 0 deletions mindnlp/core/nn/functional.py
@@ -332,6 +332,11 @@ def mse_loss(input, target, reduction='mean'):
def l1_loss(input, target, reduction='mean'):
return ops.l1_loss(input, target, reduction)

def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
input = input.to(mindspore.float32)
target = target.to(mindspore.float32)
return ops.smooth_l1_loss(input, target, beta, reduction)

def kl_div(logits, labels, reduction='mean', log_target=False):
if log_target:
labels = ops.log(labels)
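For reference, a minimal sketch of how the new wrapper could be exercised (the import path follows the file above; tensor values are illustrative). Note that the default here is `reduction='none'`, unlike PyTorch's `'mean'`, so an elementwise loss is returned unless a reduction is requested.

```python
import mindspore
from mindspore import Tensor

from mindnlp.core.nn import functional as F  # module changed in this PR

pred = Tensor([0.5, 1.5, 3.0], mindspore.float32)
target = Tensor([0.0, 2.0, 3.0], mindspore.float32)

elementwise = F.smooth_l1_loss(pred, target)                  # default reduction='none', shape (3,)
mean_loss = F.smooth_l1_loss(pred, target, reduction='mean')  # scalar
```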
4 changes: 2 additions & 2 deletions mindnlp/transformers/modeling_utils.py
@@ -1622,14 +1622,14 @@ def _get_resized_embeddings(
# numbers of tokens to copy
n = min(old_num_tokens, new_num_tokens)

new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :]
new_embeddings.weight[:n, :] = old_embeddings.weight[:n, :]

# Replace weights in old_embeddings and return to maintain the same embedding type.
# This ensures correct functionality when a Custom Embedding class is passed as input.
# The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979)

old_embeddings.weight = new_embeddings.weight
old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0]
old_embeddings.num_embeddings = new_embeddings.weight.shape[0]
if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx:
old_embeddings.padding_idx = None

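A minimal sketch of the code path this touches (checkpoint name and high-level API are assumptions based on the usual mindnlp/Transformers interface): `_get_resized_embeddings` is normally reached through `resize_token_embeddings`, which copies the first `n` rows of the old weight into the resized matrix.

```python
from mindnlp.transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

tokenizer.add_tokens(["<new_tok>"])             # grow the vocabulary by one token
model.resize_token_embeddings(len(tokenizer))   # calls _get_resized_embeddings internally
print(model.get_input_embeddings().weight.shape)
```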
3 changes: 3 additions & 0 deletions mindnlp/transformers/models/auto/configuration_auto.py
@@ -123,9 +123,12 @@
("layoutlmv2", "LayoutLMv2Config"),
("layoutlmv3", "LayoutLMv3Config"),
("led", "LEDConfig"),
("lilt", "LiltConfig"),
("llama", "LlamaConfig"),
("llava", "LlavaConfig"),
("llava_next", "LlavaNextConfig"),
("longformer", "LongformerConfig"),
("luke", "LukeConfig"),
("lxmert", "LxmertConfig"),
("m2m_100", "M2M100Config"),
("mamba", "MambaConfig"),
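A short sketch of what the new mapping entries enable, assuming mindnlp mirrors the usual `AutoConfig` interface: the registered model types should now resolve to their config classes by name.

```python
from mindnlp.transformers import AutoConfig

# Each newly registered model type resolves to its config class.
for model_type in ("lilt", "longformer", "luke"):
    config = AutoConfig.for_model(model_type)
    print(model_type, type(config).__name__)
```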
1,300 changes: 304 additions & 996 deletions mindnlp/transformers/models/layoutlm/modeling_layoutlm.py

Large diffs are not rendered by default.

1,481 changes: 371 additions & 1,110 deletions mindnlp/transformers/models/layoutlmv2/modeling_layoutlmv2.py

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions mindnlp/transformers/models/led/modeling_led.py
@@ -23,7 +23,7 @@
from mindspore import Tensor
from mindspore.common.initializer import initializer, Normal

from mindnlp.core import nn, ops
from mindnlp.core import nn, ops, no_grad
from mindnlp.core.nn import functional as F
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
@@ -539,19 +539,20 @@ def _get_global_attn_indices(is_index_global_attn):
# helper variable
num_global_attn_indices = is_index_global_attn.long().sum(axis=1)
# max number of global attn indices in batch
max_num_global_attn_indices = ops.max(num_global_attn_indices).item()
with no_grad():
max_num_global_attn_indices = ops.max(num_global_attn_indices).item()

# indices of global attn
is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True)
is_index_global_attn_nonzero = ops.nonzero(is_index_global_attn, as_tuple=True)

# helper variable
is_local_index_global_attn = ops.arange(max_num_global_attn_indices) < num_global_attn_indices.unsqueeze(dim=-1)

# location of the non-padding values within global attention indices
is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True)
is_local_index_global_attn_nonzero = ops.nonzero(is_local_index_global_attn, as_tuple=True)

# location of the padding values within global attention indices
is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True)
is_local_index_no_global_attn_nonzero = ops.nonzero((is_local_index_global_attn == 0), as_tuple=True)
return (
max_num_global_attn_indices,
is_index_global_attn_nonzero,
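A minimal sketch of the pattern adopted above (tensor values are illustrative): the scalar bookkeeping value is extracted under `no_grad()`, and `nonzero()` is called through the `ops` namespace instead of as a tensor method.

```python
import mindspore
from mindnlp.core import ops, no_grad

is_index_global_attn = mindspore.Tensor([[True, False, True], [False, False, True]])
num_global_attn_indices = is_index_global_attn.long().sum(axis=1)  # per-row counts

with no_grad():
    # extract a plain Python int without tracking gradients
    max_num_global_attn_indices = ops.max(num_global_attn_indices).item()

rows, cols = ops.nonzero(is_index_global_attn, as_tuple=True)
print(max_num_global_attn_indices, rows, cols)
```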
7 changes: 6 additions & 1 deletion mindnlp/transformers/models/llama/modeling_llama.py
@@ -797,6 +797,7 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[mindspore.Tensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
@@ -849,7 +850,7 @@ def forward(
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = ops.cat(logits, dim=-1)
else:
logits = self.lm_head(hidden_states)
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
logits = logits.float()

loss = None
@@ -885,6 +886,7 @@ def prepare_inputs_for_generation(
cache_position=None,
position_ids=None,
use_cache=True,
num_logits_to_keep=None,
**kwargs,
):
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
@@ -930,6 +932,9 @@
batch_size=batch_size,
)

if num_logits_to_keep is not None:
model_inputs["num_logits_to_keep"] = num_logits_to_keep

model_inputs.update(
{
"position_ids": position_ids,
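A hedged usage sketch (checkpoint name and tokenizer call are illustrative): with `num_logits_to_keep=1` the LM head is applied only to the last position, which is all that incremental decoding needs; the default of `0` keeps the previous behaviour, since `hidden_states[:, -0:, :]` selects the full sequence.

```python
from mindnlp.transformers import AutoTokenizer, LlamaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

inputs = tokenizer("Hello, my name is", return_tensors="ms")
outputs = model(**inputs, num_logits_to_keep=1)
print(outputs.logits.shape)  # (batch, 1, vocab_size) rather than (batch, seq_len, vocab_size)
```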
5 changes: 3 additions & 2 deletions mindnlp/transformers/models/llava/modeling_llava.py
@@ -20,7 +20,7 @@

import mindspore

from mindnlp.core import nn, ops
from mindnlp.core import nn, ops, no_grad
from mindnlp.core.nn import functional as F
from ...modeling_utils import PreTrainedModel
from ...activations import ACT2FN
@@ -422,7 +422,8 @@ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, in
special_image_token_mask = input_ids == self.config.image_token_index
num_special_image_tokens = ops.sum(special_image_token_mask, dim=-1)
# Compute the maximum embed dimension
max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)).item() + sequence_length
with no_grad():
max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)).item() + sequence_length
nonzero = ops.nonzero(input_ids != self.config.image_token_index)
batch_indices, non_image_indices = ops.chunk(nonzero, 2, -1)

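A toy recomputation of the sizing logic above (all numbers illustrative): each image placeholder token expands into `num_image_patches` embeddings, so the merged sequence length grows by `num_image_patches - 1` per image token, and the scalar is pulled out under `no_grad()` just as in the LED change.

```python
import mindspore
from mindnlp.core import ops, no_grad

image_token_index = 32000   # illustrative; taken from config.image_token_index in the model
num_image_patches = 576     # illustrative; patches produced per image
input_ids = mindspore.Tensor([[1, 32000, 15043, 2]], mindspore.int64)  # one image token
sequence_length = input_ids.shape[1]

special_image_token_mask = input_ids == image_token_index
num_special_image_tokens = ops.sum(special_image_token_mask, dim=-1)

with no_grad():
    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)).item() + sequence_length

print(max_embed_dim)  # 4 + 1 * 575 = 579
```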
97 changes: 38 additions & 59 deletions mindnlp/transformers/models/llava_next/configuration_llava_next.py
@@ -1,18 +1,17 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================
""" Llava-NeXT model configuration"""
"""Llava-NeXT model configuration"""

from ...configuration_utils import PretrainedConfig
from ....utils import logging
@@ -24,10 +23,10 @@

class LlavaNextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`].
It is used to instantiate an Llava-NeXT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) model.
This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an
Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
model.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
@@ -52,27 +51,32 @@ class LlavaNextConfig(PretrainedConfig):
image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
of the form `(height, width)`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.

Example:
```python
>>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig
...
>>> # Initializing a CLIP-vision config
>>> vision_config = CLIPVisionConfig()
...
>>> # Initializing a Llama config
>>> text_config = LlamaConfig()
...
>>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> configuration = LlavaNextConfig(vision_config, text_config)
...
>>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> model = LlavaNextForConditionalGeneration(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""

```python
>>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig

>>> # Initializing a CLIP-vision config
>>> vision_config = CLIPVisionConfig()

>>> # Initializing a Llama config
>>> text_config = LlamaConfig()

>>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> configuration = LlavaNextConfig(vision_config, text_config)

>>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> model = LlavaNextForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "llava_next"
is_composition = False

@@ -86,36 +90,14 @@ def __init__(
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_grid_pinpoints=None,
tie_word_embeddings=False,
image_seq_length=576,
**kwargs,
):
"""
This method initializes an instance of the LlavaNextConfig class with the provided parameters.

Args:
self: The instance of the class.
vision_config (dict, optional): Configuration settings for the vision model.
If not provided, default settings will be used.
text_config (dict, optional): Configuration settings for the text model.
If not provided, default settings will be used.
ignore_index (int, optional): Index to ignore during computation. Default is -100.
image_token_index (int, optional): Index for image token. Default is 32000.
projector_hidden_act (str, optional): Activation function for hidden layers in projector.
Default is 'gelu'.
vision_feature_select_strategy (str): Strategy for selecting vision features.
Should be one of 'default' or 'full'.
vision_feature_layer (int, optional): Layer to extract features from in the vision model.
image_grid_pinpoints (list of lists, optional): Coordinates for image grid pinpoints.
Default is [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008].

Returns:
None

Raises:
ValueError: If vision_feature_select_strategy is not 'default' or 'full'.
"""
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.image_seq_length = image_seq_length

if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
@@ -136,8 +118,7 @@ def __init__(
vision_config["model_type"] = (
vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
)
vision_config = CONFIG_MAPPING[vision_config["model_type"]](
**vision_config)
vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["clip_vision_model"](
intermediate_size=4096,
Expand All @@ -154,15 +135,13 @@ def __init__(

if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](
**text_config)
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()

self.text_config = text_config

super().__init__(**kwargs)

super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

__all__ = [
"LlavaNextConfig",
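A minimal sketch of the new constructor arguments (import path assumed; the values shown are the documented defaults): `image_seq_length` is now stored on the config, and `tie_word_embeddings` is forwarded to the base `PretrainedConfig`.

```python
from mindnlp.transformers import LlavaNextConfig

config = LlavaNextConfig(
    tie_word_embeddings=False,  # forwarded to PretrainedConfig.__init__
    image_seq_length=576,       # sequence length of one image embedding
)
print(config.image_seq_length, config.tie_word_embeddings)
```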