5 changes: 5 additions & 0 deletions mindnlp/core/nn/functional.py
@@ -332,6 +332,11 @@ def mse_loss(input, target, reduction='mean'):
def l1_loss(input, target, reduction='mean'):
return ops.l1_loss(input, target, reduction)

def smooth_l1_loss(input, target, beta=1.0, reduction='none'):
input = input.to(mindspore.float32)
target = target.to(mindspore.float32)
return ops.smooth_l1_loss(input, target, beta, reduction)

def kl_div(logits, labels, reduction='mean', log_target=False):
if log_target:
labels = ops.log(labels)
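For reference, a minimal sketch of how the new wrapper could be exercised (the import path follows the file above; tensor values are illustrative). Note that the default here is `reduction='none'`, unlike PyTorch's `'mean'`, so an elementwise loss is returned unless a reduction is requested.

```python
import mindspore
from mindspore import Tensor

from mindnlp.core.nn import functional as F  # module changed in this PR

pred = Tensor([0.5, 1.5, 3.0], mindspore.float32)
target = Tensor([0.0, 2.0, 3.0], mindspore.float32)

elementwise = F.smooth_l1_loss(pred, target)                  # default reduction='none', shape (3,)
mean_loss = F.smooth_l1_loss(pred, target, reduction='mean')  # scalar
```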
4 changes: 2 additions & 2 deletions mindnlp/transformers/modeling_utils.py
@@ -1622,14 +1622,14 @@ def _get_resized_embeddings(
# numbers of tokens to copy
n = min(old_num_tokens, new_num_tokens)

new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :]
new_embeddings.weight[:n, :] = old_embeddings.weight[:n, :]

# Replace weights in old_embeddings and return to maintain the same embedding type.
# This ensures correct functionality when a Custom Embedding class is passed as input.
# The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979)

old_embeddings.weight = new_embeddings.weight
old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0]
old_embeddings.num_embeddings = new_embeddings.weight.shape[0]
if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx:
old_embeddings.padding_idx = None

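A minimal sketch of the code path this touches (checkpoint name and high-level API are assumptions based on the usual mindnlp/Transformers interface): `_get_resized_embeddings` is normally reached through `resize_token_embeddings`, which copies the first `n` rows of the old weight into the resized matrix.

```python
from mindnlp.transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

tokenizer.add_tokens(["<new_tok>"])             # grow the vocabulary by one token
model.resize_token_embeddings(len(tokenizer))   # calls _get_resized_embeddings internally
print(model.get_input_embeddings().weight.shape)
```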
3 changes: 3 additions & 0 deletions mindnlp/transformers/models/auto/configuration_auto.py
@@ -123,9 +123,12 @@
("layoutlmv2", "LayoutLMv2Config"),
("layoutlmv3", "LayoutLMv3Config"),
("led", "LEDConfig"),
("lilt", "LiltConfig"),
("llama", "LlamaConfig"),
("llava", "LlavaConfig"),
("llava_next", "LlavaNextConfig"),
("longformer", "LongformerConfig"),
("luke", "LukeConfig"),
("lxmert", "LxmertConfig"),
("m2m_100", "M2M100Config"),
("mamba", "MambaConfig"),
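A short sketch of what the new mapping entries enable, assuming mindnlp mirrors the usual `AutoConfig` interface: the registered model types should now resolve to their config classes by name.

```python
from mindnlp.transformers import AutoConfig

# Each newly registered model type resolves to its config class.
for model_type in ("lilt", "longformer", "luke"):
    config = AutoConfig.for_model(model_type)
    print(model_type, type(config).__name__)
```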
1,300 changes: 304 additions & 996 deletions mindnlp/transformers/models/layoutlm/modeling_layoutlm.py

Large diffs are not rendered by default.

1,481 changes: 371 additions & 1,110 deletions mindnlp/transformers/models/layoutlmv2/modeling_layoutlmv2.py

Large diffs are not rendered by default.

11 changes: 6 additions & 5 deletions mindnlp/transformers/models/led/modeling_led.py
@@ -23,7 +23,7 @@
from mindspore import Tensor
from mindspore.common.initializer import initializer, Normal

from mindnlp.core import nn, ops
from mindnlp.core import nn, ops, no_grad
from mindnlp.core.nn import functional as F
from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask
@@ -539,19 +539,20 @@ def _get_global_attn_indices(is_index_global_attn):
# helper variable
num_global_attn_indices = is_index_global_attn.long().sum(axis=1)
# max number of global attn indices in batch
max_num_global_attn_indices = ops.max(num_global_attn_indices).item()
with no_grad():
max_num_global_attn_indices = ops.max(num_global_attn_indices).item()

# indices of global attn
is_index_global_attn_nonzero = is_index_global_attn.nonzero(as_tuple=True)
is_index_global_attn_nonzero = ops.nonzero(is_index_global_attn, as_tuple=True)

# helper variable
is_local_index_global_attn = ops.arange(max_num_global_attn_indices) < num_global_attn_indices.unsqueeze(dim=-1)

# location of the non-padding values within global attention indices
is_local_index_global_attn_nonzero = is_local_index_global_attn.nonzero(as_tuple=True)
is_local_index_global_attn_nonzero = ops.nonzero(is_local_index_global_attn, as_tuple=True)

# location of the padding values within global attention indices
is_local_index_no_global_attn_nonzero = (is_local_index_global_attn == 0).nonzero(as_tuple=True)
is_local_index_no_global_attn_nonzero = ops.nonzero((is_local_index_global_attn == 0), as_tuple=True)
return (
max_num_global_attn_indices,
is_index_global_attn_nonzero,
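A minimal sketch of the pattern adopted above (tensor values are illustrative): the scalar bookkeeping value is extracted under `no_grad()`, and `nonzero()` is called through the `ops` namespace instead of as a tensor method.

```python
import mindspore
from mindnlp.core import ops, no_grad

is_index_global_attn = mindspore.Tensor([[True, False, True], [False, False, True]])
num_global_attn_indices = is_index_global_attn.long().sum(axis=1)  # per-row counts

with no_grad():
    # extract a plain Python int without tracking gradients
    max_num_global_attn_indices = ops.max(num_global_attn_indices).item()

rows, cols = ops.nonzero(is_index_global_attn, as_tuple=True)
print(max_num_global_attn_indices, rows, cols)
```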
7 changes: 6 additions & 1 deletion mindnlp/transformers/models/llama/modeling_llama.py
@@ -797,6 +797,7 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[mindspore.Tensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
@@ -849,7 +850,7 @@ def forward(
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = ops.cat(logits, dim=-1)
else:
logits = self.lm_head(hidden_states)
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
logits = logits.float()

loss = None
@@ -885,6 +886,7 @@ def prepare_inputs_for_generation(
cache_position=None,
position_ids=None,
use_cache=True,
num_logits_to_keep=None,
**kwargs,
):
# If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
@@ -930,6 +932,9 @@
batch_size=batch_size,
)

if num_logits_to_keep is not None:
model_inputs["num_logits_to_keep"] = num_logits_to_keep

model_inputs.update(
{
"position_ids": position_ids,
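A hedged usage sketch (checkpoint name and tokenizer call are illustrative): with `num_logits_to_keep=1` the LM head is applied only to the last position, which is all that incremental decoding needs; the default of `0` keeps the previous behaviour, since `hidden_states[:, -0:, :]` selects the full sequence.

```python
from mindnlp.transformers import AutoTokenizer, LlamaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

inputs = tokenizer("Hello, my name is", return_tensors="ms")
outputs = model(**inputs, num_logits_to_keep=1)
print(outputs.logits.shape)  # (batch, 1, vocab_size) rather than (batch, seq_len, vocab_size)
```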
5 changes: 3 additions & 2 deletions mindnlp/transformers/models/llava/modeling_llava.py
@@ -20,7 +20,7 @@

import mindspore

from mindnlp.core import nn, ops
from mindnlp.core import nn, ops, no_grad
from mindnlp.core.nn import functional as F
from ...modeling_utils import PreTrainedModel
from ...activations import ACT2FN
@@ -422,7 +422,8 @@ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, in
special_image_token_mask = input_ids == self.config.image_token_index
num_special_image_tokens = ops.sum(special_image_token_mask, dim=-1)
# Compute the maximum embed dimension
max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)).item() + sequence_length
with no_grad():
max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)).item() + sequence_length
nonzero = ops.nonzero(input_ids != self.config.image_token_index)
batch_indices, non_image_indices = ops.chunk(nonzero, 2, -1)

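A toy recomputation of the sizing logic above (all numbers illustrative): each image placeholder token expands into `num_image_patches` embeddings, so the merged sequence length grows by `num_image_patches - 1` per image token, and the scalar is pulled out under `no_grad()` just as in the LED change.

```python
import mindspore
from mindnlp.core import ops, no_grad

image_token_index = 32000   # illustrative; taken from config.image_token_index in the model
num_image_patches = 576     # illustrative; patches produced per image
input_ids = mindspore.Tensor([[1, 32000, 15043, 2]], mindspore.int64)  # one image token
sequence_length = input_ids.shape[1]

special_image_token_mask = input_ids == image_token_index
num_special_image_tokens = ops.sum(special_image_token_mask, dim=-1)

with no_grad():
    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)).item() + sequence_length

print(max_embed_dim)  # 4 + 1 * 575 = 579
```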
97 changes: 38 additions & 59 deletions mindnlp/transformers/models/llava_next/configuration_llava_next.py
@@ -1,18 +1,17 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================
""" Llava-NeXT model configuration"""
"""Llava-NeXT model configuration"""

from ...configuration_utils import PretrainedConfig
from ....utils import logging
@@ -24,10 +23,10 @@

class LlavaNextConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`].
It is used to instantiate an Llava-NeXT model according to the specified arguments, defining the model architecture.
Instantiating a configuration with the defaults will yield a similar configuration to that of the
[llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) model.
This is the configuration class to store the configuration of a [`LlavaNextForConditionalGeneration`]. It is used to instantiate an
Llava-NeXT model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
model.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
@@ -52,27 +51,32 @@ class LlavaNextConfig(PretrainedConfig):
image_grid_pinpoints (`List`, *optional*, defaults to `[[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008]]`):
A list of possible resolutions to use for processing high resolution images. Each item in the list should be a tuple or list
of the form `(height, width)`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied.
image_seq_length (`int`, *optional*, defaults to 576):
Sequence length of one image embedding.

Example:
```python
>>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig
...
>>> # Initializing a CLIP-vision config
>>> vision_config = CLIPVisionConfig()
...
>>> # Initializing a Llama config
>>> text_config = LlamaConfig()
...
>>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> configuration = LlavaNextConfig(vision_config, text_config)
...
>>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> model = LlavaNextForConditionalGeneration(configuration)
...
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""

```python
>>> from transformers import LlavaNextForConditionalGeneration, LlavaNextConfig, CLIPVisionConfig, LlamaConfig

>>> # Initializing a CLIP-vision config
>>> vision_config = CLIPVisionConfig()

>>> # Initializing a Llama config
>>> text_config = LlamaConfig()

>>> # Initializing a Llava-Next llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> configuration = LlavaNextConfig(vision_config, text_config)

>>> # Initializing a model from the llava-hf/llava-v1.6-mistral-7b-hf style configuration
>>> model = LlavaNextForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

model_type = "llava_next"
is_composition = False

@@ -86,36 +90,14 @@ def __init__(
vision_feature_select_strategy="default",
vision_feature_layer=-2,
image_grid_pinpoints=None,
tie_word_embeddings=False,
image_seq_length=576,
**kwargs,
):
"""
This method initializes an instance of the LlavaNextConfig class with the provided parameters.

Args:
self: The instance of the class.
vision_config (dict, optional): Configuration settings for the vision model.
If not provided, default settings will be used.
text_config (dict, optional): Configuration settings for the text model.
If not provided, default settings will be used.
ignore_index (int, optional): Index to ignore during computation. Default is -100.
image_token_index (int, optional): Index for image token. Default is 32000.
projector_hidden_act (str, optional): Activation function for hidden layers in projector.
Default is 'gelu'.
vision_feature_select_strategy (str): Strategy for selecting vision features.
Should be one of 'default' or 'full'.
vision_feature_layer (int, optional): Layer to extract features from in the vision model.
image_grid_pinpoints (list of lists, optional): Coordinates for image grid pinpoints.
Default is [[336, 672], [672, 336], [672, 672], [1008, 336], [336, 1008].

Returns:
None

Raises:
ValueError: If vision_feature_select_strategy is not 'default' or 'full'.
"""
self.ignore_index = ignore_index
self.image_token_index = image_token_index
self.projector_hidden_act = projector_hidden_act
self.image_seq_length = image_seq_length

if vision_feature_select_strategy not in ["default", "full"]:
raise ValueError(
@@ -136,8 +118,7 @@ def __init__(
vision_config["model_type"] = (
vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
)
vision_config = CONFIG_MAPPING[vision_config["model_type"]](
**vision_config)
vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
elif vision_config is None:
vision_config = CONFIG_MAPPING["clip_vision_model"](
intermediate_size=4096,
Expand All @@ -154,15 +135,13 @@ def __init__(

if isinstance(text_config, dict):
text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
text_config = CONFIG_MAPPING[text_config["model_type"]](
**text_config)
text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
elif text_config is None:
text_config = CONFIG_MAPPING["llama"]()

self.text_config = text_config

super().__init__(**kwargs)

super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)

__all__ = [
"LlavaNextConfig",
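A minimal sketch of the new constructor arguments (import path assumed; the values shown are the documented defaults): `image_seq_length` is now stored on the config, and `tie_word_embeddings` is forwarded to the base `PretrainedConfig`.

```python
from mindnlp.transformers import LlavaNextConfig

config = LlavaNextConfig(
    tie_word_embeddings=False,  # forwarded to PretrainedConfig.__init__
    image_seq_length=576,       # sequence length of one image embedding
)
print(config.image_seq_length, config.tie_word_embeddings)
```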