Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions paddleformers/transformers/ernie4_5_moe_vl/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(
sep_token="<sep>",
unk_token="<unk>",
additional_special_tokens=None,
special_tokens_pattern="cls_sep",
**kwargs,
):
"""
Expand Down Expand Up @@ -89,6 +90,7 @@ def __init__(
sep_token=sep_token,
unk_token=unk_token,
additional_special_tokens=additional_special_tokens,
special_tokens_pattern=special_tokens_pattern,
**kwargs,
)

Expand Down
2 changes: 1 addition & 1 deletion paddleformers/transformers/glm4v_moe/video_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def _preprocess(
grid_h, grid_w = resized_height // patch_size, resized_width // patch_size

patches = patches.view(
grid_t,
batch_size * grid_t,
temporal_patch_size,
channel,
grid_h // merge_size,
Expand Down
37 changes: 16 additions & 21 deletions paddleformers/transformers/processing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
AllKwargsForChatTemplate as AllKwargsForChatTemplate_hf,
)
from transformers.processing_utils import ProcessingKwargs as ProcessingKwargs_hf
from transformers.processing_utils import ProcessorChatTemplateKwargs
from transformers.processing_utils import ProcessorMixin as ProcessorMixin_hf
from transformers.processing_utils import transformers_module
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
Expand Down Expand Up @@ -209,6 +210,7 @@ class ProcessingKwargs(ProcessingKwargs_hf):

class AllKwargsForChatTemplate(AllKwargsForChatTemplate_hf):
processor_kwargs: ProcessingKwargs
template_kwargs: ProcessorChatTemplateKwargs


@dataclass
Expand Down Expand Up @@ -933,25 +935,18 @@ def apply_chat_template(
else:
kwargs["return_offsets_mapping"] = True # force offset mapping so we can infer token boundaries

# Fill sets of kwargs that should be used by different parts of template
processed_kwargs = {
"mm_load_kwargs": {},
"template_kwargs": {},
}

for kwarg_type in processed_kwargs:
for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__:
kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
default_value = getattr(kwarg_type_defaults, key, None)
value = kwargs.pop(key, default_value)
if value is not None and not isinstance(value, dict):
processed_kwargs[kwarg_type][key] = value

# pop unused and deprecated kwarg
kwargs.pop("video_load_backend", None)
# Fill sets of kwargs that should be used by jinja template, filtering out kwargs used in `processor.__call__`
# NOTE: we don't only filter but also set the default values here. Without default values, we can remove it
template_kwargs = {}
for key in AllKwargsForChatTemplate.__annotations__["template_kwargs"].__annotations__:
kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__["template_kwargs"]
default_value = getattr(kwarg_type_defaults, key, None)
value = kwargs.pop(key, default_value)
if value is not None and not isinstance(value, dict):
template_kwargs[key] = value

# Pass unprocessed custom kwargs
processed_kwargs["template_kwargs"].update(kwargs)
template_kwargs.update(kwargs)

if isinstance(conversation, (list, tuple)) and (
isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
Expand All @@ -962,8 +957,8 @@ def apply_chat_template(
is_batched = False
conversations = [conversation]

tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False)
tokenize = template_kwargs.pop("tokenize", False)
return_dict = template_kwargs.pop("return_dict", True)

if tokenize:
batch_images, batch_videos = [], []
Expand Down Expand Up @@ -994,7 +989,7 @@ def apply_chat_template(
prompt, generation_indices = render_jinja_template(
conversations=conversations,
chat_template=chat_template,
**processed_kwargs["template_kwargs"], # different flags such as `return_assistant_mask`
**template_kwargs, # different flags such as `return_assistant_mask`
**self.tokenizer.special_tokens_map, # tokenizer special tokens are used by some templates
)

Expand Down Expand Up @@ -1029,7 +1024,7 @@ def apply_chat_template(
)

if return_dict:
if processed_kwargs["template_kwargs"].get("return_assistant_tokens_mask", False):
if template_kwargs.get("return_assistant_tokens_mask", False):
assistant_masks = []
offset_mapping = out.pop("offset_mapping")
input_ids = out["input_ids"]
Expand Down
4 changes: 4 additions & 0 deletions paddleformers/transformers/qwen3_moe/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,10 @@ class Qwen3MoeConfig(PretrainedConfig):
model_type = "qwen3_moe"
keys_to_ignore_at_inference = ["past_key_values"]

attribute_map = {
"num_experts": "num_local_experts",
}

def __init__(
self,
vocab_size=151936,
Expand Down
7 changes: 7 additions & 0 deletions paddleformers/transformers/qwen3_moe/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ class Qwen3MoEModelProvider(GPTModelProvider):
"expert_parallel_degree": "expert_model_parallel_size",
"dtype": "params_dtype",
"num_experts": "n_routed_experts",
"num_local_experts": "n_routed_experts",
}

rotary_base: float = 1000000.0
Expand Down Expand Up @@ -786,8 +787,11 @@ class Qwen3MoePretrainedModel(PretrainedModel):
def _gen_aoa_config(cls, config: Qwen3MoeConfig):
if hasattr(config, "n_routed_experts"):
num_experts = config.n_routed_experts
elif hasattr(config, "num_local_experts"):
num_experts = config.num_local_experts
else:
num_experts = config.num_experts

model_prefix = "" if cls == cls.base_model_class else "model."
using_sonic_moe = config.using_sonic_moe
aoa_config = {
Expand Down Expand Up @@ -891,8 +895,11 @@ def _gen_aoa_config(cls, config: Qwen3MoeConfig):
def _gen_inv_aoa_config(cls, config: Qwen3MoeConfig):
if hasattr(config, "n_routed_experts"):
num_experts = config.n_routed_experts
elif hasattr(config, "num_local_experts"):
num_experts = config.num_local_experts
else:
num_experts = config.num_experts

model_prefix = "" if cls == cls.base_model_class else "model."
using_sonic_moe = config.using_sonic_moe
aoa_statements = [
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,6 @@ tiktoken
ml_dtypes
omegaconf
modelscope
transformers==5.0.0
transformers>=5.0.0
GPUtil
importlib_metadata
67 changes: 6 additions & 61 deletions tests/transformers/glm4v_moe/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import paddle

from paddleformers.transformers import AutoProcessor, Glm4vProcessor
from tests.testing_utils import gpu_device_initializer
from tests.transformers.test_processing_common import ProcessorTesterMixin


Expand All @@ -40,6 +41,11 @@ def setUpClass(cls):
processor.save_pretrained(cls.tmpdir)
cls.image_token = processor.image_token

# Use GPU 0 to prevent CUDA illegal memory access during resize
@gpu_device_initializer(log_prefix="Glm4vMoeProcessorTest", gpu_id=0)
def setUp(self):
pass

def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdir, **kwargs).tokenizer

Expand Down Expand Up @@ -324,64 +330,3 @@ def test_kwargs_overrides_custom_image_processor_kwargs(self):
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input, return_tensors="pd")
self.assertEqual(inputs[self.images_input_name].shape[0], 4)

# pass temporarily
def test_doubly_passed_kwargs(self):
pass

def test_doubly_passed_kwargs_video(self):
pass

def test_image_processor_defaults_preserved_by_image_kwargs(self):
pass

def test_kwargs_overrides_default_image_processor_kwargs(self):
pass

def test_kwargs_overrides_default_tokenizer_kwargs(self):
pass

def test_kwargs_overrides_default_tokenizer_kwargs_video(self):
pass

def test_kwargs_overrides_default_video_processor_kwargs(self):
pass

def test_overlapping_text_image_kwargs_handling(self):
pass

def test_processor_from_and_save_pretrained_as_nested_dict(self):
pass

def test_structured_kwargs_nested(self):
pass

def test_structured_kwargs_nested_from_dict(self):
pass

def test_structured_kwargs_nested_from_dict_video(self):
pass

def test_structured_kwargs_nested_video(self):
pass

def test_tokenizer_defaults_preserved_by_kwargs(self):
pass

def test_tokenizer_defaults_preserved_by_kwargs_video(self):
pass

def test_unstructured_kwargs(self):
pass

def test_unstructured_kwargs_batched(self):
pass

def test_unstructured_kwargs_batched_video(self):
pass

def test_unstructured_kwargs_video(self):
pass

def test_video_processor_defaults_preserved_by_video_kwargs(self):
pass
39 changes: 17 additions & 22 deletions tests/transformers/test_processing_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@
import paddle
from PIL import Image

from paddleformers.transformers.auto.processing import processor_class_from_name
from paddleformers.transformers.processing_utils import (
MODALITY_TO_AUTOPROCESSOR_MAPPING,
)

from .test_utils import check_json_file_has_correct_format

Expand Down Expand Up @@ -67,16 +69,12 @@ def prepare_processor_dict():
return {}

def get_component(self, attribute, **kwargs):
assert attribute in self.processor_class.attributes
component_class_name = getattr(self.processor_class, f"{attribute}_class")
if isinstance(component_class_name, tuple):
if attribute == "image_processor":
component_class_name = component_class_name[0]
else:
component_class_name = component_class_name[-1]

component_class = processor_class_from_name(component_class_name)
component = component_class.from_pretrained(self.tmpdir, **kwargs) # noqa
if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
component = auto_processor_class.from_pretrained(self.tmpdir, subfolder=attribute, **kwargs) # noqa
else:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute]
component = auto_processor_class.from_pretrained(self.tmpdir, **kwargs) # noqa
if "tokenizer" in attribute and not component.pad_token:
component.pad_token = "[TEST_PAD]"
if component.pad_token_id is None:
Expand All @@ -86,7 +84,7 @@ def get_component(self, attribute, **kwargs):

def prepare_components(self):
components = {}
for attribute in self.processor_class.attributes:
for attribute in self.processor_class.get_attributes():
component = self.get_component(attribute)
components[attribute] = component

Expand Down Expand Up @@ -182,16 +180,13 @@ def test_processor_from_and_save_pretrained_as_nested_dict(self):
self.assertEqual(processor_second.to_dict(), processor_first.to_dict())

# Try to load each attribute separately from saved directory
for attribute in processor_first.attributes:
attribute_class_name = getattr(processor_first, f"{attribute}_class")
if isinstance(attribute_class_name, tuple):
if attribute == "image_processor":
attribute_class_name = attribute_class_name[0]
else:
attribute_class_name = attribute_class_name[-1]

attribute_class = processor_class_from_name(attribute_class_name)
attribute_reloaded = attribute_class.from_pretrained(tmpdir)
for attribute in processor_first.get_attributes():
if attribute not in MODALITY_TO_AUTOPROCESSOR_MAPPING and "tokenizer" in attribute:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
attribute_reloaded = auto_processor_class.from_pretrained(tmpdir, subfolder=attribute)
else:
auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[attribute]
attribute_reloaded = auto_processor_class.from_pretrained(tmpdir)
attribute_first = getattr(processor_first, attribute)

# tokenizer repr contains model-path from where we loaded
Expand Down
Loading