Skip to content

Commit 25d93c6

Browse files
authored
Feat(processor): Support transformers v5.0 by optimizing the processor (#3742)
1 parent 1d6500a commit 25d93c6

32 files changed

+477
-244
lines changed

paddleformers/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@ def _check_dependency_versions():
7777
import os
7878

7979
PADDLEFORMERS_TESTING = os.environ.get("PADDLEFORMERS_TESTING", False)
80+
sys.modules["torchcodec"] = None # Explicitly disable torchcodec to prevent optional dependency issues
8081
if "torch" not in sys.modules and not PADDLEFORMERS_TESTING:
8182
sys.modules["torch"] = None
8283
sys.modules["torchvision"] = None

paddleformers/cli/train/sft/sft_trainer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,10 @@
3636
from paddle.io import BatchSampler, DataLoader, DistributedBatchSampler
3737

3838
if TYPE_CHECKING:
39-
from transformers.tokenization_utils import PreTrainedTokenizer
39+
try:
40+
from transformers.tokenization_python import PreTrainedTokenizer
41+
except ImportError:
42+
from transformers.tokenization_utils import PreTrainedTokenizer
4043

4144
from paddleformers.data import DataCollator, DataCollatorForSeq2Seq
4245
from paddleformers.trainer import Trainer

paddleformers/cli/utils/llm_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@
2828
from sklearn.metrics import accuracy_score
2929

3030
if TYPE_CHECKING:
31-
from transformers.tokenization_utils import PreTrainedTokenizer
31+
try:
32+
from transformers.tokenization_python import PreTrainedTokenizer
33+
except ImportError:
34+
from transformers.tokenization_utils import PreTrainedTokenizer
3235

3336
from paddleformers.generation import GenerationConfig
3437
from paddleformers.transformers import ( # ChatGLMv2Tokenizer,

paddleformers/datasets/rlhf_datasets/rl_dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,10 @@
2222
from paddle.io import Dataset
2323

2424
if TYPE_CHECKING:
25-
from transformers.tokenization_utils import PreTrainedTokenizer
25+
try:
26+
from transformers.tokenization_python import PreTrainedTokenizer
27+
except ImportError:
28+
from transformers.tokenization_utils import PreTrainedTokenizer
2629

2730
from transformers.utils import PaddingStrategy
2831

paddleformers/generation/streamers.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,10 @@
1717
from typing import TYPE_CHECKING, Optional
1818

1919
if TYPE_CHECKING:
20-
from transformers.tokenization_utils import PreTrainedTokenizer
20+
try:
21+
from transformers.tokenization_python import PreTrainedTokenizer
22+
except ImportError:
23+
from transformers.tokenization_utils import PreTrainedTokenizer
2124

2225

2326
class BaseStreamer:

paddleformers/trainer/trainer.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,10 @@
115115
else:
116116
get_batch_on_this_cp_rank = None
117117
if TYPE_CHECKING:
118-
from transformers.tokenization_utils import PreTrainedTokenizer
118+
try:
119+
from transformers.tokenization_python import PreTrainedTokenizer
120+
except ImportError:
121+
from transformers.tokenization_utils import PreTrainedTokenizer
119122

120123
from paddle.framework.recall_error import LOSS_INF_ERROR, LOSS_NAN_ERROR
121124

paddleformers/transformers/auto/image_processing.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
{
5454
"glm4v": (),
5555
"glm4v_moe": ("Glm4vImageProcessor", "Glm4vImageProcessorFast"),
56+
"paddleocr_vl": (),
5657
"qwen2_5_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
5758
"qwen2_vl": ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast"),
5859
"qwen3_vl": ("Qwen3VLImageProcessor", "Qwen3VLImageProcessorFast"),
@@ -72,10 +73,10 @@ def get_image_processor_class_from_name(class_name: str):
7273
if class_name in extractors:
7374
module_name = model_type_to_module_name(module_name)
7475

75-
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
7676
try:
77+
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
7778
return getattr(module, class_name)
78-
except AttributeError:
79+
except (ModuleNotFoundError, AttributeError):
7980
continue
8081

8182
for extractor in IMAGE_PROCESSOR_MAPPING._extra_content.values():

paddleformers/transformers/auto/processing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,10 @@ def processor_class_from_name(class_name: str):
6464
if class_name in extractors:
6565
module_name = model_type_to_module_name(module_name)
6666

67-
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
6867
try:
68+
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
6969
return getattr(module, class_name)
70-
except AttributeError:
70+
except (ModuleNotFoundError, AttributeError):
7171
continue
7272

7373
for extractor in PROCESSOR_MAPPING._extra_content.values():

paddleformers/transformers/auto/tokenizer.py

Lines changed: 100 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
from typing import Any, Dict, Optional, Union
2020

2121
import transformers as hf
22-
from transformers import AutoConfig, PretrainedConfig
22+
from transformers import AutoConfig, PreTrainedConfig
2323
from transformers.dynamic_module_utils import (
2424
get_class_from_dynamic_module,
2525
resolve_trust_remote_code,
@@ -42,6 +42,7 @@
4242
EncoderDecoderConfig,
4343
)
4444
from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
45+
from transformers.tokenization_utils_tokenizers import TokenizersBackend
4546
from transformers.utils import cached_file
4647

4748
from ...utils.download import DownloadSource, resolve_file_path
@@ -147,14 +148,14 @@ def get_paddleformers_tokenizer_config(
147148

148149

149150
def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
150-
for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
151-
if class_name in tokenizers:
151+
for module_name, tokenizer_class in TOKENIZER_MAPPING_NAMES.items():
152+
if tokenizer_class == class_name:
152153
module_name = model_type_to_module_name(module_name)
153154

154-
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
155155
try:
156+
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
156157
return getattr(module, class_name)
157-
except AttributeError:
158+
except (ModuleNotFoundError, AttributeError):
158159
continue
159160

160161
for tokenizers in TOKENIZER_MAPPING._extra_content.values():
@@ -228,38 +229,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
228229
tokenizer_type = kwargs.pop("tokenizer_type", None)
229230
trust_remote_code = kwargs.pop("trust_remote_code", None)
230231
gguf_file = kwargs.get("gguf_file")
232+
config_model_type = None
231233

232234
# First, let's see whether the tokenizer_type is passed so that we can leverage it
233235
if tokenizer_type is not None:
234-
tokenizer_class = None
235-
tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
236+
tokenizer_class_name = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
236237

237-
if tokenizer_class_tuple is None:
238+
if tokenizer_class_name is None:
238239
raise ValueError(
239240
f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
240241
f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}."
241242
)
242243

243-
tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple
244+
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_name)
244245

245-
if use_fast:
246-
if tokenizer_fast_class_name is not None:
247-
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_fast_class_name)
248-
249-
# Not found in Transformers, try local PaddleFormers registry
250-
if tokenizer_class is None:
251-
tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
252-
else:
253-
logger.warning(
254-
"`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
255-
" Falling back to the slow version."
256-
)
246+
# Not found in Transformers, try local PaddleFormers registry
257247
if tokenizer_class is None:
258-
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_name)
259-
260-
# Not found in Transformers, try local PaddleFormers registry
261-
if tokenizer_class is None:
262-
tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
248+
tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
263249

264250
if tokenizer_class is None:
265251
raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
@@ -272,9 +258,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
272258
# download tokenizer_config.json file to get tokenizer class name
273259
if download_hub == DownloadSource.HUGGINGFACE:
274260
tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
275-
if "_commit_hash" in tokenizer_config:
276-
kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
277-
config_tokenizer_class = tokenizer_config.get("tokenizer_class")
278261
else:
279262
try:
280263
tokenizer_config = get_paddleformers_tokenizer_config(pretrained_model_name_or_path, **kwargs)
@@ -299,7 +282,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
299282
) from None
300283
else:
301284
raise
302-
config_tokenizer_class = tokenizer_config.get("tokenizer_class")
285+
286+
tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
303287

304288
tokenizer_auto_map = None
305289
if "auto_map" in tokenizer_config:
@@ -309,54 +293,92 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
309293
else:
310294
tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
311295

312-
# If that did not work, let's try to use the config.
313-
if config_tokenizer_class is None:
314-
if not isinstance(config, PretrainedConfig):
315-
if gguf_file:
316-
gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
317-
config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
318-
config = AutoConfig.for_model(**config_dict)
319-
else:
296+
if tokenizer_config_class is None:
297+
if gguf_file:
298+
gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
299+
config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
300+
config = AutoConfig.for_model(**config_dict)
301+
elif config is None:
302+
try:
320303
config = AutoConfig.from_pretrained(
321304
pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
322305
)
323-
config_tokenizer_class = config.tokenizer_class
306+
except Exception:
307+
config = PreTrainedConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
308+
309+
tokenizer_config_class = config.tokenizer_class
324310
if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
325311
tokenizer_auto_map = config.auto_map["AutoTokenizer"]
326312

313+
if config:
314+
config_model_type = config.get("model_type", None)
315+
316+
# if there is a config, we can check that the tokenizer class != than model class and can thus assume we need to use TokenizersBackend
317+
# Skip this early exit if auto_map is present (custom tokenizer with trust_remote_code)
318+
if (
319+
tokenizer_auto_map is None
320+
and tokenizer_config_class is not None
321+
and config_model_type is not None
322+
and config_model_type != ""
323+
and TOKENIZER_MAPPING_NAMES.get(config_model_type, "").replace("Fast", "")
324+
!= tokenizer_config_class.replace("Fast", "")
325+
):
326+
# new model, but we ignore it unless the model type is the same
327+
try:
328+
return TokenizersBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
329+
except Exception:
330+
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_config_class)
331+
# Not found in Transformers, try local PaddleFormers registry
332+
if tokenizer_class is None:
333+
tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
334+
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
335+
336+
if "_commit_hash" in tokenizer_config:
337+
kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
338+
327339
has_remote_code = tokenizer_auto_map is not None
328340
has_local_code = type(config) in TOKENIZER_MAPPING or (
329-
config_tokenizer_class is not None
341+
tokenizer_config_class is not None
330342
and (
331-
tokenizer_class_from_name_hf(config_tokenizer_class) is not None
332-
or tokenizer_class_from_name_hf(config_tokenizer_class + "Fast") is not None
343+
tokenizer_class_from_name_hf(tokenizer_config_class) is not None
344+
or tokenizer_class_from_name_hf(tokenizer_config_class + "Fast") is not None
333345
)
334346
)
335347

336-
if config_tokenizer_class is not None:
337-
tokenizer_class = None
338-
if use_fast and not config_tokenizer_class.endswith("Fast"):
339-
tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
340-
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_candidate)
341-
# Not found in Transformers, try local PaddleFormers registry
342-
if tokenizer_class is None:
343-
tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
344-
348+
if tokenizer_config_class is not None:
349+
tokenizer_class_candidate = tokenizer_config_class
350+
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_candidate)
351+
# Not found in Transformers, try local PaddleFormers registry
345352
if tokenizer_class is None:
346-
tokenizer_class_candidate = config_tokenizer_class
353+
tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
354+
355+
if tokenizer_class is None and not tokenizer_config_class.endswith("Fast"):
356+
tokenizer_class_candidate = f"{tokenizer_config_class}Fast"
347357
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_candidate)
348358
# Not found in Transformers, try local PaddleFormers registry
349359
if tokenizer_class is None:
350360
tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
361+
362+
if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
363+
tokenizer_class = TokenizersBackend
364+
# Fallback to TokenizersBackend if the class wasn't found
351365
if tokenizer_class is None:
352-
raise ValueError(
353-
f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
354-
)
366+
tokenizer_class = TokenizersBackend
355367

356368
# Bind PaddleTokenizerMixin
357369
tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
358370
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
359371

372+
if getattr(config, "tokenizer_class", None):
373+
_class = config.tokenizer_class
374+
if "PreTrainedTokenizerFast" not in _class:
375+
_class = _class.replace("Fast", "")
376+
tokenizer_class = tokenizer_class_from_name_hf(_class)
377+
# Not found in Transformers, try local PaddleFormers registry
378+
if tokenizer_class is None:
379+
tokenizer_class = tokenizer_class_from_name(_class)
380+
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
381+
360382
if has_remote_code:
361383
if use_fast and tokenizer_auto_map[1] is not None:
362384
class_ref = tokenizer_auto_map[1]
@@ -406,11 +428,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
406428
# Bind PaddleTokenizerMixin
407429
tokenizer_class_py = _bind_paddle_mixin_if_available(tokenizer_class_py)
408430
return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
409-
else:
410-
raise ValueError(
411-
"This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
412-
"in order to use this tokenizer."
413-
)
431+
432+
# Fallback: try tokenizer_class from tokenizer_config.json
433+
tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
434+
if tokenizer_config_class is not None:
435+
if tokenizer_config_class != "TokenizersBackend" and "Fast" in tokenizer_config_class:
436+
tokenizer_config_class = tokenizer_config_class[:-4]
437+
438+
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_config_class)
439+
# Not found in Transformers, try local PaddleFormers registry
440+
if tokenizer_class is None:
441+
tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
442+
443+
if tokenizer_class is None and not tokenizer_config_class.endswith("Fast"):
444+
tokenizer_class = tokenizer_class_from_name_hf(tokenizer_config_class + "Fast")
445+
# Not found in Transformers, try local PaddleFormers registry
446+
if tokenizer_class is None:
447+
tokenizer_class = tokenizer_class_from_name(tokenizer_config_class + "Fast")
448+
if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
449+
tokenizer_class = TokenizersBackend
450+
if tokenizer_class is None:
451+
tokenizer_class = TokenizersBackend
452+
tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
453+
return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
414454

415455
raise ValueError(
416456
f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"

paddleformers/transformers/auto/video_processing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,10 +55,10 @@ def video_processor_class_from_name(class_name: str):
5555
if class_name in extractors:
5656
module_name = model_type_to_module_name(module_name)
5757

58-
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
5958
try:
59+
module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
6060
return getattr(module, class_name)
61-
except AttributeError:
61+
except (ModuleNotFoundError, AttributeError):
6262
continue
6363

6464
for extractor in VIDEO_PROCESSOR_MAPPING._extra_content.values():

0 commit comments

Comments
 (0)