 from typing import Any, Dict, Optional, Union
 
 import transformers as hf
-from transformers import AutoConfig, PretrainedConfig
+from transformers import AutoConfig, PreTrainedConfig
 from transformers.dynamic_module_utils import (
     get_class_from_dynamic_module,
     resolve_trust_remote_code,
@@ -42,6 +42,7 @@
     EncoderDecoderConfig,
 )
 from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE
+from transformers.tokenization_utils_tokenizers import TokenizersBackend
 from transformers.utils import cached_file
 
 from ...utils.download import DownloadSource, resolve_file_path
@@ -147,14 +148,14 @@ def get_paddleformers_tokenizer_config(
 
 
 def tokenizer_class_from_name(class_name: str) -> Union[type[Any], None]:
-    for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items():
-        if class_name in tokenizers:
+    for module_name, tokenizer_class in TOKENIZER_MAPPING_NAMES.items():
+        if tokenizer_class == class_name:
             module_name = model_type_to_module_name(module_name)
 
-            module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
             try:
+                module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
                 return getattr(module, class_name)
-            except AttributeError:
+            except (ModuleNotFoundError, AttributeError):
                 continue
 
     for tokenizers in TOKENIZER_MAPPING._extra_content.values():
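Side note on the tokenizer_class_from_name rewrite above: each mapping value is now treated as a single class name rather than a (slow, fast) tuple, and the import happens inside the try so that a missing optional module is skipped instead of aborting the lookup. A minimal, self-contained sketch of that pattern (the toy mapping and the paddleformers.transformers package path are illustrative assumptions, not taken from this diff):

import importlib
from typing import Optional

# Hypothetical stand-in for TOKENIZER_MAPPING_NAMES: model_type -> single class name.
TOY_MAPPING = {"bert": "BertTokenizer", "llama": "LlamaTokenizer"}

def lookup(class_name: str) -> Optional[type]:
    for module_name, candidate in TOY_MAPPING.items():
        if candidate == class_name:
            try:
                # Importing inside the try mirrors the new (ModuleNotFoundError, AttributeError) handling.
                module = importlib.import_module(f".{module_name}", "paddleformers.transformers")
                return getattr(module, class_name)
            except (ModuleNotFoundError, AttributeError):
                continue
    return None

print(lookup("BertTokenizer"))  # the class if importable, otherwise None rather than an ImportError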
@@ -228,38 +229,23 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         tokenizer_type = kwargs.pop("tokenizer_type", None)
         trust_remote_code = kwargs.pop("trust_remote_code", None)
         gguf_file = kwargs.get("gguf_file")
+        config_model_type = None
 
         # First, let's see whether the tokenizer_type is passed so that we can leverage it
         if tokenizer_type is not None:
-            tokenizer_class = None
-            tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
+            tokenizer_class_name = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None)
 
-            if tokenizer_class_tuple is None:
+            if tokenizer_class_name is None:
                 raise ValueError(
                     f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of "
                     f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES)}."
                 )
 
-            tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple
+            tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_name)
 
-            if use_fast:
-                if tokenizer_fast_class_name is not None:
-                    tokenizer_class = tokenizer_class_from_name_hf(tokenizer_fast_class_name)
-
-                    # Not found in Transformers, try local PaddleFormers registry
-                    if tokenizer_class is None:
-                        tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name)
-                else:
-                    logger.warning(
-                        "`use_fast` is set to `True` but the tokenizer class does not have a fast version. "
-                        " Falling back to the slow version."
-                    )
+            # Not found in Transformers, try local PaddleFormers registry
             if tokenizer_class is None:
-                tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_name)
-
-                # Not found in Transformers, try local PaddleFormers registry
-                if tokenizer_class is None:
-                    tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
+                tokenizer_class = tokenizer_class_from_name(tokenizer_class_name)
 
             if tokenizer_class is None:
                 raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.")
@@ -272,9 +258,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         # download tokenizer_config.json file to get tokenizer class name
         if download_hub == DownloadSource.HUGGINGFACE:
             tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
-            if "_commit_hash" in tokenizer_config:
-                kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
-            config_tokenizer_class = tokenizer_config.get("tokenizer_class")
         else:
             try:
                 tokenizer_config = get_paddleformers_tokenizer_config(pretrained_model_name_or_path, **kwargs)
@@ -299,7 +282,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
                     ) from None
                 else:
                     raise
-        config_tokenizer_class = tokenizer_config.get("tokenizer_class")
+
+        tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
 
         tokenizer_auto_map = None
         if "auto_map" in tokenizer_config:
@@ -309,54 +293,92 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
             else:
                 tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None)
 
-        # If that did not work, let's try to use the config.
-        if config_tokenizer_class is None:
-            if not isinstance(config, PretrainedConfig):
-                if gguf_file:
-                    gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
-                    config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
-                    config = AutoConfig.for_model(**config_dict)
-                else:
+        if tokenizer_config_class is None:
+            if gguf_file:
+                gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs)
+                config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"]
+                config = AutoConfig.for_model(**config_dict)
+            elif config is None:
+                try:
                     config = AutoConfig.from_pretrained(
                         pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs
                     )
-            config_tokenizer_class = config.tokenizer_class
+                except Exception:
+                    config = PreTrainedConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+            tokenizer_config_class = config.tokenizer_class
             if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map:
                 tokenizer_auto_map = config.auto_map["AutoTokenizer"]
 
+        if config:
+            config_model_type = config.get("model_type", None)
+
+        # if there is a config, we can check that the tokenizer class differs from the model class and can thus assume we need to use TokenizersBackend
+        # Skip this early exit if auto_map is present (custom tokenizer with trust_remote_code)
+        if (
+            tokenizer_auto_map is None
+            and tokenizer_config_class is not None
+            and config_model_type is not None
+            and config_model_type != ""
+            and TOKENIZER_MAPPING_NAMES.get(config_model_type, "").replace("Fast", "")
+            != tokenizer_config_class.replace("Fast", "")
+        ):
+            # new model, but we ignore it unless the model type is the same
+            try:
+                return TokenizersBackend.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+            except Exception:
+                tokenizer_class = tokenizer_class_from_name_hf(tokenizer_config_class)
+                # Not found in Transformers, try local PaddleFormers registry
+                if tokenizer_class is None:
+                    tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
+                return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
+        if "_commit_hash" in tokenizer_config:
+            kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]
+
         has_remote_code = tokenizer_auto_map is not None
         has_local_code = type(config) in TOKENIZER_MAPPING or (
-            config_tokenizer_class is not None
+            tokenizer_config_class is not None
             and (
-                tokenizer_class_from_name_hf(config_tokenizer_class) is not None
-                or tokenizer_class_from_name_hf(config_tokenizer_class + "Fast") is not None
+                tokenizer_class_from_name_hf(tokenizer_config_class) is not None
+                or tokenizer_class_from_name_hf(tokenizer_config_class + "Fast") is not None
             )
         )
 
-        if config_tokenizer_class is not None:
-            tokenizer_class = None
-            if use_fast and not config_tokenizer_class.endswith("Fast"):
-                tokenizer_class_candidate = f"{config_tokenizer_class}Fast"
-                tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_candidate)
-                # Not found in Transformers, try local PaddleFormers registry
-                if tokenizer_class is None:
-                    tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
-
+        if tokenizer_config_class is not None:
+            tokenizer_class_candidate = tokenizer_config_class
+            tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_candidate)
+            # Not found in Transformers, try local PaddleFormers registry
             if tokenizer_class is None:
-                tokenizer_class_candidate = config_tokenizer_class
+                tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+
+            if tokenizer_class is None and not tokenizer_config_class.endswith("Fast"):
+                tokenizer_class_candidate = f"{tokenizer_config_class}Fast"
                 tokenizer_class = tokenizer_class_from_name_hf(tokenizer_class_candidate)
                 # Not found in Transformers, try local PaddleFormers registry
                 if tokenizer_class is None:
                     tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+
+            if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
+                tokenizer_class = TokenizersBackend
+            # Fallback to TokenizersBackend if the class wasn't found
             if tokenizer_class is None:
-                raise ValueError(
-                    f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported."
-                )
+                tokenizer_class = TokenizersBackend
 
             # Bind PaddleTokenizerMixin
             tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
             return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
+        if getattr(config, "tokenizer_class", None):
+            _class = config.tokenizer_class
+            if "PreTrainedTokenizerFast" not in _class:
+                _class = _class.replace("Fast", "")
+            tokenizer_class = tokenizer_class_from_name_hf(_class)
+            # Not found in Transformers, try local PaddleFormers registry
+            if tokenizer_class is None:
+                tokenizer_class = tokenizer_class_from_name(_class)
+            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
         if has_remote_code:
             if use_fast and tokenizer_auto_map[1] is not None:
                 class_ref = tokenizer_auto_map[1]
@@ -406,11 +428,29 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
             # Bind PaddleTokenizerMixin
             tokenizer_class_py = _bind_paddle_mixin_if_available(tokenizer_class_py)
             return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        else:
-            raise ValueError(
-                "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed "
-                "in order to use this tokenizer."
-            )
+
+        # Fallback: try tokenizer_class from tokenizer_config.json
+        tokenizer_config_class = tokenizer_config.get("tokenizer_class", None)
+        if tokenizer_config_class is not None:
+            if tokenizer_config_class != "TokenizersBackend" and "Fast" in tokenizer_config_class:
+                tokenizer_config_class = tokenizer_config_class[:-4]
+
+            tokenizer_class = tokenizer_class_from_name_hf(tokenizer_config_class)
+            # Not found in Transformers, try local PaddleFormers registry
+            if tokenizer_class is None:
+                tokenizer_class = tokenizer_class_from_name(tokenizer_config_class)
+
+            if tokenizer_class is None and not tokenizer_config_class.endswith("Fast"):
+                tokenizer_class = tokenizer_class_from_name_hf(tokenizer_config_class + "Fast")
+                # Not found in Transformers, try local PaddleFormers registry
+                if tokenizer_class is None:
+                    tokenizer_class = tokenizer_class_from_name(tokenizer_config_class + "Fast")
+            if tokenizer_class is not None and tokenizer_class.__name__ == "PythonBackend":
+                tokenizer_class = TokenizersBackend
+            if tokenizer_class is None:
+                tokenizer_class = TokenizersBackend
+            tokenizer_class = _bind_paddle_mixin_if_available(tokenizer_class)
+            return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         raise ValueError(
             f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n"