14 files changed (+32 −39 lines)

@@ -18,7 +18,8 @@
 import unicodedata
 from shutil import copyfile

-from paddle.utils import try_import
+import sentencepiece as spm
+
 from .. import PretrainedTokenizer, BertTokenizer, AddedToken

 __all__ = ['AlbertTokenizer']
@@ -581,8 +582,6 @@ def __init__(self,
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.sentencepiece_model_file = sentencepiece_model_file
-
-        spm = try_import("sentencepiece")
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(sentencepiece_model_file)

@@ -597,7 +596,6 @@ def __getstate__(self):

     def __setstate__(self, d):
         self.__dict__ = d
-        spm = try_import("sentencepiece")
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.sentencepiece_model_file)
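
The Albert hunks above only drop the lazy `try_import` call; the `__getstate__`/`__setstate__` pair that rebuilds the processor after unpickling stays as it was, now using the module-level `spm` name. A minimal sketch of that pattern outside PaddleNLP (the `DemoSpTokenizer` class, and the assumption that `__getstate__` blanks out the processor in the pickled state, are illustrative and not taken from this diff):

import sentencepiece as spm


class DemoSpTokenizer:
    # Illustrative only: wraps a SentencePiece processor, a C++-backed object
    # that cannot be pickled directly.
    def __init__(self, sentencepiece_model_file):
        self.sentencepiece_model_file = sentencepiece_model_file
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(sentencepiece_model_file)

    def __getstate__(self):
        # Drop the processor from the pickled state; keep only the model path.
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        # Rebuild the processor from the stored model path when unpickling.
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.sentencepiece_model_file)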

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import itertools
+
 from paddle.utils import try_import
 from .. import GPTTokenizer, AddedToken


@@ -18,8 +18,10 @@
 import six
 import re
 import numpy as np
-from paddle.utils import try_import
+
+import sentencepiece as spm
 from paddlenlp.data import Vocab
+
 from .. import PretrainedTokenizer, AddedToken

 __all__ = ['BigBirdTokenizer']
@@ -95,8 +97,7 @@ def __init__(self,
                 "`tokenizer = BigBirdTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                 .format(sentencepiece_model_file))
         self.encoding = encoding
-        mod = try_import('sentencepiece')
-        self.sp_model = mod.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor()
         if os.path.isfile(sentencepiece_model_file):
             self.sp_model.Load(sentencepiece_model_file)
         vocab_dict = {}

@@ -18,7 +18,8 @@
 import six
 import shutil

-from paddle.utils import try_import
+import sentencepiece as spm
+
 from paddlenlp.utils.env import MODEL_HOME

 from .. import BasicTokenizer, PretrainedTokenizer, WordpieceTokenizer
@@ -425,8 +426,7 @@ def __init__(self,
                 cls_token="[CLS]",
                 mask_token="[MASK]",
                 **kwargs):
-        mod = try_import('sentencepiece')
-        self.sp_model = mod.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor()
         self.word_dict = word_dict

         self.do_lower_case = do_lower_case

@@ -14,7 +14,7 @@

 import os

-from paddle.utils import try_import
+import sentencepiece as spm

 from .. import PretrainedTokenizer
 from ..tokenizer_utils import _is_control, _is_whitespace
@@ -91,8 +91,7 @@ def __init__(self,
                 pad_token="[PAD]",
                 cls_token="[CLS]",
                 mask_token="[MASK]"):
-        mod = try_import('sentencepiece')
-        self.sp_model = mod.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor()

         self.do_lower_case = do_lower_case
         self.encoding = encoding
19
import json
20
20
import jieba
21
21
import shutil
22
+ import sentencepiece as spm
22
23
from paddle .utils import try_import
23
24
24
- from paddlenlp .utils .log import logger
25
-
26
25
from .. import PretrainedTokenizer , AddedToken
27
26
28
27
__all__ = [
@@ -135,8 +134,8 @@ def __init__(
135
134
"`tokenizer = GPTTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
136
135
.format (model_file ))
137
136
self .max_len = max_len if max_len is not None else int (1e12 )
138
- mod = try_import ( "sentencepiece" )
139
- self .sp = mod . SentencePieceProcessor ( model_file = model_file )
137
+ self . sp = spm . SentencePieceProcessor ( )
138
+ self .sp . Load ( model_file )
140
139
self .translator = str .maketrans (" \n " , "\u2582 \u2583 " )
141
140
142
141
'''
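
In the GPT hunk above, the one-step constructor `SentencePieceProcessor(model_file=model_file)` is replaced by construct-then-`Load`. Both forms exist in the sentencepiece Python API; the keyword-argument constructor loads the model at construction time but needs a reasonably recent sentencepiece release, while `Load()` also works on older ones. A small sketch of the equivalence (the `spiece.model` path is a hypothetical placeholder):

import sentencepiece as spm

MODEL_FILE = "spiece.model"  # placeholder path to a trained SentencePiece model

# Construct, then load explicitly: the form used after this change.
sp = spm.SentencePieceProcessor()
sp.Load(MODEL_FILE)

# Load via the constructor keyword argument: the form used before this change.
sp_kw = spm.SentencePieceProcessor(model_file=MODEL_FILE)

# Both processors tokenize identically once loaded.
assert sp.EncodeAsPieces("hello world") == sp_kw.EncodeAsPieces("hello world")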

@@ -17,9 +17,10 @@
 import itertools
 from dataclasses import dataclass, field
 from collections import OrderedDict
-from paddle.utils import try_import
 from typing import List, Optional

+import sentencepiece as spm
+
 from .. import PretrainedTokenizer, AddedToken
 from ..tokenizer_utils import _is_punctuation, _is_control, _is_whitespace

@@ -90,7 +91,6 @@ def __init__(self,
         self._unk_token = unk_token
         self._pad_token = pad_token
         self._mask_token = mask_token
-        spm = try_import("sentencepiece")
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(vocab_file)
         self.vocab_file = vocab_file

@@ -14,7 +14,8 @@

 import itertools
 from contextlib import contextmanager
-from paddle.utils import try_import
+import sentencepiece as spm
+
 from .. import PretrainedTokenizer, AddedToken

 __all__ = ['MBartTokenizer']
@@ -239,7 +240,6 @@ def __init__(self,
             mask_token, lstrip=True,
             rstrip=False) if isinstance(mask_token, str) else mask_token
         self._build_special_tokens_map_extended(mask_token=mask_token)
-        spm = try_import('sentencepiece')
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(str(vocab_file))
         self.fairseq_offset = 1
@@ -432,7 +432,6 @@ def __init__(self,
             mask_token, lstrip=True,
             rstrip=False) if isinstance(mask_token, str) else mask_token
         self._build_special_tokens_map_extended(mask_token=mask_token)
-        spm = try_import('sentencepiece')
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(str(vocab_file))
         self.fairseq_offset = 1

@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from paddle.utils import try_import
+import sentencepiece as spm
+
 from ..albert.tokenizer import AlbertEnglishTokenizer

 __all__ = ['ReformerTokenizer']
@@ -76,8 +77,6 @@ def __init__(self,
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.sentencepiece_model_file = sentencepiece_model_file
-
-        spm = try_import("sentencepiece")
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(sentencepiece_model_file)


@@ -16,10 +16,8 @@
 from shutil import copyfile
 from typing import List, Optional, Tuple

-try:
-    import sentencepiece as spm
-except:
-    pass
+import sentencepiece as spm
+
 from .. import PretrainedTokenizer

 __all__ = ['RemBertTokenizer']
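
Every hunk in this diff makes the same trade: `paddle.utils.try_import("sentencepiece")` (and RemBert's bare `try/except` guard) deferred or swallowed the missing-dependency error until a tokenizer was actually constructed, whereas a module-level `import sentencepiece as spm` fails as soon as the tokenizer module is imported. A rough before/after sketch of the two behaviours (the `build_lazy`/`build_eager` helpers and the `model_path` argument are illustrative, not part of the PR):

# Before: the dependency is resolved lazily, inside the constructor.
from paddle.utils import try_import

def build_lazy(model_path):
    spm = try_import("sentencepiece")  # raises only if this line runs without sentencepiece installed
    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    return sp

# After: the dependency is resolved eagerly, at module import time.
import sentencepiece as spm

def build_eager(model_path):
    sp = spm.SentencePieceProcessor()
    sp.Load(model_path)
    return sp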