@@ -61,6 +61,7 @@ class InternS1CheckModuleMixin(ABC):
61
61
62
62
Note that short strings are ignored by this module.
63
63
"""
64
+
64
65
def __init__ (self , * , min_length : int ):
65
66
self .min_length = min_length
66
67
self .REGEX = self ._build_regex ()
@@ -123,6 +124,7 @@ class FastaCheckModule(InternS1CheckModuleMixin):
123
124
124
125
Automatically detects protein sequence using regex patterns.
125
126
"""
127
+
126
128
def __init__ (self , * , min_length : int = 27 ):
127
129
super ().__init__ (min_length = min_length )
128
130
self .auto_detect_token = ["<FASTA_AUTO_DETECT>" , "</FASTA_AUTO_DETECT>" ]
@@ -135,6 +137,7 @@ def check_legitimacy(self, candidate: str):
135
137
return True
136
138
137
139
140
+ # fmt: off
138
141
bonds = ["-" , "=" , "#" , ":" , "/" , "\\ " , "." , "$" ]
139
142
organic_symbols = ["B" , "C" , "N" , "O" , "P" , "S" , "F" , "Cl" , "Br" , "I" ]
140
143
other_allows = bonds + ["[" , "]" , "(" , ")" , ";" ]
@@ -153,6 +156,7 @@ def check_legitimacy(self, candidate: str):
153
156
"Md" , "No" , "Lr" , "Rf" , "Db" , "Sg" , "Bh" , "Hs" , "Mt" , "Ds" ,
154
157
"Rg" , "Cn" , "Nh" , "Fl" , "Mc" , "Lv" , "Ts" , "Og"
155
158
]
159
+ # fmt: on
156
160
157
161
158
162
class SmilesCheckModule (InternS1CheckModuleMixin ):
@@ -163,13 +167,15 @@ class SmilesCheckModule(InternS1CheckModuleMixin):
163
167
or chemical syntax rules. Uses RDKit for precise validation when available,
164
168
otherwise falls back to rule-based validation.
165
169
"""
170
+
166
171
def __init__ (self , * , min_length : int = 10 ):
167
172
super ().__init__ (min_length = min_length )
168
173
self .auto_detect_token = ["<SMILES_AUTO_DETECT>" , "</SMILES_AUTO_DETECT>" ]
169
- self ._SQ_BRACKET_BAN_1 = re .compile (r' (?:[A-GI-Z]|[a-z]){3,}' )
170
- self ._SQ_BRACKET_BAN_2 = re .compile (r' \d{4,}' )
174
+ self ._SQ_BRACKET_BAN_1 = re .compile (r" (?:[A-GI-Z]|[a-z]){3,}" )
175
+ self ._SQ_BRACKET_BAN_2 = re .compile (r" \d{4,}" )
171
176
172
177
def _build_regex (self ):
178
+ # fmt: off
173
179
_two_letter_elements = [
174
180
'Ac' , 'Ag' , 'Al' , 'Am' , 'Ar' , 'As' , 'At' , 'Au' , 'Ba' , 'Be' , 'Bh' , 'Bi' , 'Bk' , 'Br' , 'Ca' , 'Cd' ,
175
181
'Ce' , 'Cf' , 'Cl' , 'Cm' , 'Cn' , 'Co' , 'Cr' , 'Cs' , 'Cu' , 'Db' , 'Ds' , 'Dy' , 'Er' , 'Es' , 'Eu' , 'Fe' ,
@@ -182,6 +188,7 @@ def _build_regex(self):
182
188
_single_letter_elements = [
183
189
"B" , "C" , "F" , "H" , "I" , "K" , "N" , "O" , "P" , "S" , "U" , "V" , "W" , "Y" , 'b' , 'c' , 'n' , 'o' , 'p' , 's'
184
190
]
191
+ # fmt: on
185
192
all_elements_sorted = sorted (_two_letter_elements + _single_letter_elements , key = lambda x : (- len (x ), x ))
186
193
elements_pattern_str = "|" .join (all_elements_sorted )
187
194
@@ -263,17 +270,17 @@ def check_rings_and_brackets(self, text):
263
270
left_sq_bracket += 1
264
271
if left_sq_bracket > right_sq_bracket + 1 :
265
272
return False
266
- if pos == len (text )- 1 :
273
+ if pos == len (text ) - 1 :
267
274
return False
268
- if ']' not in text [pos + 1 :]:
275
+ if "]" not in text [pos + 1 :]:
269
276
return False
270
- bracket_span = text [pos + 1 : text .find (']' )]
277
+ bracket_span = text [pos + 1 : text .find ("]" )]
271
278
272
279
if self ._SQ_BRACKET_BAN_1 .search (bracket_span ) or self ._SQ_BRACKET_BAN_2 .search (bracket_span ):
273
280
return False
274
281
275
- matches = re .findall (r' \d+' , bracket_span )
276
- if len (matches )> 2 :
282
+ matches = re .findall (r" \d+" , bracket_span )
283
+ if len (matches ) > 2 :
277
284
return False
278
285
if c == "]" :
279
286
step = 1
@@ -477,7 +484,9 @@ def __init__(
477
484
for token in self .protect_end_sp_tokens :
478
485
self .tokens_trie .add (token )
479
486
480
- self .new_sp_token_offset .append (len (self ._added_tokens_decoder ) - sum (self .new_sp_token_offset ) + len (self ._extra_special_tokens ))
487
+ self .new_sp_token_offset .append (
488
+ len (self ._added_tokens_decoder ) - sum (self .new_sp_token_offset ) + len (self ._extra_special_tokens )
489
+ )
481
490
self .check_module_list = [SmilesCheckModule (), FastaCheckModule ()]
482
491
483
492
@property
0 commit comments