@@ -4,6 +4,45 @@ import spacy
4
4
from typing import List, Tuple
5
5
6
6
7
+
8
# Cache of already-loaded spaCy pipelines, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """
    Return the spaCy pipeline for a model name, loading it at most once.

    @param spacy_model: model name handed to spacy.load
    @return: the pipeline object stored in loaded_models for that name
    """
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline
13
+
14
# Cache of compiled zip-code patterns, keyed by country ISO code.
compiled_regex = {}


def get_regex(country_id: str) -> "re.Pattern":
    """
    Return the compiled zip-code pattern for a country.

    The pattern source is read from zip_regex_lookup on first use and the
    compiled object is kept in compiled_regex for later calls.

    @param country_id: ISO code of a country (a key of zip_regex_lookup)
    @return: compiled regular expression for that country's zip codes
    @raise ValueError: if zip_regex_lookup has no pattern for country_id
    """
    # No `global` statement is needed: the cache dict is only mutated,
    # never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working; the message now names the bad code.
            raise ValueError(f"unknown country ISO code: {country_id!r}")
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
25
+
26
+
27
def zipcode_extraction(
    text: str,
    extraction_keyword: str,
    country_id: str,
    spacy_model: str = "en_core_web_sm",
) -> List[Tuple[str, int, int]]:
    """
    Find zip codes in a text and report them as labelled token ranges.

    @param text: the input text
    @param extraction_keyword: the label that is assigned to extracted words
    @param country_id: ISO code of a country
    @param spacy_model: name of the spaCy pipeline used to process the text
    @return: one (extraction_keyword, start, end) tuple per zip code match;
        start and end are the .start/.end of the spaCy span covering the
        match, i.e. token indices into the processed document, not
        character offsets
    @raise ValueError: propagated from get_regex if country_id is unknown
    """
    # Resolve the pattern first so an unknown country fails before the far
    # more expensive model load and pipeline run.
    regex = get_regex(country_id)

    matches = list(regex.finditer(text))
    if not matches:
        # Nothing to align, so skip loading and running the spaCy pipeline.
        return []

    nlp = load_spacy(spacy_model)
    doc = nlp(text)

    zipcode_positions = []
    for match in matches:
        start, end = match.span()
        # "expand" widens the character range to every token it touches.
        span = doc.char_span(start, end, alignment_mode="expand")
        if span is None:
            # char_span is documented as returning None when it cannot
            # build a span; skip such a match instead of crashing on
            # span.start below.
            continue
        zipcode_positions.append((extraction_keyword, span.start, span.end))
    return zipcode_positions
45
+
7
46
zip_regex_lookup = {
8
47
" GB" : r " GIR[ ]? 0AA| (( AB| AL| B| BA| BB| BD| BH| BL| BN| BR| BS| BT| CA| CB| CF| CH| CM| CO| CR| CT| CV| CW| DA| DD| DE| DG| DH| DL| DN| DT| DY| E| EC| EH| EN| EX| FK| FY| G| GL| GY| GU| HA| HD| HG| HP| HR| HS| HU| HX| IG| IM| IP| IV| JE| KA| KT| KW| KY| L| LA| LD| LE| LL| LN| LS| LU| M| ME| MK| ML| N| NE| NG| NN| NP| NR| NW| OL| OX| PA| PE| PH| PL| PO| PR| RG| RH| RM| S| SA| SE| SG| SK| SL| SM| SN| SO| SP| SR| SS| ST| SW| SY| TA| TD| TF| TN| TQ| TR| TS| TW| UB| W| WA| WC| WD| WF| WN| WR| WS| WV| YO| ZE) ( \d [\d A-Z ]? [ ]? \d [ABD-HJLN-UW-Z ]{2} )) | BFPO[ ]? \d {1,4} " ,
9
48
" JE" : r " JE\d [\d A-Z ]? [ ]? \d [ABD-HJLN-UW-Z ]{2} " ,
@@ -165,44 +204,6 @@ zip_regex_lookup = {
165
204
" YT" : r " 976\d {2} "
166
205
}
167
206
168
# Cache of already-loaded spaCy pipelines, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """
    Return the spaCy pipeline for a model name, loading it at most once.

    @param spacy_model: model name handed to spacy.load
    @return: the pipeline object stored in loaded_models for that name
    """
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline
173
-
174
# Cache of compiled zip-code patterns, keyed by country ISO code.
compiled_regex = {}


def get_regex(country_id: str) -> "re.Pattern":
    """
    Return the compiled zip-code pattern for a country.

    The pattern source is read from zip_regex_lookup on first use and the
    compiled object is kept in compiled_regex for later calls.

    @param country_id: ISO code of a country (a key of zip_regex_lookup)
    @return: compiled regular expression for that country's zip codes
    @raise ValueError: if zip_regex_lookup has no pattern for country_id
    """
    # No `global` statement is needed: the cache dict is only mutated,
    # never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working; the message now names the bad code.
            raise ValueError(f"unknown country ISO code: {country_id!r}")
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
185
-
186
-
187
def zipcode_extraction(
    text: str,
    extraction_keyword: str,
    country_id: str,
    spacy_model: str = "en_core_web_sm",
) -> List[Tuple[str, int, int]]:
    """
    Find zip codes in a text and report them as labelled token ranges.

    @param text: the input text
    @param extraction_keyword: the label that is assigned to extracted words
    @param country_id: ISO code of a country
    @param spacy_model: name of the spaCy pipeline used to process the text
    @return: one (extraction_keyword, start, end) tuple per zip code match;
        start and end are the .start/.end of the spaCy span covering the
        match, i.e. token indices into the processed document, not
        character offsets
    @raise ValueError: propagated from get_regex if country_id is unknown
    """
    # Resolve the pattern first so an unknown country fails before the far
    # more expensive model load and pipeline run.
    regex = get_regex(country_id)

    matches = list(regex.finditer(text))
    if not matches:
        # Nothing to align, so skip loading and running the spaCy pipeline.
        return []

    nlp = load_spacy(spacy_model)
    doc = nlp(text)

    zipcode_positions = []
    for match in matches:
        start, end = match.span()
        # "expand" widens the character range to every token it touches.
        span = doc.char_span(start, end, alignment_mode="expand")
        if span is None:
            # char_span is documented as returning None when it cannot
            # build a span; skip such a match instead of crashing on
            # span.start below.
            continue
        zipcode_positions.append((extraction_keyword, span.start, span.end))
    return zipcode_positions
205
-
206
207
# ↑ necessary bricks function
207
208
# -----------------------------------------------------------------------------------------
208
209
# ↓ example implementation (code further down below)
0 commit comments