Skip to content

Commit 70ce956

Browse files
authored
moved zip code lookup
1 parent a498d0e commit 70ce956

File tree

1 file changed

+39
-38
lines changed

1 file changed

+39
-38
lines changed

extractors/personal_identifiers/zipcode_extraction/code_snippet_common.md

Lines changed: 39 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,45 @@ import spacy
44
from typing import List, Tuple
55

66

7+
8+
# Cache of already-loaded spaCy pipelines, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """
    Return the spaCy pipeline for a model name, loading it on first use.

    @param spacy_model: name of the spaCy model, passed to spacy.load
    @return: the pipeline object stored in loaded_models for that name
    """
    # Fast path: this model name has been requested before.
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    # First request for this name: load it and remember it for later calls.
    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline
13+
14+
# Cache of compiled zip code patterns, keyed by country ISO code.
compiled_regex = {}


def get_regex(country_id: str):
    """
    Return the compiled zip code pattern for a country, compiling it on first use.

    @param country_id: ISO code of a country; looked up as a key of zip_regex_lookup
    @return: the compiled regular expression stored in compiled_regex for that code
    @raise ValueError: if zip_regex_lookup has no non-empty pattern for country_id
    """
    # No `global` statement needed: the cache dict is mutated, never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is a subclass of Exception, so callers that catch the
            # broader type keep working; the message text is unchanged.
            raise ValueError("unknown country ISO code")
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
25+
26+
27+
def zipcode_extraction(
    text: str,
    extraction_keyword: str,
    country_id: str,
    spacy_model: str = "en_core_web_sm",
) -> List[Tuple[str, int, int]]:
    """
    Find zip codes of one country in a text and report them as spaCy span positions.

    @param text: the input text
    @param extraction_keyword: the label that is assigned to extracted words
    @param country_id: ISO code of a country
    @param spacy_model: name of the spaCy model used to process the text
    @return: one (extraction_keyword, span.start, span.end) tuple per regex match,
        in the order the matches occur in the text
    """
    # The model is loaded and run before the regex lookup, so an unknown
    # country_id is only reported after the text has been processed.
    document = load_spacy(spacy_model)(text)
    pattern = get_regex(country_id)

    # Map each character-level regex match onto the document; "expand" is the
    # alignment mode handed to char_span for matches that do not line up with
    # token boundaries.
    matched_spans = (
        document.char_span(*hit.span(), alignment_mode="expand")
        for hit in pattern.finditer(text)
    )
    return [(extraction_keyword, found.start, found.end) for found in matched_spans]
45+
746
zip_regex_lookup = {
847
"GB": r"GIR[ ]?0AA|((AB|AL|B|BA|BB|BD|BH|BL|BN|BR|BS|BT|CA|CB|CF|CH|CM|CO|CR|CT|CV|CW|DA|DD|DE|DG|DH|DL|DN|DT|DY|E|EC|EH|EN|EX|FK|FY|G|GL|GY|GU|HA|HD|HG|HP|HR|HS|HU|HX|IG|IM|IP|IV|JE|KA|KT|KW|KY|L|LA|LD|LE|LL|LN|LS|LU|M|ME|MK|ML|N|NE|NG|NN|NP|NR|NW|OL|OX|PA|PE|PH|PL|PO|PR|RG|RH|RM|S|SA|SE|SG|SK|SL|SM|SN|SO|SP|SR|SS|ST|SW|SY|TA|TD|TF|TN|TQ|TR|TS|TW|UB|W|WA|WC|WD|WF|WN|WR|WS|WV|YO|ZE)(\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}))|BFPO[ ]?\d{1,4}",
948
"JE": r"JE\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
@@ -165,44 +204,6 @@ zip_regex_lookup = {
165204
"YT": r"976\d{2}"
166205
}
167206

168-
# Cache of already-loaded spaCy pipelines, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """
    Return the spaCy pipeline for a model name, loading it on first use.

    @param spacy_model: name of the spaCy model, passed to spacy.load
    @return: the pipeline object stored in loaded_models for that name
    """
    # Fast path: this model name has been requested before.
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    # First request for this name: load it and remember it for later calls.
    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline
173-
174-
# Cache of compiled zip code patterns, keyed by country ISO code.
compiled_regex = {}


def get_regex(country_id: str):
    """
    Return the compiled zip code pattern for a country, compiling it on first use.

    @param country_id: ISO code of a country; looked up as a key of zip_regex_lookup
    @return: the compiled regular expression stored in compiled_regex for that code
    @raise ValueError: if zip_regex_lookup has no non-empty pattern for country_id
    """
    # No `global` statement needed: the cache dict is mutated, never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is a subclass of Exception, so callers that catch the
            # broader type keep working; the message text is unchanged.
            raise ValueError("unknown country ISO code")
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
185-
186-
187-
def zipcode_extraction(
    text: str,
    extraction_keyword: str,
    country_id: str,
    spacy_model: str = "en_core_web_sm",
) -> List[Tuple[str, int, int]]:
    """
    Find zip codes of one country in a text and report them as spaCy span positions.

    @param text: the input text
    @param extraction_keyword: the label that is assigned to extracted words
    @param country_id: ISO code of a country
    @param spacy_model: name of the spaCy model used to process the text
    @return: one (extraction_keyword, span.start, span.end) tuple per regex match,
        in the order the matches occur in the text
    """
    # The model is loaded and run before the regex lookup, so an unknown
    # country_id is only reported after the text has been processed.
    document = load_spacy(spacy_model)(text)
    pattern = get_regex(country_id)

    # Map each character-level regex match onto the document; "expand" is the
    # alignment mode handed to char_span for matches that do not line up with
    # token boundaries.
    matched_spans = (
        document.char_span(*hit.span(), alignment_mode="expand")
        for hit in pattern.finditer(text)
    )
    return [(extraction_keyword, found.start, found.end) for found in matched_spans]
205-
206207
# ↑ necessary bricks function
207208
# -----------------------------------------------------------------------------------------
208209
# ↓ example implementation (code further down below)

0 commit comments

Comments
 (0)