Skip to content

Commit b596984

Browse files
authored
Merge pull request #376 from code-kern-ai/zipcode-extraction
check for match & remove faulty backslash
2 parents e492728 + 70ce956 commit b596984

File tree

2 files changed

+331
-304
lines changed

2 files changed

+331
-304
lines changed

extractors/personal_identifiers/zipcode_extraction/code_snippet_common.md

Lines changed: 173 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,38 @@ import re
33
import spacy
44
from typing import List, Tuple
55

6-
def zipcode_extraction(text: str, extraction_keyword: str, country_id: str) -> List[Tuple[str, int]]:
6+
7+
8+
# Cache of already-loaded spaCy pipelines, keyed by model name, so that each
# model is loaded at most once per process.
loaded_models = {}


def load_spacy(spacy_model: str):
    """
    @param spacy_model: name of the spaCy model to load
    @return: the pipeline returned by spacy.load for that name; loaded on the
        first request and served from loaded_models afterwards
    """
    if spacy_model not in loaded_models:
        loaded_models[spacy_model] = spacy.load(spacy_model)
    return loaded_models[spacy_model]
13+
14+
# Cache of compiled zip code patterns, keyed by country code.
compiled_regex = {}


def get_regex(country_id: str):
    """
    @param country_id: ISO code of a country (a key of zip_regex_lookup)
    @return: the compiled zip code pattern for that country; compiled on the
        first request and served from compiled_regex afterwards
    @raise ValueError: if zip_regex_lookup has no pattern for country_id
    """
    # No `global` statement is needed: the cache dict is mutated, never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is still caught by callers handling plain Exception.
            raise ValueError(f"unknown country ISO code: {country_id!r}")
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
25+
26+
27+
def zipcode_extraction(text: str, extraction_keyword: str, country_id: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]:
728
"""
829
@param text: the input text
930
@param extraction_keyword: the label that is assigned to extracted words
1031
@param country_id: ISO code of a country
@param spacy_model: name of the spaCy model to load (defaults to "en_core_web_sm")
1132
@return: extracted zip code positions
1233
"""
13-
nlp = spacy.load("en_core_web_sm")
34+
nlp = load_spacy(spacy_model)
1435
doc = nlp(text)
1536

16-
regex = re.compile(zip_codes[country_id])
37+
regex = get_regex(country_id)
1738

1839
zipcode_positions = []
1940
for match in regex.finditer(text):
@@ -22,171 +43,171 @@ def zipcode_extraction(text: str, extraction_keyword: str, country_id: str) -> L
2243
zipcode_positions.append((extraction_keyword, span.start, span.end))
2344
return zipcode_positions
2445

25-
# ↑ necessary bricks function
26-
# -----------------------------------------------------------------------------------------
27-
# ↓ example implementation (code further down below)
28-
29-
zip_codes = {
30-
"GB": r"GIR[ ]?0AA|((AB|AL|B|BA|BB|BD|BH|BL|BN|BR|BS|BT|CA|CB|CF|CH|CM|CO|CR|CT|CV|CW|DA|DD|DE|DG|DH|DL|DN|DT|DY|E|EC|EH|EN|EX|FK|FY|G|GL|GY|GU|HA|HD|HG|HP|HR|HS|HU|HX|IG|IM|IP|IV|JE|KA|KT|KW|KY|L|LA|LD|LE|LL|LN|LS|LU|M|ME|MK|ML|N|NE|NG|NN|NP|NR|NW|OL|OX|PA|PE|PH|PL|PO|PR|RG|RH|RM|S|SA|SE|SG|SK|SL|SM|SN|SO|SP|SR|SS|ST|SW|SY|TA|TD|TF|TN|TQ|TR|TS|TW|UB|W|WA|WC|WD|WF|WN|WR|WS|WV|YO|ZE)(\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}))|BFPO[ ]?\\d{1,4}",
31-
"JE": r"JE\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}",
32-
"GG": r"GY\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}",
33-
"IM": r"IM\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}",
34-
"US": r"\\d{5}([ \\-]\\d{4})?",
35-
"CA": r"[ABCEGHJKLMNPRSTVXY]\\d[ABCEGHJ-NPRSTV-Z][ ]?\\d[ABCEGHJ-NPRSTV-Z]\\d",
36-
"DE": r"\\d{5}",
37-
"JP": r"\\d{3}-\\d{4}",
38-
"FR": r"\\d{2}[ ]?\\d{3}",
39-
"AU": r"\\d{4}",
40-
"IT": r"\\d{5}",
41-
"CH": r"\\d{4}",
42-
"AT": r"\\d{4}",
43-
"ES": r"\\d{5}",
44-
"NL": r"\\d{4}[ ]?[A-Z]{2}",
45-
"BE": r"\\d{4}",
46-
"DK": r"\\d{4}",
47-
"SE": r"\\d{3}[ ]?\\d{2}",
48-
"NO": r"\\d{4}",
49-
"BR": r"\\d{5}[\\-]?\\d{3}",
50-
"PT": r"\\d{4}([\\-]\\d{3})?",
51-
"FI": r"\\d{5}",
52-
"AX": r"22\\d{3}",
53-
"KR": r"\\d{3}[\\-]\\d{3}",
54-
"CN": r"\\d{6}",
55-
"TW": r"\\d{3}(\\d{2})",
56-
"SG": r"\\d{6}",
57-
"DZ": r"\\d{5}",
58-
"AD": r"AD\\d{3}",
59-
"AR": r"([A-HJ-NP-Z])?\\d{4}([A-Z]{3})?",
60-
"AM": r"(37)?\\d{4}",
61-
"AZ": r"\\d{4}",
62-
"BH": r"((1[0-2]|[2-9])\\d{2})?",
63-
"BD": r"\\d{4}",
64-
"BB": r"(BB\\d{5})?",
65-
"BY": r"\\d{6}",
46+
# Zip/postal code regular expressions, keyed by country code.
#
# Every value is a raw string, so regex escapes are written with a single
# backslash. Inside character classes the hyphen is written as `\-`; the
# doubled form `\\-` would additionally accept a literal backslash as a
# separator.
#
# NOTE(review): several patterns are optional as a whole (e.g. "BH", "BB",
# "EC", "HN", "LB", "MU", "NI", "NG") and can therefore produce empty matches
# with finditer — confirm that callers discard empty matches.
zip_regex_lookup = {
    "GB": r"GIR[ ]?0AA|((AB|AL|B|BA|BB|BD|BH|BL|BN|BR|BS|BT|CA|CB|CF|CH|CM|CO|CR|CT|CV|CW|DA|DD|DE|DG|DH|DL|DN|DT|DY|E|EC|EH|EN|EX|FK|FY|G|GL|GY|GU|HA|HD|HG|HP|HR|HS|HU|HX|IG|IM|IP|IV|JE|KA|KT|KW|KY|L|LA|LD|LE|LL|LN|LS|LU|M|ME|MK|ML|N|NE|NG|NN|NP|NR|NW|OL|OX|PA|PE|PH|PL|PO|PR|RG|RH|RM|S|SA|SE|SG|SK|SL|SM|SN|SO|SP|SR|SS|ST|SW|SY|TA|TD|TF|TN|TQ|TR|TS|TW|UB|W|WA|WC|WD|WF|WN|WR|WS|WV|YO|ZE)(\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}))|BFPO[ ]?\d{1,4}",
    "JE": r"JE\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
    "GG": r"GY\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
    "IM": r"IM\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
    "US": r"\d{5}([ \-]\d{4})?",
    "CA": r"[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJ-NPRSTV-Z][ ]?\d[ABCEGHJ-NPRSTV-Z]\d",
    "DE": r"\d{5}",
    "JP": r"\d{3}-\d{4}",
    "FR": r"\d{2}[ ]?\d{3}",
    "AU": r"\d{4}",
    "IT": r"\d{5}",
    "CH": r"\d{4}",
    "AT": r"\d{4}",
    "ES": r"\d{5}",
    "NL": r"\d{4}[ ]?[A-Z]{2}",
    "BE": r"\d{4}",
    "DK": r"\d{4}",
    "SE": r"\d{3}[ ]?\d{2}",
    "NO": r"\d{4}",
    "BR": r"\d{5}[\-]?\d{3}",
    "PT": r"\d{4}([\-]\d{3})?",
    "FI": r"\d{5}",
    "AX": r"22\d{3}",
    "KR": r"\d{3}[\-]\d{3}",
    "CN": r"\d{6}",
    "TW": r"\d{3}(\d{2})",
    "SG": r"\d{6}",
    "DZ": r"\d{5}",
    "AD": r"AD\d{3}",
    "AR": r"([A-HJ-NP-Z])?\d{4}([A-Z]{3})?",
    "AM": r"(37)?\d{4}",
    "AZ": r"\d{4}",
    "BH": r"((1[0-2]|[2-9])\d{2})?",
    "BD": r"\d{4}",
    "BB": r"(BB\d{5})?",
    "BY": r"\d{6}",
    "BM": r"[A-Z]{2}[ ]?[A-Z0-9]{2}",
    "BA": r"\d{5}",
    "IO": r"BBND 1ZZ",
    "BN": r"[A-Z]{2}[ ]?\d{4}",
    "BG": r"\d{4}",
    "KH": r"\d{5}",
    "CV": r"\d{4}",
    "CL": r"\d{7}",
    "CR": r"\d{4,5}|\d{3}-\d{4}",
    "HR": r"\d{5}",
    "CY": r"\d{4}",
    "CZ": r"\d{3}[ ]?\d{2}",
    "DO": r"\d{5}",
    "EC": r"([A-Z]\d{4}[A-Z]|(?:[A-Z]{2})?\d{6})?",
    "EG": r"\d{5}",
    "EE": r"\d{5}",
    "FO": r"\d{3}",
    "GE": r"\d{4}",
    "GR": r"\d{3}[ ]?\d{2}",
    "GL": r"39\d{2}",
    "GT": r"\d{5}",
    "HT": r"\d{4}",
    "HN": r"(?:\d{5})?",
    "HU": r"\d{4}",
    "IS": r"\d{3}",
    "IN": r"\d{6}",
    "ID": r"\d{5}",
    "IL": r"\d{5}",
    "JO": r"\d{5}",
    "KZ": r"\d{6}",
    "KE": r"\d{5}",
    "KW": r"\d{5}",
    "LA": r"\d{5}",
    "LV": r"\d{4}",
    "LB": r"(\d{4}([ ]?\d{4})?)?",
    "LI": r"(948[5-9])|(949[0-7])",
    "LT": r"\d{5}",
    "LU": r"\d{4}",
    "MK": r"\d{4}",
    "MY": r"\d{5}",
    "MV": r"\d{5}",
    "MT": r"[A-Z]{3}[ ]?\d{2,4}",
    "MU": r"(\d{3}[A-Z]{2}\d{3})?",
    "MX": r"\d{5}",
    "MD": r"\d{4}",
    "MC": r"980\d{2}",
    "MA": r"\d{5}",
    "NP": r"\d{5}",
    "NZ": r"\d{4}",
    "NI": r"((\d{4}-)?\d{3}-\d{3}(-\d{1})?)?",
    "NG": r"(\d{6})?",
    "OM": r"(PC )?\d{3}",
    "PK": r"\d{5}",
    "PY": r"\d{4}",
    "PH": r"\d{4}",
    "PL": r"\d{2}-\d{3}",
    "PR": r"00[679]\d{2}([ \-]\d{4})?",
    "RO": r"\d{6}",
    "RU": r"\d{6}",
    "SM": r"4789\d",
    "SA": r"\d{5}",
    "SN": r"\d{5}",
    "SK": r"\d{3}[ ]?\d{2}",
    "SI": r"\d{4}",
    "ZA": r"\d{4}",
    "LK": r"\d{5}",
    "TJ": r"\d{6}",
    "TH": r"\d{5}",
    "TN": r"\d{4}",
    "TR": r"\d{5}",
    "TM": r"\d{6}",
    "UA": r"\d{5}",
    "UY": r"\d{5}",
    "UZ": r"\d{6}",
    "VA": r"00120",
    "VE": r"\d{4}",
    "ZM": r"\d{5}",
    "AS": r"96799",
    "CC": r"6799",
    "CK": r"\d{4}",
    "RS": r"\d{6}",
    "ME": r"8\d{4}",
    "CS": r"\d{5}",
    "YU": r"\d{5}",
    "CX": r"6798",
    "ET": r"\d{4}",
    "FK": r"FIQQ 1ZZ",
    "NF": r"2899",
    "FM": r"(9694[1-4])([ \-]\d{4})?",
    "GF": r"9[78]3\d{2}",
    "GN": r"\d{3}",
    "GP": r"9[78][01]\d{2}",
    "GS": r"SIQQ 1ZZ",
    "GU": r"969[123]\d([ \-]\d{4})?",
    "GW": r"\d{4}",
    "HM": r"\d{4}",
    "IQ": r"\d{5}",
    "KG": r"\d{6}",
    "LR": r"\d{4}",
    "LS": r"\d{3}",
    "MG": r"\d{3}",
    "MH": r"969[67]\d([ \-]\d{4})?",
    "MN": r"\d{6}",
    "MP": r"9695[012]([ \-]\d{4})?",
    "MQ": r"9[78]2\d{2}",
    "NC": r"988\d{2}",
    "NE": r"\d{4}",
    "VI": r"008(([0-4]\d)|(5[01]))([ \-]\d{4})?",
    "PF": r"987\d{2}",
    "PG": r"\d{3}",
    "PM": r"9[78]5\d{2}",
    "PN": r"PCRN 1ZZ",
    # The stray leading ">" was dropped; with it a bare "96940" never matched.
    "PW": r"96940",
    "RE": r"9[78]4\d{2}",
    "SH": r"(ASCN|STHL) 1ZZ",
    "SJ": r"\d{4}",
    "SO": r"\d{5}",
    "SZ": r"[HLMS]\d{3}",
    "TC": r"TKCA 1ZZ",
    "WF": r"986\d{2}",
    "XK": r"\d{5}",
    "YT": r"976\d{2}",
}
189206

207+
# ↑ necessary bricks function
208+
# -----------------------------------------------------------------------------------------
209+
# ↓ example implementation (code further down below)
210+
190211
def example_integration():
191212
texts = ["10 Downing Street London SW1A 2AA"]
192213
extraction_keyword = "zip code"

0 commit comments

Comments
 (0)