Skip to content

Commit b871176

Browse files
committed
check for match & remove faulty backslash
1 parent e492728 commit b871176

File tree

2 files changed

+335
-314
lines changed

2 files changed

+335
-314
lines changed

extractors/personal_identifiers/zipcode_extraction/code_snippet_common.md

Lines changed: 184 additions & 164 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,198 @@ import re
33
import spacy
44
from typing import List, Tuple
55

6-
def zipcode_extraction(text: str, extraction_keyword: str, country_id: str) -> List[Tuple[str, int]]:
6+
7+
# Postal-code regular expressions keyed by upper-case ISO country code.
#
# The patterns are unanchored (no ``^``/``$``) and written for upper-case
# input; ``get_regex`` compiles them on demand.
#
# Notes on the entries:
# - Separator classes are written ``[ -]`` / ``-``.  Inside a raw string,
#   ``[ \\-]`` would additionally accept a literal backslash as separator.
# - No pattern is optional as a whole, so none of them can produce a
#   zero-width match when used with ``finditer``.  Capture groups are kept,
#   so group numbering is the same as before.
zip_regex_lookup = {
    "GB": r"GIR[ ]?0AA|((AB|AL|B|BA|BB|BD|BH|BL|BN|BR|BS|BT|CA|CB|CF|CH|CM|CO|CR|CT|CV|CW|DA|DD|DE|DG|DH|DL|DN|DT|DY|E|EC|EH|EN|EX|FK|FY|G|GL|GY|GU|HA|HD|HG|HP|HR|HS|HU|HX|IG|IM|IP|IV|JE|KA|KT|KW|KY|L|LA|LD|LE|LL|LN|LS|LU|M|ME|MK|ML|N|NE|NG|NN|NP|NR|NW|OL|OX|PA|PE|PH|PL|PO|PR|RG|RH|RM|S|SA|SE|SG|SK|SL|SM|SN|SO|SP|SR|SS|ST|SW|SY|TA|TD|TF|TN|TQ|TR|TS|TW|UB|W|WA|WC|WD|WF|WN|WR|WS|WV|YO|ZE)(\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}))|BFPO[ ]?\d{1,4}",
    "JE": r"JE\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
    "GG": r"GY\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
    "IM": r"IM\d[\dA-Z]?[ ]?\d[ABD-HJLN-UW-Z]{2}",
    "US": r"\d{5}([ -]\d{4})?",
    "CA": r"[ABCEGHJKLMNPRSTVXY]\d[ABCEGHJ-NPRSTV-Z][ ]?\d[ABCEGHJ-NPRSTV-Z]\d",
    "DE": r"\d{5}",
    "JP": r"\d{3}-\d{4}",
    "FR": r"\d{2}[ ]?\d{3}",
    "AU": r"\d{4}",
    "IT": r"\d{5}",
    "CH": r"\d{4}",
    "AT": r"\d{4}",
    "ES": r"\d{5}",
    "NL": r"\d{4}[ ]?[A-Z]{2}",
    "BE": r"\d{4}",
    "DK": r"\d{4}",
    "SE": r"\d{3}[ ]?\d{2}",
    "NO": r"\d{4}",
    "BR": r"\d{5}-?\d{3}",
    "PT": r"\d{4}(-\d{3})?",
    "FI": r"\d{5}",
    "AX": r"22\d{3}",
    "KR": r"\d{3}-\d{3}",
    "CN": r"\d{6}",
    "TW": r"\d{3}(\d{2})",
    "SG": r"\d{6}",
    "DZ": r"\d{5}",
    "AD": r"AD\d{3}",
    "AR": r"([A-HJ-NP-Z])?\d{4}([A-Z]{3})?",
    "AM": r"(37)?\d{4}",
    "AZ": r"\d{4}",
    "BH": r"((1[0-2]|[2-9])\d{2})",
    "BD": r"\d{4}",
    "BB": r"(BB\d{5})",
    "BY": r"\d{6}",
    "BM": r"[A-Z]{2}[ ]?[A-Z0-9]{2}",
    "BA": r"\d{5}",
    "IO": r"BBND 1ZZ",
    "BN": r"[A-Z]{2}[ ]?\d{4}",
    "BG": r"\d{4}",
    "KH": r"\d{5}",
    "CV": r"\d{4}",
    "CL": r"\d{7}",
    "CR": r"\d{4,5}|\d{3}-\d{4}",
    "HR": r"\d{5}",
    "CY": r"\d{4}",
    "CZ": r"\d{3}[ ]?\d{2}",
    "DO": r"\d{5}",
    "EC": r"([A-Z]\d{4}[A-Z]|(?:[A-Z]{2})?\d{6})",
    "EG": r"\d{5}",
    "EE": r"\d{5}",
    "FO": r"\d{3}",
    "GE": r"\d{4}",
    "GR": r"\d{3}[ ]?\d{2}",
    "GL": r"39\d{2}",
    "GT": r"\d{5}",
    "HT": r"\d{4}",
    "HN": r"(?:\d{5})",
    "HU": r"\d{4}",
    "IS": r"\d{3}",
    "IN": r"\d{6}",
    "ID": r"\d{5}",
    "IL": r"\d{5}",
    "JO": r"\d{5}",
    "KZ": r"\d{6}",
    "KE": r"\d{5}",
    "KW": r"\d{5}",
    "LA": r"\d{5}",
    "LV": r"\d{4}",
    "LB": r"(\d{4}([ ]?\d{4})?)",
    "LI": r"(948[5-9])|(949[0-7])",
    "LT": r"\d{5}",
    "LU": r"\d{4}",
    "MK": r"\d{4}",
    "MY": r"\d{5}",
    "MV": r"\d{5}",
    "MT": r"[A-Z]{3}[ ]?\d{2,4}",
    "MU": r"(\d{3}[A-Z]{2}\d{3})",
    "MX": r"\d{5}",
    "MD": r"\d{4}",
    "MC": r"980\d{2}",
    "MA": r"\d{5}",
    "NP": r"\d{5}",
    "NZ": r"\d{4}",
    "NI": r"((\d{4}-)?\d{3}-\d{3}(-\d{1})?)",
    "NG": r"(\d{6})",
    "OM": r"(PC )?\d{3}",
    "PK": r"\d{5}",
    "PY": r"\d{4}",
    "PH": r"\d{4}",
    "PL": r"\d{2}-\d{3}",
    "PR": r"00[679]\d{2}([ -]\d{4})?",
    "RO": r"\d{6}",
    "RU": r"\d{6}",
    "SM": r"4789\d",
    "SA": r"\d{5}",
    "SN": r"\d{5}",
    "SK": r"\d{3}[ ]?\d{2}",
    "SI": r"\d{4}",
    "ZA": r"\d{4}",
    "LK": r"\d{5}",
    "TJ": r"\d{6}",
    "TH": r"\d{5}",
    "TN": r"\d{4}",
    "TR": r"\d{5}",
    "TM": r"\d{6}",
    "UA": r"\d{5}",
    "UY": r"\d{5}",
    "UZ": r"\d{6}",
    "VA": r"00120",
    "VE": r"\d{4}",
    "ZM": r"\d{5}",
    "AS": r"96799",
    "CC": r"6799",
    "CK": r"\d{4}",
    "RS": r"\d{6}",
    "ME": r"8\d{4}",
    "CS": r"\d{5}",
    "YU": r"\d{5}",
    "CX": r"6798",
    "ET": r"\d{4}",
    "FK": r"FIQQ 1ZZ",
    "NF": r"2899",
    "FM": r"(9694[1-4])([ -]\d{4})?",
    "GF": r"9[78]3\d{2}",
    "GN": r"\d{3}",
    "GP": r"9[78][01]\d{2}",
    "GS": r"SIQQ 1ZZ",
    "GU": r"969[123]\d([ -]\d{4})?",
    "GW": r"\d{4}",
    "HM": r"\d{4}",
    "IQ": r"\d{5}",
    "KG": r"\d{6}",
    "LR": r"\d{4}",
    "LS": r"\d{3}",
    "MG": r"\d{3}",
    "MH": r"969[67]\d([ -]\d{4})?",
    "MN": r"\d{6}",
    "MP": r"9695[012]([ -]\d{4})?",
    "MQ": r"9[78]2\d{2}",
    "NC": r"988\d{2}",
    "NE": r"\d{4}",
    "VI": r"008(([0-4]\d)|(5[01]))([ -]\d{4})?",
    "PF": r"987\d{2}",
    "PG": r"\d{3}",
    "PM": r"9[78]5\d{2}",
    "PN": r"PCRN 1ZZ",
    # Was r">96940": the stray ">" meant a bare "96940" could never match.
    "PW": r"96940",
    "RE": r"9[78]4\d{2}",
    "SH": r"(ASCN|STHL) 1ZZ",
    "SJ": r"\d{4}",
    "SO": r"\d{5}",
    "SZ": r"[HLMS]\d{3}",
    "TC": r"TKCA 1ZZ",
    "WF": r"986\d{2}",
    "XK": r"\d{5}",
    "YT": r"976\d{2}",
}
167+
168+
# Cache of spaCy pipelines that have already been loaded, keyed by model name.
loaded_models = {}


def load_spacy(spacy_model):
    """
    Return the spaCy pipeline for a model name, loading it only on first use.

    @param spacy_model: model name handed to ``spacy.load``
    @return: the pipeline stored in ``loaded_models`` for that name
    """
    # Guard clause: a previously loaded pipeline is returned as-is.
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline
173+
174+
# Cache of compiled zip-code patterns, keyed by country code.
compiled_regex = {}


def get_regex(country_id: str) -> "re.Pattern":
    """
    Return the compiled zip-code pattern for a country, compiling it only once.

    @param country_id: ISO code of a country; looked up in ``zip_regex_lookup``
    @return: the compiled pattern, also stored in ``compiled_regex``
    @raise ValueError: if ``zip_regex_lookup`` has no (non-empty) pattern for
        ``country_id``
    """
    # No ``global`` statement needed: the cache dict is mutated in place and
    # never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is a subclass of Exception, so callers catching the
            # broader type keep working; the message names the rejected code.
            raise ValueError(
                "unknown country ISO code: {!r}".format(country_id)
            )
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
185+
186+
187+
def zipcode_extraction(text: str, extraction_keyword: str, country_id: str, spacy_model: str = "en_core_web_sm") -> List[Tuple[str, int, int]]:
7188
"""
8189
@param text: the input text
9190
@param extraction_keyword: the label that is assigned to extracted words
10191
@param country_id: ISO code of a country
11192
@return: extracted zip code positions
12193
"""
13-
nlp = spacy.load("en_core_web_sm")
194+
nlp = load_spacy(spacy_model)
14195
doc = nlp(text)
15196

16-
regex = re.compile(zip_codes[country_id])
197+
regex = get_regex(country_id)
17198

18199
zipcode_positions = []
19200
for match in regex.finditer(text):
@@ -26,167 +207,6 @@ def zipcode_extraction(text: str, extraction_keyword: str, country_id: str) -> L
26207
# -----------------------------------------------------------------------------------------
27208
# ↓ example implementation (code further down below)
28209

29-
zip_codes = {
30-
"GB": r"GIR[ ]?0AA|((AB|AL|B|BA|BB|BD|BH|BL|BN|BR|BS|BT|CA|CB|CF|CH|CM|CO|CR|CT|CV|CW|DA|DD|DE|DG|DH|DL|DN|DT|DY|E|EC|EH|EN|EX|FK|FY|G|GL|GY|GU|HA|HD|HG|HP|HR|HS|HU|HX|IG|IM|IP|IV|JE|KA|KT|KW|KY|L|LA|LD|LE|LL|LN|LS|LU|M|ME|MK|ML|N|NE|NG|NN|NP|NR|NW|OL|OX|PA|PE|PH|PL|PO|PR|RG|RH|RM|S|SA|SE|SG|SK|SL|SM|SN|SO|SP|SR|SS|ST|SW|SY|TA|TD|TF|TN|TQ|TR|TS|TW|UB|W|WA|WC|WD|WF|WN|WR|WS|WV|YO|ZE)(\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}))|BFPO[ ]?\\d{1,4}",
31-
"JE": r"JE\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}",
32-
"GG": r"GY\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}",
33-
"IM": r"IM\\d[\\dA-Z]?[ ]?\\d[ABD-HJLN-UW-Z]{2}",
34-
"US": r"\\d{5}([ \\-]\\d{4})?",
35-
"CA": r"[ABCEGHJKLMNPRSTVXY]\\d[ABCEGHJ-NPRSTV-Z][ ]?\\d[ABCEGHJ-NPRSTV-Z]\\d",
36-
"DE": r"\\d{5}",
37-
"JP": r"\\d{3}-\\d{4}",
38-
"FR": r"\\d{2}[ ]?\\d{3}",
39-
"AU": r"\\d{4}",
40-
"IT": r"\\d{5}",
41-
"CH": r"\\d{4}",
42-
"AT": r"\\d{4}",
43-
"ES": r"\\d{5}",
44-
"NL": r"\\d{4}[ ]?[A-Z]{2}",
45-
"BE": r"\\d{4}",
46-
"DK": r"\\d{4}",
47-
"SE": r"\\d{3}[ ]?\\d{2}",
48-
"NO": r"\\d{4}",
49-
"BR": r"\\d{5}[\\-]?\\d{3}",
50-
"PT": r"\\d{4}([\\-]\\d{3})?",
51-
"FI": r"\\d{5}",
52-
"AX": r"22\\d{3}",
53-
"KR": r"\\d{3}[\\-]\\d{3}",
54-
"CN": r"\\d{6}",
55-
"TW": r"\\d{3}(\\d{2})",
56-
"SG": r"\\d{6}",
57-
"DZ": r"\\d{5}",
58-
"AD": r"AD\\d{3}",
59-
"AR": r"([A-HJ-NP-Z])?\\d{4}([A-Z]{3})?",
60-
"AM": r"(37)?\\d{4}",
61-
"AZ": r"\\d{4}",
62-
"BH": r"((1[0-2]|[2-9])\\d{2})?",
63-
"BD": r"\\d{4}",
64-
"BB": r"(BB\\d{5})?",
65-
"BY": r"\\d{6}",
66-
"BM": r"[A-Z]{2}[ ]?[A-Z0-9]{2}",
67-
"BA": r"\\d{5}",
68-
"IO": r"BBND 1ZZ",
69-
"BN": r"[A-Z]{2}[ ]?\\d{4}",
70-
"BG": r"\\d{4}",
71-
"KH": r"\\d{5}",
72-
"CV": r"\\d{4}",
73-
"CL": r"\\d{7}",
74-
"CR": r"\\d{4,5}|\\d{3}-\\d{4}",
75-
"HR": r"\\d{5}",
76-
"CY": r"\\d{4}",
77-
"CZ": r"\\d{3}[ ]?\\d{2}",
78-
"DO": r"\\d{5}",
79-
"EC": r"([A-Z]\\d{4}[A-Z]|(?:[A-Z]{2})?\\d{6})?",
80-
"EG": r"\\d{5}",
81-
"EE": r"\\d{5}",
82-
"FO": r"\\d{3}",
83-
"GE": r"\\d{4}",
84-
"GR": r"\\d{3}[ ]?\\d{2}",
85-
"GL": r"39\\d{2}",
86-
"GT": r"\\d{5}",
87-
"HT": r"\\d{4}",
88-
"HN": r"(?:\\d{5})?",
89-
"HU": r"\\d{4}",
90-
"IS": r"\\d{3}",
91-
"IN": r"\\d{6}",
92-
"ID": r"\\d{5}",
93-
"IL": r"\\d{5}",
94-
"JO": r"\\d{5}",
95-
"KZ": r"\\d{6}",
96-
"KE": r"\\d{5}",
97-
"KW": r"\\d{5}",
98-
"LA": r"\\d{5}",
99-
"LV": r"\\d{4}",
100-
"LB": r"(\\d{4}([ ]?\\d{4})?)?",
101-
"LI": r"(948[5-9])|(949[0-7])",
102-
"LT": r"\\d{5}",
103-
"LU": r"\\d{4}",
104-
"MK": r"\\d{4}",
105-
"MY": r"\\d{5}",
106-
"MV": r"\\d{5}",
107-
"MT": r"[A-Z]{3}[ ]?\\d{2,4}",
108-
"MU": r"(\\d{3}[A-Z]{2}\\d{3})?",
109-
"MX": r"\\d{5}",
110-
"MD": r"\\d{4}",
111-
"MC": r"980\\d{2}",
112-
"MA": r"\\d{5}",
113-
"NP": r"\\d{5}",
114-
"NZ": r"\\d{4}",
115-
"NI": r"((\\d{4}-)?\\d{3}-\\d{3}(-\\d{1})?)?",
116-
"NG": r"(\\d{6})?",
117-
"OM": r"(PC )?\\d{3}",
118-
"PK": r"\\d{5}",
119-
"PY": r"\\d{4}",
120-
"PH": r"\\d{4}",
121-
"PL": r"\\d{2}-\\d{3}",
122-
"PR": r"00[679]\\d{2}([ \\-]\\d{4})?",
123-
"RO": r"\\d{6}",
124-
"RU": r"\\d{6}",
125-
"SM": r"4789\\d",
126-
"SA": r"\\d{5}",
127-
"SN": r"\\d{5}",
128-
"SK": r"\\d{3}[ ]?\\d{2}",
129-
"SI": r"\\d{4}",
130-
"ZA": r"\\d{4}",
131-
"LK": r"\\d{5}",
132-
"TJ": r"\\d{6}",
133-
"TH": r"\\d{5}",
134-
"TN": r"\\d{4}",
135-
"TR": r"\\d{5}",
136-
"TM": r"\\d{6}",
137-
"UA": r"\\d{5}",
138-
"UY": r"\\d{5}",
139-
"UZ": r"\\d{6}",
140-
"VA": r"00120",
141-
"VE": r"\\d{4}",
142-
"ZM": r"\\d{5}",
143-
"AS": r"96799",
144-
"CC": r"6799",
145-
"CK": r"\\d{4}",
146-
"RS": r"\\d{6}",
147-
"ME": r"8\\d{4}",
148-
"CS": r"\\d{5}",
149-
"YU": r"\\d{5}",
150-
"CX": r"6798",
151-
"ET": r"\\d{4}",
152-
"FK": r"FIQQ 1ZZ",
153-
"NF": r"2899",
154-
"FM": r"(9694[1-4])([ \\-]\\d{4})?",
155-
"GF": r"9[78]3\\d{2}",
156-
"GN": r"\\d{3}",
157-
"GP": r"9[78][01]\\d{2}",
158-
"GS": r"SIQQ 1ZZ",
159-
"GU": r"969[123]\\d([ \\-]\\d{4})?",
160-
"GW": r"\\d{4}",
161-
"HM": r"\\d{4}",
162-
"IQ": r"\\d{5}",
163-
"KG": r"\\d{6}",
164-
"LR": r"\\d{4}",
165-
"LS": r"\\d{3}",
166-
"MG": r"\\d{3}",
167-
"MH": r"969[67]\\d([ \\-]\\d{4})?",
168-
"MN": r"\\d{6}",
169-
"MP": r"9695[012]([ \\-]\\d{4})?",
170-
"MQ": r"9[78]2\\d{2}",
171-
"NC": r"988\\d{2}",
172-
"NE": r"\\d{4}",
173-
"VI": r"008(([0-4]\\d)|(5[01]))([ \\-]\\d{4})?",
174-
"PF": r"987\\d{2}",
175-
"PG": r"\\d{3}",
176-
"PM": r"9[78]5\\d{2}",
177-
"PN": r"PCRN 1ZZ",
178-
"PW": r">96940",
179-
"RE": r"9[78]4\\d{2}",
180-
"SH": r"(ASCN|STHL) 1ZZ",
181-
"SJ": r"\\d{4}",
182-
"SO": r"\\d{5}",
183-
"SZ": r"[HLMS]\\d{3}",
184-
"TC": r"TKCA 1ZZ",
185-
"WF": r"986\\d{2}",
186-
"XK": r"\\d{5}",
187-
"YT": r"976\\d{2}"
188-
}
189-
190210
def example_integration():
191211
texts = ["10 Downing Street London SW1A 2AA"]
192212
extraction_keyword = "zip code"

0 commit comments

Comments
 (0)