@@ -3,17 +3,38 @@ import re
3
3
import spacy
4
4
from typing import List, Tuple
5
5
6
- def zipcode_extraction (text : str , extraction_keyword : str , country_id : str ) -> List[Tuple[str , int ]]:
6
+
7
+
8
+ loaded_models = {}
9
def load_spacy(spacy_model):
    """
    Return the spaCy pipeline for the given model name, loading it on first use.

    @param spacy_model: name of the spaCy model, passed unchanged to spacy.load
    @return: the pipeline stored in loaded_models under that name
    """
    # Guard clause: a model that was loaded before is served from the module-level cache.
    if spacy_model in loaded_models:
        return loaded_models[spacy_model]

    # First request for this name: load it once and remember it for later calls.
    pipeline = spacy.load(spacy_model)
    loaded_models[spacy_model] = pipeline
    return pipeline
13
+
14
+ compiled_regex = {}
15
+
16
def get_regex(country_id: str):
    """
    Return the compiled zip code pattern for a country, compiling it at most once.

    @param country_id: ISO code of a country, used as the key into zip_regex_lookup
    @return: the compiled regular expression stored in compiled_regex for that code
    @raise ValueError: if zip_regex_lookup holds no pattern for country_id
    """
    # No `global` statement needed: the cache dict is only mutated, never rebound.
    if country_id not in compiled_regex:
        pattern = zip_regex_lookup.get(country_id)
        if not pattern:
            # ValueError is a subclass of Exception, so callers that catch
            # Exception keep working; the message now names the bad code.
            raise ValueError("unknown country ISO code: %r" % (country_id,))
        compiled_regex[country_id] = re.compile(pattern)
    return compiled_regex[country_id]
25
+
26
+
27
+ def zipcode_extraction (text : str , extraction_keyword : str , country_id : str , spacy_model : str = " en_core_web_sm" ) -> List[Tuple[str , int , int ]]:
7
28
"""
8
29
@param text: the input text
9
30
@param extraction_keyword: the label that is assigned to extracted words
10
31
@param country_id: ISO code of a country
11
32
@return: extracted zip code positions
12
33
"""
13
- nlp = spacy.load( " en_core_web_sm " )
34
+ nlp = load_spacy(spacy_model )
14
35
doc = nlp(text)
15
36
16
- regex = re.compile(zip_codes[ country_id] )
37
+ regex = get_regex( country_id)
17
38
18
39
zipcode_positions = []
19
40
for match in regex.finditer(text):
@@ -22,171 +43,171 @@ def zipcode_extraction(text: str, extraction_keyword: str, country_id: str) -> L
22
43
zipcode_positions.append((extraction_keyword, span.start, span.end))
23
44
return zipcode_positions
24
45
25
- # ↑ necessary bricks function
26
- # -----------------------------------------------------------------------------------------
27
- # ↓ example implementation (code further down below)
28
-
29
- zip_codes = {
30
- " GB" : r " GIR[ ]? 0AA| (( AB| AL| B| BA| BB| BD| BH| BL| BN| BR| BS| BT| CA| CB| CF| CH| CM| CO| CR| CT| CV| CW| DA| DD| DE| DG| DH| DL| DN| DT| DY| E| EC| EH| EN| EX| FK| FY| G| GL| GY| GU| HA| HD| HG| HP| HR| HS| HU| HX| IG| IM| IP| IV| JE| KA| KT| KW| KY| L| LA| LD| LE| LL| LN| LS| LU| M| ME| MK| ML| N| NE| NG| NN| NP| NR| NW| OL| OX| PA| PE| PH| PL| PO| PR| RG| RH| RM| S| SA| SE| SG| SK| SL| SM| SN| SO| SP| SR| SS| ST| SW| SY| TA| TD| TF| TN| TQ| TR| TS| TW| UB| W| WA| WC| WD| WF| WN| WR| WS| WV| YO| ZE) ( \\ d[\\ dA-Z ]? [ ]? \\ d[ABD-HJLN-UW-Z ]{2} )) | BFPO[ ]? \\ d{1,4} " ,
31
- " JE" : r " JE\\ d[\\ dA-Z ]? [ ]? \\ d[ABD-HJLN-UW-Z ]{2} " ,
32
- " GG" : r " GY\\ d[\\ dA-Z ]? [ ]? \\ d[ABD-HJLN-UW-Z ]{2} " ,
33
- " IM" : r " IM\\ d[\\ dA-Z ]? [ ]? \\ d[ABD-HJLN-UW-Z ]{2} " ,
34
- " US" : r " \\ d{5} ( [ \\ - ]\\ d{4} ) ? " ,
35
- " CA" : r " [ABCEGHJKLMNPRSTVXY ]\\ d[ABCEGHJ-NPRSTV-Z ][ ]? \\ d[ABCEGHJ-NPRSTV-Z ]\\ d" ,
36
- " DE" : r " \\ d{5} " ,
37
- " JP" : r " \\ d{3} -\\ d{4} " ,
38
- " FR" : r " \\ d{2} [ ]? \\ d{3} " ,
39
- " AU" : r " \\ d{4} " ,
40
- " IT" : r " \\ d{5} " ,
41
- " CH" : r " \\ d{4} " ,
42
- " AT" : r " \\ d{4} " ,
43
- " ES" : r " \\ d{5} " ,
44
- " NL" : r " \\ d{4} [ ]? [A-Z ]{2} " ,
45
- " BE" : r " \\ d{4} " ,
46
- " DK" : r " \\ d{4} " ,
47
- " SE" : r " \\ d{3} [ ]? \\ d{2} " ,
48
- " NO" : r " \\ d{4} " ,
49
- " BR" : r " \\ d{5} [\\ - ]? \\ d{3} " ,
50
- " PT" : r " \\ d{4} ( [\\ - ]\\ d{3} ) ? " ,
51
- " FI" : r " \\ d{5} " ,
52
- " AX" : r " 22\\ d{3} " ,
53
- " KR" : r " \\ d{3} [\\ - ]\\ d{3} " ,
54
- " CN" : r " \\ d{6} " ,
55
- " TW" : r " \\ d{3} ( \\ d{2} ) " ,
56
- " SG" : r " \\ d{6} " ,
57
- " DZ" : r " \\ d{5} " ,
58
- " AD" : r " AD\\ d{3} " ,
59
- " AR" : r " ( [A-HJ-NP-Z ]) ? \\ d{4} ( [A-Z ]{3} ) ? " ,
60
- " AM" : r " ( 37) ? \\ d{4} " ,
61
- " AZ" : r " \\ d{4} " ,
62
- " BH" : r " (( 1[0-2 ]| [2-9 ]) \\ d{2} ) ? " ,
63
- " BD" : r " \\ d{4} " ,
64
- " BB" : r " ( BB\\ d{5} ) ? " ,
65
- " BY" : r " \\ d{6} " ,
46
+ zip_regex_lookup = {
47
+ " GB" : r " GIR[ ]? 0AA| (( AB| AL| B| BA| BB| BD| BH| BL| BN| BR| BS| BT| CA| CB| CF| CH| CM| CO| CR| CT| CV| CW| DA| DD| DE| DG| DH| DL| DN| DT| DY| E| EC| EH| EN| EX| FK| FY| G| GL| GY| GU| HA| HD| HG| HP| HR| HS| HU| HX| IG| IM| IP| IV| JE| KA| KT| KW| KY| L| LA| LD| LE| LL| LN| LS| LU| M| ME| MK| ML| N| NE| NG| NN| NP| NR| NW| OL| OX| PA| PE| PH| PL| PO| PR| RG| RH| RM| S| SA| SE| SG| SK| SL| SM| SN| SO| SP| SR| SS| ST| SW| SY| TA| TD| TF| TN| TQ| TR| TS| TW| UB| W| WA| WC| WD| WF| WN| WR| WS| WV| YO| ZE) ( \d [\d A-Z ]? [ ]? \d [ABD-HJLN-UW-Z ]{2} )) | BFPO[ ]? \d {1,4} " ,
48
+ " JE" : r " JE\d [\d A-Z ]? [ ]? \d [ABD-HJLN-UW-Z ]{2} " ,
49
+ " GG" : r " GY\d [\d A-Z ]? [ ]? \d [ABD-HJLN-UW-Z ]{2} " ,
50
+ " IM" : r " IM\d [\d A-Z ]? [ ]? \d [ABD-HJLN-UW-Z ]{2} " ,
51
+ " US" : r " \d {5} ( [ \\ - ]\d {4} ) ? " ,
52
+ " CA" : r " [ABCEGHJKLMNPRSTVXY ]\d [ABCEGHJ-NPRSTV-Z ][ ]? \d [ABCEGHJ-NPRSTV-Z ]\d " ,
53
+ " DE" : r " \d {5} " ,
54
+ " JP" : r " \d {3} -\d {4} " ,
55
+ " FR" : r " \d {2} [ ]? \d {3} " ,
56
+ " AU" : r " \d {4} " ,
57
+ " IT" : r " \d {5} " ,
58
+ " CH" : r " \d {4} " ,
59
+ " AT" : r " \d {4} " ,
60
+ " ES" : r " \d {5} " ,
61
+ " NL" : r " \d {4} [ ]? [A-Z ]{2} " ,
62
+ " BE" : r " \d {4} " ,
63
+ " DK" : r " \d {4} " ,
64
+ " SE" : r " \d {3} [ ]? \d {2} " ,
65
+ " NO" : r " \d {4} " ,
66
+ " BR" : r " \d {5} [\\ - ]? \d {3} " ,
67
+ " PT" : r " \d {4} ( [\\ - ]\d {3} ) ? " ,
68
+ " FI" : r " \d {5} " ,
69
+ " AX" : r " 22\d {3} " ,
70
+ " KR" : r " \d {3} [\\ - ]\d {3} " ,
71
+ " CN" : r " \d {6} " ,
72
+ " TW" : r " \d {3} ( \d {2} ) " ,
73
+ " SG" : r " \d {6} " ,
74
+ " DZ" : r " \d {5} " ,
75
+ " AD" : r " AD\d {3} " ,
76
+ " AR" : r " ( [A-HJ-NP-Z ]) ? \d {4} ( [A-Z ]{3} ) ? " ,
77
+ " AM" : r " ( 37) ? \d {4} " ,
78
+ " AZ" : r " \d {4} " ,
79
+ " BH" : r " (( 1[0-2 ]| [2-9 ]) \d {2} ) ? " ,
80
+ " BD" : r " \d {4} " ,
81
+ " BB" : r " ( BB\d {5} ) ? " ,
82
+ " BY" : r " \d {6} " ,
66
83
" BM" : r " [A-Z ]{2} [ ]? [A-Z0-9 ]{2} " ,
67
- " BA" : r " \\ d{5} " ,
84
+ " BA" : r " \d {5} " ,
68
85
" IO" : r " BBND 1ZZ" ,
69
- " BN" : r " [A-Z ]{2} [ ]? \\ d{4} " ,
70
- " BG" : r " \\ d{4} " ,
71
- " KH" : r " \\ d{5} " ,
72
- " CV" : r " \\ d{4} " ,
73
- " CL" : r " \\ d{7} " ,
74
- " CR" : r " \\ d{4,5} | \\ d{3} -\ \ d{4} " ,
75
- " HR" : r " \\ d{5} " ,
76
- " CY" : r " \\ d{4} " ,
77
- " CZ" : r " \\ d{3} [ ]? \ \ d{2} " ,
78
- " DO" : r " \\ d{5} " ,
79
- " EC" : r " ( [A-Z ]\\ d{4} [A-Z ]| (?: [A-Z ]{2} ) ? \ \ d{6} ) ? " ,
80
- " EG" : r " \\ d{5} " ,
81
- " EE" : r " \\ d{5} " ,
82
- " FO" : r " \\ d{3} " ,
83
- " GE" : r " \\ d{4} " ,
84
- " GR" : r " \\ d{3} [ ]? \ \ d{2} " ,
85
- " GL" : r " 39\\ d{2} " ,
86
- " GT" : r " \\ d{5} " ,
87
- " HT" : r " \\ d{4} " ,
88
- " HN" : r " (?: \\ d{5} ) ? " ,
89
- " HU" : r " \\ d{4} " ,
90
- " IS" : r " \\ d{3} " ,
91
- " IN" : r " \\ d{6} " ,
92
- " ID" : r " \\ d{5} " ,
93
- " IL" : r " \\ d{5} " ,
94
- " JO" : r " \\ d{5} " ,
95
- " KZ" : r " \\ d{6} " ,
96
- " KE" : r " \\ d{5} " ,
97
- " KW" : r " \\ d{5} " ,
98
- " LA" : r " \\ d{5} " ,
99
- " LV" : r " \\ d{4} " ,
100
- " LB" : r " ( \\ d{4} ( [ ]? \ \ d{4} ) ? ) ? " ,
86
+ " BN" : r " [A-Z ]{2} [ ]? \d {4} " ,
87
+ " BG" : r " \d {4} " ,
88
+ " KH" : r " \d {5} " ,
89
+ " CV" : r " \d {4} " ,
90
+ " CL" : r " \d {7} " ,
91
+ " CR" : r " \d {4,5} | \d {3} -\d {4} " ,
92
+ " HR" : r " \d {5} " ,
93
+ " CY" : r " \d {4} " ,
94
+ " CZ" : r " \d {3} [ ]? \d {2} " ,
95
+ " DO" : r " \d {5} " ,
96
+ " EC" : r " ( [A-Z ]\d {4} [A-Z ]| (?: [A-Z ]{2} ) ? \d {6} ) ? " ,
97
+ " EG" : r " \d {5} " ,
98
+ " EE" : r " \d {5} " ,
99
+ " FO" : r " \d {3} " ,
100
+ " GE" : r " \d {4} " ,
101
+ " GR" : r " \d {3} [ ]? \d {2} " ,
102
+ " GL" : r " 39\d {2} " ,
103
+ " GT" : r " \d {5} " ,
104
+ " HT" : r " \d {4} " ,
105
+ " HN" : r " (?: \d {5} ) ? " ,
106
+ " HU" : r " \d {4} " ,
107
+ " IS" : r " \d {3} " ,
108
+ " IN" : r " \d {6} " ,
109
+ " ID" : r " \d {5} " ,
110
+ " IL" : r " \d {5} " ,
111
+ " JO" : r " \d {5} " ,
112
+ " KZ" : r " \d {6} " ,
113
+ " KE" : r " \d {5} " ,
114
+ " KW" : r " \d {5} " ,
115
+ " LA" : r " \d {5} " ,
116
+ " LV" : r " \d {4} " ,
117
+ " LB" : r " ( \d {4} ( [ ]? \d {4} ) ? ) ? " ,
101
118
" LI" : r " ( 948[5-9 ]) | ( 949[0-7 ]) " ,
102
- " LT" : r " \\ d{5} " ,
103
- " LU" : r " \\ d{4} " ,
104
- " MK" : r " \\ d{4} " ,
105
- " MY" : r " \\ d{5} " ,
106
- " MV" : r " \\ d{5} " ,
107
- " MT" : r " [A-Z ]{3} [ ]? \\ d{2,4} " ,
108
- " MU" : r " ( \\ d{3} [A-Z ]{2} \ \ d{3} ) ? " ,
109
- " MX" : r " \\ d{5} " ,
110
- " MD" : r " \\ d{4} " ,
111
- " MC" : r " 980\\ d{2} " ,
112
- " MA" : r " \\ d{5} " ,
113
- " NP" : r " \\ d{5} " ,
114
- " NZ" : r " \\ d{4} " ,
115
- " NI" : r " (( \\ d{4} -) ? \\ d{3} -\\ d{3} ( -\ \ d{1} ) ? ) ? " ,
116
- " NG" : r " ( \\ d{6} ) ? " ,
117
- " OM" : r " ( PC ) ? \\ d{3} " ,
118
- " PK" : r " \\ d{5} " ,
119
- " PY" : r " \\ d{4} " ,
120
- " PH" : r " \\ d{4} " ,
121
- " PL" : r " \\ d{2} -\ \ d{3} " ,
122
- " PR" : r " 00[679 ]\\ d{2} ( [ \\ - ]\ \ d{4} ) ? " ,
123
- " RO" : r " \\ d{6} " ,
124
- " RU" : r " \\ d{6} " ,
125
- " SM" : r " 4789\\ d" ,
126
- " SA" : r " \\ d{5} " ,
127
- " SN" : r " \\ d{5} " ,
128
- " SK" : r " \\ d{3} [ ]? \ \ d{2} " ,
129
- " SI" : r " \\ d{4} " ,
130
- " ZA" : r " \\ d{4} " ,
131
- " LK" : r " \\ d{5} " ,
132
- " TJ" : r " \\ d{6} " ,
133
- " TH" : r " \\ d{5} " ,
134
- " TN" : r " \\ d{4} " ,
135
- " TR" : r " \\ d{5} " ,
136
- " TM" : r " \\ d{6} " ,
137
- " UA" : r " \\ d{5} " ,
138
- " UY" : r " \\ d{5} " ,
139
- " UZ" : r " \\ d{6} " ,
119
+ " LT" : r " \d {5} " ,
120
+ " LU" : r " \d {4} " ,
121
+ " MK" : r " \d {4} " ,
122
+ " MY" : r " \d {5} " ,
123
+ " MV" : r " \d {5} " ,
124
+ " MT" : r " [A-Z ]{3} [ ]? \d {2,4} " ,
125
+ " MU" : r " ( \d {3} [A-Z ]{2} \d {3} ) ? " ,
126
+ " MX" : r " \d {5} " ,
127
+ " MD" : r " \d {4} " ,
128
+ " MC" : r " 980\d {2} " ,
129
+ " MA" : r " \d {5} " ,
130
+ " NP" : r " \d {5} " ,
131
+ " NZ" : r " \d {4} " ,
132
+ " NI" : r " (( \d {4} -) ? \d {3} -\d {3} ( -\d {1} ) ? ) ? " ,
133
+ " NG" : r " ( \d {6} ) ? " ,
134
+ " OM" : r " ( PC ) ? \d {3} " ,
135
+ " PK" : r " \d {5} " ,
136
+ " PY" : r " \d {4} " ,
137
+ " PH" : r " \d {4} " ,
138
+ " PL" : r " \d {2} -\d {3} " ,
139
+ " PR" : r " 00[679 ]\d {2} ( [ \\ - ]\d {4} ) ? " ,
140
+ " RO" : r " \d {6} " ,
141
+ " RU" : r " \d {6} " ,
142
+ " SM" : r " 4789\d " ,
143
+ " SA" : r " \d {5} " ,
144
+ " SN" : r " \d {5} " ,
145
+ " SK" : r " \d {3} [ ]? \d {2} " ,
146
+ " SI" : r " \d {4} " ,
147
+ " ZA" : r " \d {4} " ,
148
+ " LK" : r " \d {5} " ,
149
+ " TJ" : r " \d {6} " ,
150
+ " TH" : r " \d {5} " ,
151
+ " TN" : r " \d {4} " ,
152
+ " TR" : r " \d {5} " ,
153
+ " TM" : r " \d {6} " ,
154
+ " UA" : r " \d {5} " ,
155
+ " UY" : r " \d {5} " ,
156
+ " UZ" : r " \d {6} " ,
140
157
" VA" : r " 00120" ,
141
- " VE" : r " \\ d{4} " ,
142
- " ZM" : r " \\ d{5} " ,
158
+ " VE" : r " \d {4} " ,
159
+ " ZM" : r " \d {5} " ,
143
160
" AS" : r " 96799" ,
144
161
" CC" : r " 6799" ,
145
- " CK" : r " \\ d{4} " ,
146
- " RS" : r " \\ d{6} " ,
147
- " ME" : r " 8\\ d{4} " ,
148
- " CS" : r " \\ d{5} " ,
149
- " YU" : r " \\ d{5} " ,
162
+ " CK" : r " \d {4} " ,
163
+ " RS" : r " \d {6} " ,
164
+ " ME" : r " 8\d {4} " ,
165
+ " CS" : r " \d {5} " ,
166
+ " YU" : r " \d {5} " ,
150
167
" CX" : r " 6798" ,
151
- " ET" : r " \\ d{4} " ,
168
+ " ET" : r " \d {4} " ,
152
169
" FK" : r " FIQQ 1ZZ" ,
153
170
" NF" : r " 2899" ,
154
- " FM" : r " ( 9694[1-4 ]) ( [ \\ - ]\\ d{4} ) ? " ,
155
- " GF" : r " 9[78 ]3\\ d{2} " ,
156
- " GN" : r " \\ d{3} " ,
157
- " GP" : r " 9[78 ][01 ]\\ d{2} " ,
171
+ " FM" : r " ( 9694[1-4 ]) ( [ \\ - ]\d {4} ) ? " ,
172
+ " GF" : r " 9[78 ]3\d {2} " ,
173
+ " GN" : r " \d {3} " ,
174
+ " GP" : r " 9[78 ][01 ]\d {2} " ,
158
175
" GS" : r " SIQQ 1ZZ" ,
159
- " GU" : r " 969[123 ]\\ d( [ \\ - ]\ \ d{4} ) ? " ,
160
- " GW" : r " \\ d{4} " ,
161
- " HM" : r " \\ d{4} " ,
162
- " IQ" : r " \\ d{5} " ,
163
- " KG" : r " \\ d{6} " ,
164
- " LR" : r " \\ d{4} " ,
165
- " LS" : r " \\ d{3} " ,
166
- " MG" : r " \\ d{3} " ,
167
- " MH" : r " 969[67 ]\\ d( [ \\ - ]\ \ d{4} ) ? " ,
168
- " MN" : r " \\ d{6} " ,
169
- " MP" : r " 9695[012 ]( [ \\ - ]\\ d{4} ) ? " ,
170
- " MQ" : r " 9[78 ]2\\ d{2} " ,
171
- " NC" : r " 988\\ d{2} " ,
172
- " NE" : r " \\ d{4} " ,
173
- " VI" : r " 008(( [0-4 ]\\ d) | ( 5[01 ])) ( [ \\ - ]\ \ d{4} ) ? " ,
174
- " PF" : r " 987\\ d{2} " ,
175
- " PG" : r " \\ d{3} " ,
176
- " PM" : r " 9[78 ]5\\ d{2} " ,
176
+ " GU" : r " 969[123 ]\d ( [ \\ - ]\d {4} ) ? " ,
177
+ " GW" : r " \d {4} " ,
178
+ " HM" : r " \d {4} " ,
179
+ " IQ" : r " \d {5} " ,
180
+ " KG" : r " \d {6} " ,
181
+ " LR" : r " \d {4} " ,
182
+ " LS" : r " \d {3} " ,
183
+ " MG" : r " \d {3} " ,
184
+ " MH" : r " 969[67 ]\d ( [ \\ - ]\d {4} ) ? " ,
185
+ " MN" : r " \d {6} " ,
186
+ " MP" : r " 9695[012 ]( [ \\ - ]\d {4} ) ? " ,
187
+ " MQ" : r " 9[78 ]2\d {2} " ,
188
+ " NC" : r " 988\d {2} " ,
189
+ " NE" : r " \d {4} " ,
190
+ " VI" : r " 008(( [0-4 ]\d ) | ( 5[01 ])) ( [ \\ - ]\d {4} ) ? " ,
191
+ " PF" : r " 987\d {2} " ,
192
+ " PG" : r " \d {3} " ,
193
+ " PM" : r " 9[78 ]5\d {2} " ,
177
194
" PN" : r " PCRN 1ZZ" ,
178
195
" PW" : r " >96940" ,
179
- " RE" : r " 9[78 ]4\\ d{2} " ,
196
+ " RE" : r " 9[78 ]4\d {2} " ,
180
197
" SH" : r " ( ASCN| STHL) 1ZZ" ,
181
- " SJ" : r " \\ d{4} " ,
182
- " SO" : r " \\ d{5} " ,
183
- " SZ" : r " [HLMS ]\\ d{3} " ,
198
+ " SJ" : r " \d {4} " ,
199
+ " SO" : r " \d {5} " ,
200
+ " SZ" : r " [HLMS ]\d {3} " ,
184
201
" TC" : r " TKCA 1ZZ" ,
185
- " WF" : r " 986\\ d{2} " ,
186
- " XK" : r " \\ d{5} " ,
187
- " YT" : r " 976\\ d{2} "
202
+ " WF" : r " 986\d {2} " ,
203
+ " XK" : r " \d {5} " ,
204
+ " YT" : r " 976\d {2} "
188
205
}
189
206
207
+ # ↑ necessary bricks function
208
+ # -----------------------------------------------------------------------------------------
209
+ # ↓ example implementation (code further down below)
210
+
190
211
def example_integration ():
191
212
texts = [" 10 Downing Street London SW1A 2AA" ]
192
213
extraction_keyword = " zip code"
0 commit comments