|
8 | 8 | "# Using Sherlock out-of-the-box\n",
|
9 | 9 | "This notebook shows how to predict a semantic type for a given table column.\n",
|
10 | 10 | "The steps are basically:\n",
|
11 |
| - "- Extract features from a column.\n", |
| 11 | + "- Download the files needed for word-embedding and paragraph-vector feature extraction (this happens only once) and initialize the feature extraction models.\n", |
| 12 | + "- Extract features from table columns.\n", |
12 | 13 | "- Initialize Sherlock.\n",
|
13 | 14 | "- Make a prediction for the feature representation of each column."
|
14 | 15 | ]
|
|
44 | 45 | "metadata": {},
|
45 | 46 | "outputs": [
|
46 | 47 | {
|
47 |
| - "name": "stderr", |
48 |
| - "output_type": "stream", |
49 |
| - "text": [ |
50 |
| - "UsageError: Environment does not have key: PYTHONHASHSEED\n" |
51 |
| - ] |
| 48 | + "data": { |
| 49 | + "text/plain": [ |
| 50 | + "'13'" |
| 51 | + ] |
| 52 | + }, |
| 53 | + "execution_count": 2, |
| 54 | + "metadata": {}, |
| 55 | + "output_type": "execute_result" |
52 | 56 | }
|
53 | 57 | ],
|
54 | 58 | "source": [
|
|
57 | 61 | },
|
58 | 62 | {
|
59 | 63 | "cell_type": "markdown",
|
60 |
| - "id": "2b3b7967", |
| 64 | + "id": "f1101303", |
61 | 65 | "metadata": {},
|
62 | 66 | "source": [
|
63 |
| - "## Extract features" |
64 |
| - ] |
65 |
| - }, |
66 |
| - { |
67 |
| - "cell_type": "code", |
68 |
| - "execution_count": 8, |
69 |
| - "id": "164f74ff", |
70 |
| - "metadata": {}, |
71 |
| - "outputs": [], |
72 |
| - "source": [ |
73 |
| - "# helpers.download_data()" |
| 67 | + "## Initialize feature extraction models" |
74 | 68 | ]
|
75 | 69 | },
|
76 | 70 | {
|
|
93 | 87 | " \n",
|
94 | 88 | "All files for extracting word and paragraph embeddings are present.\n",
|
95 | 89 | "Initialising word embeddings\n",
|
96 |
| - "Initialise Word Embeddings process took 0:00:05.607905 seconds.\n", |
97 |
| - "Initialise Doc2Vec Model, 400 dim, process took 0:00:02.443327 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n", |
98 |
| - "Initialised NLTK, process took 0:00:00.181374 seconds.\n" |
| 90 | + "Initialise Word Embeddings process took 0:00:05.513540 seconds.\n", |
| 91 | + "Initialise Doc2Vec Model, 400 dim, process took 0:00:04.191875 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n", |
| 92 | + "Initialised NLTK, process took 0:00:00.209930 seconds.\n" |
99 | 93 | ]
|
100 | 94 | },
|
101 | 95 | {
|
|
117 | 111 | "initialise_nltk()"
|
118 | 112 | ]
|
119 | 113 | },
|
| 114 | + { |
| 115 | + "cell_type": "markdown", |
| 116 | + "id": "2b3b7967", |
| 117 | + "metadata": {}, |
| 118 | + "source": [ |
| 119 | + "## Extract features" |
| 120 | + ] |
| 121 | + }, |
120 | 122 | {
|
121 | 123 | "cell_type": "code",
|
122 |
| - "execution_count": 35, |
| 124 | + "execution_count": 4, |
123 | 125 | "id": "db04ccf9",
|
124 | 126 | "metadata": {},
|
125 | 127 | "outputs": [],
|
|
128 | 130 | " [\n",
|
129 | 131 | " [\"Jane Smith\", \"Lute Ahorn\", \"Anna James\"],\n",
|
130 | 132 | " [\"Amsterdam\", \"Haarlem\", \"Zwolle\"],\n",
|
| 133 | + " [\"Chabot Street 19\", \"1200 fifth Avenue\", \"Binnenkant 22, 1011BH\"]\n", |
131 | 134 | " ],\n",
|
132 | 135 | " name=\"values\"\n",
|
133 | 136 | ")"
|
134 | 137 | ]
|
135 | 138 | },
|
136 | 139 | {
|
137 | 140 | "cell_type": "code",
|
138 |
| - "execution_count": 36, |
| 141 | + "execution_count": 5, |
139 | 142 | "id": "4875f6c7",
|
140 | 143 | "metadata": {},
|
141 | 144 | "outputs": [
|
142 | 145 | {
|
143 | 146 | "data": {
|
144 | 147 | "text/plain": [
|
145 |
| - "0 [Jane Smith, Lute Ahorn, Anna James]\n", |
146 |
| - "1 [Amsterdam, Haarlem, Zwolle]\n", |
| 148 | + "0 [Jane Smith, Lute Ahorn, Anna James]\n", |
| 149 | + "1 [Amsterdam, Haarlem, Zwolle]\n", |
| 150 | + "2 [Chabot Street 19, 1200 fifth Avenue, Binnenka...\n", |
147 | 151 | "Name: values, dtype: object"
|
148 | 152 | ]
|
149 | 153 | },
|
150 |
| - "execution_count": 36, |
| 154 | + "execution_count": 5, |
151 | 155 | "metadata": {},
|
152 | 156 | "output_type": "execute_result"
|
153 | 157 | }
|
|
158 | 162 | },
|
159 | 163 | {
|
160 | 164 | "cell_type": "code",
|
161 |
| - "execution_count": 37, |
| 165 | + "execution_count": 8, |
162 | 166 | "id": "f7f2c846",
|
163 | 167 | "metadata": {},
|
164 | 168 | "outputs": [
|
165 | 169 | {
|
166 | 170 | "name": "stderr",
|
167 | 171 | "output_type": "stream",
|
168 | 172 | "text": [
|
169 |
| - "Extracting Features: 100%|██████████| 2/2 [00:00<00:00, 62.37it/s]\n" |
| 173 | + "Extracting Features: 100%|██████████| 3/3 [00:00<00:00, 167.51it/s]" |
170 | 174 | ]
|
171 | 175 | },
|
172 | 176 | {
|
|
175 | 179 | "text": [
|
176 | 180 | "Exporting 1588 column features\n"
|
177 | 181 | ]
|
| 182 | + }, |
| 183 | + { |
| 184 | + "name": "stderr", |
| 185 | + "output_type": "stream", |
| 186 | + "text": [ |
| 187 | + "\n" |
| 188 | + ] |
178 | 189 | }
|
179 | 190 | ],
|
180 | 191 | "source": [
|
181 | 192 | "extract_features(\n",
|
182 | 193 | " \"../temporary.csv\",\n",
|
183 | 194 | " data\n",
|
184 | 195 | ")\n",
|
185 |
| - "feature_vector = pd.read_csv(\"../temporary.csv\", dtype=np.float32)" |
| 196 | + "feature_vectors = pd.read_csv(\"../temporary.csv\", dtype=np.float32)" |
186 | 197 | ]
|
187 | 198 | },
|
188 | 199 | {
|
189 | 200 | "cell_type": "code",
|
190 |
| - "execution_count": 38, |
| 201 | + "execution_count": 9, |
191 | 202 | "id": "0c42ce71",
|
192 | 203 | "metadata": {},
|
193 | 204 | "outputs": [
|
|
241 | 252 | " <td>0.0</td>\n",
|
242 | 253 | " <td>0.0</td>\n",
|
243 | 254 | " <td>0.0</td>\n",
|
244 |
| - " <td>0.0</td>\n", |
| 255 | + " <td>0.000000</td>\n", |
245 | 256 | " <td>0.0</td>\n",
|
246 | 257 | " <td>0.0</td>\n",
|
247 | 258 | " <td>0.0</td>\n",
|
248 | 259 | " <td>0.0</td>\n",
|
249 | 260 | " <td>-3.0</td>\n",
|
250 | 261 | " <td>0.0</td>\n",
|
251 | 262 | " <td>...</td>\n",
|
252 |
| - " <td>-0.115819</td>\n", |
253 |
| - " <td>0.023961</td>\n", |
254 |
| - " <td>-0.130739</td>\n", |
255 |
| - " <td>0.006393</td>\n", |
256 |
| - " <td>-0.135118</td>\n", |
257 |
| - " <td>-0.071956</td>\n", |
258 |
| - " <td>-0.051051</td>\n", |
259 |
| - " <td>-0.068307</td>\n", |
260 |
| - " <td>0.087342</td>\n", |
261 |
| - " <td>-0.145716</td>\n", |
| 263 | + " <td>-0.116468</td>\n", |
| 264 | + " <td>0.023982</td>\n", |
| 265 | + " <td>-0.130867</td>\n", |
| 266 | + " <td>0.006825</td>\n", |
| 267 | + " <td>-0.135098</td>\n", |
| 268 | + " <td>-0.070616</td>\n", |
| 269 | + " <td>-0.052172</td>\n", |
| 270 | + " <td>-0.067250</td>\n", |
| 271 | + " <td>0.086256</td>\n", |
| 272 | + " <td>-0.144385</td>\n", |
262 | 273 | " </tr>\n",
|
263 | 274 | " <tr>\n",
|
264 | 275 | " <th>1</th>\n",
|
265 | 276 | " <td>0.0</td>\n",
|
266 | 277 | " <td>0.0</td>\n",
|
267 | 278 | " <td>0.0</td>\n",
|
268 |
| - " <td>0.0</td>\n", |
| 279 | + " <td>0.000000</td>\n", |
269 | 280 | " <td>0.0</td>\n",
|
270 | 281 | " <td>0.0</td>\n",
|
271 | 282 | " <td>0.0</td>\n",
|
272 | 283 | " <td>0.0</td>\n",
|
273 | 284 | " <td>-3.0</td>\n",
|
274 | 285 | " <td>0.0</td>\n",
|
275 | 286 | " <td>...</td>\n",
|
276 |
| - " <td>-0.054351</td>\n", |
277 |
| - " <td>0.023650</td>\n", |
278 |
| - " <td>-0.165681</td>\n", |
279 |
| - " <td>-0.016137</td>\n", |
280 |
| - " <td>-0.059402</td>\n", |
281 |
| - " <td>0.008454</td>\n", |
282 |
| - " <td>-0.044624</td>\n", |
283 |
| - " <td>0.025160</td>\n", |
284 |
| - " <td>0.037831</td>\n", |
285 |
| - " <td>-0.086235</td>\n", |
| 287 | + " <td>-0.054949</td>\n", |
| 288 | + " <td>0.024502</td>\n", |
| 289 | + " <td>-0.166001</td>\n", |
| 290 | + " <td>-0.014375</td>\n", |
| 291 | + " <td>-0.058199</td>\n", |
| 292 | + " <td>0.009978</td>\n", |
| 293 | + " <td>-0.046423</td>\n", |
| 294 | + " <td>0.025163</td>\n", |
| 295 | + " <td>0.036946</td>\n", |
| 296 | + " <td>-0.086611</td>\n", |
| 297 | + " </tr>\n", |
| 298 | + " <tr>\n", |
| 299 | + " <th>2</th>\n", |
| 300 | + " <td>1.0</td>\n", |
| 301 | + " <td>0.0</td>\n", |
| 302 | + " <td>1.0</td>\n", |
| 303 | + " <td>0.666667</td>\n", |
| 304 | + " <td>0.0</td>\n", |
| 305 | + " <td>2.0</td>\n", |
| 306 | + " <td>1.0</td>\n", |
| 307 | + " <td>3.0</td>\n", |
| 308 | + " <td>-1.5</td>\n", |
| 309 | + " <td>0.0</td>\n", |
| 310 | + " <td>...</td>\n", |
| 311 | + " <td>-0.022804</td>\n", |
| 312 | + " <td>0.001741</td>\n", |
| 313 | + " <td>0.047479</td>\n", |
| 314 | + " <td>0.118293</td>\n", |
| 315 | + " <td>-0.093435</td>\n", |
| 316 | + " <td>0.036759</td>\n", |
| 317 | + " <td>-0.004508</td>\n", |
| 318 | + " <td>-0.087898</td>\n", |
| 319 | + " <td>-0.117796</td>\n", |
| 320 | + " <td>-0.191386</td>\n", |
286 | 321 | " </tr>\n",
|
287 | 322 | " </tbody>\n",
|
288 | 323 | "</table>\n",
|
289 |
| - "<p>2 rows × 1588 columns</p>\n", |
| 324 | + "<p>3 rows × 1588 columns</p>\n", |
290 | 325 | "</div>"
|
291 | 326 | ],
|
292 | 327 | "text/plain": [
|
293 | 328 | " n_[0]-agg-any n_[0]-agg-all n_[0]-agg-mean n_[0]-agg-var n_[0]-agg-min \\\n",
|
294 |
| - "0 0.0 0.0 0.0 0.0 0.0 \n", |
295 |
| - "1 0.0 0.0 0.0 0.0 0.0 \n", |
| 329 | + "0 0.0 0.0 0.0 0.000000 0.0 \n", |
| 330 | + "1 0.0 0.0 0.0 0.000000 0.0 \n", |
| 331 | + "2 1.0 0.0 1.0 0.666667 0.0 \n", |
296 | 332 | "\n",
|
297 | 333 | " n_[0]-agg-max n_[0]-agg-median n_[0]-agg-sum n_[0]-agg-kurtosis \\\n",
|
298 | 334 | "0 0.0 0.0 0.0 -3.0 \n",
|
299 | 335 | "1 0.0 0.0 0.0 -3.0 \n",
|
| 336 | + "2 2.0 1.0 3.0 -1.5 \n", |
300 | 337 | "\n",
|
301 | 338 | " n_[0]-agg-skewness ... par_vec_390 par_vec_391 par_vec_392 \\\n",
|
302 |
| - "0 0.0 ... -0.115819 0.023961 -0.130739 \n", |
303 |
| - "1 0.0 ... -0.054351 0.023650 -0.165681 \n", |
| 339 | + "0 0.0 ... -0.116468 0.023982 -0.130867 \n", |
| 340 | + "1 0.0 ... -0.054949 0.024502 -0.166001 \n", |
| 341 | + "2 0.0 ... -0.022804 0.001741 0.047479 \n", |
304 | 342 | "\n",
|
305 | 343 | " par_vec_393 par_vec_394 par_vec_395 par_vec_396 par_vec_397 \\\n",
|
306 |
| - "0 0.006393 -0.135118 -0.071956 -0.051051 -0.068307 \n", |
307 |
| - "1 -0.016137 -0.059402 0.008454 -0.044624 0.025160 \n", |
| 344 | + "0 0.006825 -0.135098 -0.070616 -0.052172 -0.067250 \n", |
| 345 | + "1 -0.014375 -0.058199 0.009978 -0.046423 0.025163 \n", |
| 346 | + "2 0.118293 -0.093435 0.036759 -0.004508 -0.087898 \n", |
308 | 347 | "\n",
|
309 | 348 | " par_vec_398 par_vec_399 \n",
|
310 |
| - "0 0.087342 -0.145716 \n", |
311 |
| - "1 0.037831 -0.086235 \n", |
| 349 | + "0 0.086256 -0.144385 \n", |
| 350 | + "1 0.036946 -0.086611 \n", |
| 351 | + "2 -0.117796 -0.191386 \n", |
312 | 352 | "\n",
|
313 |
| - "[2 rows x 1588 columns]" |
| 353 | + "[3 rows x 1588 columns]" |
314 | 354 | ]
|
315 | 355 | },
|
316 |
| - "execution_count": 38, |
| 356 | + "execution_count": 9, |
317 | 357 | "metadata": {},
|
318 | 358 | "output_type": "execute_result"
|
319 | 359 | }
|
320 | 360 | ],
|
321 | 361 | "source": [
|
322 |
| - "feature_vector" |
| 362 | + "feature_vectors" |
323 | 363 | ]
|
324 | 364 | },
|
325 |
| - { |
326 |
| - "cell_type": "code", |
327 |
| - "execution_count": null, |
328 |
| - "id": "52047a6b", |
329 |
| - "metadata": {}, |
330 |
| - "outputs": [], |
331 |
| - "source": [] |
332 |
| - }, |
333 | 365 | {
|
334 | 366 | "cell_type": "code",
|
335 | 367 | "execution_count": null,
|
|
343 | 375 | "id": "9027fa4a",
|
344 | 376 | "metadata": {},
|
345 | 377 | "source": [
|
346 |
| - "## Initialize Sherlock." |
| 378 | + "## Initialize Sherlock" |
347 | 379 | ]
|
348 | 380 | },
|
349 | 381 | {
|
350 | 382 | "cell_type": "code",
|
351 |
| - "execution_count": 39, |
| 383 | + "execution_count": 11, |
352 | 384 | "id": "b9ec13ec",
|
353 | 385 | "metadata": {},
|
354 | 386 | "outputs": [],
|
355 | 387 | "source": [
|
356 | 388 | "model = SherlockModel();\n",
|
357 |
| - "model.initialize_model_from_json(with_weights=True);" |
| 389 | + "model.initialize_model_from_json(with_weights=True, model_id=\"sherlock\");" |
358 | 390 | ]
|
359 | 391 | },
|
360 | 392 | {
|
|
375 | 407 | },
|
376 | 408 | {
|
377 | 409 | "cell_type": "code",
|
378 |
| - "execution_count": 40, |
| 410 | + "execution_count": 12, |
379 | 411 | "id": "fc079fa9",
|
380 | 412 | "metadata": {},
|
381 | 413 | "outputs": [],
|
382 | 414 | "source": [
|
383 |
| - "predicted_labels = model.predict(feature_vector, \"sherlock\")" |
| 415 | + "predicted_labels = model.predict(feature_vectors, \"sherlock\")" |
384 | 416 | ]
|
385 | 417 | },
|
386 | 418 | {
|
387 | 419 | "cell_type": "code",
|
388 |
| - "execution_count": 41, |
| 420 | + "execution_count": 13, |
389 | 421 | "id": "0feb9584",
|
390 | 422 | "metadata": {},
|
391 | 423 | "outputs": [
|
392 | 424 | {
|
393 | 425 | "data": {
|
394 | 426 | "text/plain": [
|
395 |
| - "array(['creator', 'city'], dtype=object)" |
| 427 | + "array(['person', 'city', 'address'], dtype=object)" |
396 | 428 | ]
|
397 | 429 | },
|
398 |
| - "execution_count": 41, |
| 430 | + "execution_count": 13, |
399 | 431 | "metadata": {},
|
400 | 432 | "output_type": "execute_result"
|
401 | 433 | }
|
|
0 commit comments