Skip to content

Commit 5b3ac69

Browse files
Bugfix/drive downloads (#42)
* Black changes. * Black changes * Add gdown dependency, and missing file overwriting ignore. * Update test import. * Remove unnecessary import. * Update original model with new trained weights, add tested notebooks from clean state, save classes file after fitting model. * Remove import of removed dependency. * Check if the zip file is present, to avoid confusion with symlink dir. * Change data download, remove symlink dir.
1 parent 6c7e847 commit 5b3ac69

14 files changed

+841
-758
lines changed

data/data

Lines changed: 0 additions & 1 deletion
This file was deleted.

model_files/classes_sherlock.npy

1.37 KB
Binary file not shown.

model_files/sherlock_weights.h5

-148 Bytes
Binary file not shown.

notebooks/00-use-sherlock-out-of-the-box.ipynb

Lines changed: 112 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
"# Using Sherlock out-of-the-box\n",
99
"This notebook shows how to predict a semantic type for a given table column.\n",
1010
"The steps are basically:\n",
11-
"- Extract features from a column.\n",
11+
"- Download files for word embedding and paragraph vector feature extraction (downloads only once) and initialize feature extraction models.\n",
12+
"- Extract features from table columns.\n",
1213
"- Initialize Sherlock.\n",
1314
"- Make a prediction for the feature representation of the column."
1415
]
@@ -44,11 +45,14 @@
4445
"metadata": {},
4546
"outputs": [
4647
{
47-
"name": "stderr",
48-
"output_type": "stream",
49-
"text": [
50-
"UsageError: Environment does not have key: PYTHONHASHSEED\n"
51-
]
48+
"data": {
49+
"text/plain": [
50+
"'13'"
51+
]
52+
},
53+
"execution_count": 2,
54+
"metadata": {},
55+
"output_type": "execute_result"
5256
}
5357
],
5458
"source": [
@@ -57,20 +61,10 @@
5761
},
5862
{
5963
"cell_type": "markdown",
60-
"id": "2b3b7967",
64+
"id": "f1101303",
6165
"metadata": {},
6266
"source": [
63-
"## Extract features"
64-
]
65-
},
66-
{
67-
"cell_type": "code",
68-
"execution_count": 8,
69-
"id": "164f74ff",
70-
"metadata": {},
71-
"outputs": [],
72-
"source": [
73-
"# helpers.download_data()"
67+
"## Initialize feature extraction models"
7468
]
7569
},
7670
{
@@ -93,9 +87,9 @@
9387
" \n",
9488
"All files for extracting word and paragraph embeddings are present.\n",
9589
"Initialising word embeddings\n",
96-
"Initialise Word Embeddings process took 0:00:05.607905 seconds.\n",
97-
"Initialise Doc2Vec Model, 400 dim, process took 0:00:02.443327 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n",
98-
"Initialised NLTK, process took 0:00:00.181374 seconds.\n"
90+
"Initialise Word Embeddings process took 0:00:05.513540 seconds.\n",
91+
"Initialise Doc2Vec Model, 400 dim, process took 0:00:04.191875 seconds. (filename = ../sherlock/features/par_vec_trained_400.pkl)\n",
92+
"Initialised NLTK, process took 0:00:00.209930 seconds.\n"
9993
]
10094
},
10195
{
@@ -117,9 +111,17 @@
117111
"initialise_nltk()"
118112
]
119113
},
114+
{
115+
"cell_type": "markdown",
116+
"id": "2b3b7967",
117+
"metadata": {},
118+
"source": [
119+
"## Extract features"
120+
]
121+
},
120122
{
121123
"cell_type": "code",
122-
"execution_count": 35,
124+
"execution_count": 4,
123125
"id": "db04ccf9",
124126
"metadata": {},
125127
"outputs": [],
@@ -128,26 +130,28 @@
128130
" [\n",
129131
" [\"Jane Smith\", \"Lute Ahorn\", \"Anna James\"],\n",
130132
" [\"Amsterdam\", \"Haarlem\", \"Zwolle\"],\n",
133+
" [\"Chabot Street 19\", \"1200 fifth Avenue\", \"Binnenkant 22, 1011BH\"]\n",
131134
" ],\n",
132135
" name=\"values\"\n",
133136
")"
134137
]
135138
},
136139
{
137140
"cell_type": "code",
138-
"execution_count": 36,
141+
"execution_count": 5,
139142
"id": "4875f6c7",
140143
"metadata": {},
141144
"outputs": [
142145
{
143146
"data": {
144147
"text/plain": [
145-
"0 [Jane Smith, Lute Ahorn, Anna James]\n",
146-
"1 [Amsterdam, Haarlem, Zwolle]\n",
148+
"0 [Jane Smith, Lute Ahorn, Anna James]\n",
149+
"1 [Amsterdam, Haarlem, Zwolle]\n",
150+
"2 [Chabot Street 19, 1200 fifth Avenue, Binnenka...\n",
147151
"Name: values, dtype: object"
148152
]
149153
},
150-
"execution_count": 36,
154+
"execution_count": 5,
151155
"metadata": {},
152156
"output_type": "execute_result"
153157
}
@@ -158,15 +162,15 @@
158162
},
159163
{
160164
"cell_type": "code",
161-
"execution_count": 37,
165+
"execution_count": 8,
162166
"id": "f7f2c846",
163167
"metadata": {},
164168
"outputs": [
165169
{
166170
"name": "stderr",
167171
"output_type": "stream",
168172
"text": [
169-
"Extracting Features: 100%|██████████| 2/2 [00:00<00:00, 62.37it/s]\n"
173+
"Extracting Features: 100%|██████████| 3/3 [00:00<00:00, 167.51it/s]"
170174
]
171175
},
172176
{
@@ -175,19 +179,26 @@
175179
"text": [
176180
"Exporting 1588 column features\n"
177181
]
182+
},
183+
{
184+
"name": "stderr",
185+
"output_type": "stream",
186+
"text": [
187+
"\n"
188+
]
178189
}
179190
],
180191
"source": [
181192
"extract_features(\n",
182193
" \"../temporary.csv\",\n",
183194
" data\n",
184195
")\n",
185-
"feature_vector = pd.read_csv(\"../temporary.csv\", dtype=np.float32)"
196+
"feature_vectors = pd.read_csv(\"../temporary.csv\", dtype=np.float32)"
186197
]
187198
},
188199
{
189200
"cell_type": "code",
190-
"execution_count": 38,
201+
"execution_count": 9,
191202
"id": "0c42ce71",
192203
"metadata": {},
193204
"outputs": [
@@ -241,95 +252,116 @@
241252
" <td>0.0</td>\n",
242253
" <td>0.0</td>\n",
243254
" <td>0.0</td>\n",
244-
" <td>0.0</td>\n",
255+
" <td>0.000000</td>\n",
245256
" <td>0.0</td>\n",
246257
" <td>0.0</td>\n",
247258
" <td>0.0</td>\n",
248259
" <td>0.0</td>\n",
249260
" <td>-3.0</td>\n",
250261
" <td>0.0</td>\n",
251262
" <td>...</td>\n",
252-
" <td>-0.115819</td>\n",
253-
" <td>0.023961</td>\n",
254-
" <td>-0.130739</td>\n",
255-
" <td>0.006393</td>\n",
256-
" <td>-0.135118</td>\n",
257-
" <td>-0.071956</td>\n",
258-
" <td>-0.051051</td>\n",
259-
" <td>-0.068307</td>\n",
260-
" <td>0.087342</td>\n",
261-
" <td>-0.145716</td>\n",
263+
" <td>-0.116468</td>\n",
264+
" <td>0.023982</td>\n",
265+
" <td>-0.130867</td>\n",
266+
" <td>0.006825</td>\n",
267+
" <td>-0.135098</td>\n",
268+
" <td>-0.070616</td>\n",
269+
" <td>-0.052172</td>\n",
270+
" <td>-0.067250</td>\n",
271+
" <td>0.086256</td>\n",
272+
" <td>-0.144385</td>\n",
262273
" </tr>\n",
263274
" <tr>\n",
264275
" <th>1</th>\n",
265276
" <td>0.0</td>\n",
266277
" <td>0.0</td>\n",
267278
" <td>0.0</td>\n",
268-
" <td>0.0</td>\n",
279+
" <td>0.000000</td>\n",
269280
" <td>0.0</td>\n",
270281
" <td>0.0</td>\n",
271282
" <td>0.0</td>\n",
272283
" <td>0.0</td>\n",
273284
" <td>-3.0</td>\n",
274285
" <td>0.0</td>\n",
275286
" <td>...</td>\n",
276-
" <td>-0.054351</td>\n",
277-
" <td>0.023650</td>\n",
278-
" <td>-0.165681</td>\n",
279-
" <td>-0.016137</td>\n",
280-
" <td>-0.059402</td>\n",
281-
" <td>0.008454</td>\n",
282-
" <td>-0.044624</td>\n",
283-
" <td>0.025160</td>\n",
284-
" <td>0.037831</td>\n",
285-
" <td>-0.086235</td>\n",
287+
" <td>-0.054949</td>\n",
288+
" <td>0.024502</td>\n",
289+
" <td>-0.166001</td>\n",
290+
" <td>-0.014375</td>\n",
291+
" <td>-0.058199</td>\n",
292+
" <td>0.009978</td>\n",
293+
" <td>-0.046423</td>\n",
294+
" <td>0.025163</td>\n",
295+
" <td>0.036946</td>\n",
296+
" <td>-0.086611</td>\n",
297+
" </tr>\n",
298+
" <tr>\n",
299+
" <th>2</th>\n",
300+
" <td>1.0</td>\n",
301+
" <td>0.0</td>\n",
302+
" <td>1.0</td>\n",
303+
" <td>0.666667</td>\n",
304+
" <td>0.0</td>\n",
305+
" <td>2.0</td>\n",
306+
" <td>1.0</td>\n",
307+
" <td>3.0</td>\n",
308+
" <td>-1.5</td>\n",
309+
" <td>0.0</td>\n",
310+
" <td>...</td>\n",
311+
" <td>-0.022804</td>\n",
312+
" <td>0.001741</td>\n",
313+
" <td>0.047479</td>\n",
314+
" <td>0.118293</td>\n",
315+
" <td>-0.093435</td>\n",
316+
" <td>0.036759</td>\n",
317+
" <td>-0.004508</td>\n",
318+
" <td>-0.087898</td>\n",
319+
" <td>-0.117796</td>\n",
320+
" <td>-0.191386</td>\n",
286321
" </tr>\n",
287322
" </tbody>\n",
288323
"</table>\n",
289-
"<p>2 rows × 1588 columns</p>\n",
324+
"<p>3 rows × 1588 columns</p>\n",
290325
"</div>"
291326
],
292327
"text/plain": [
293328
" n_[0]-agg-any n_[0]-agg-all n_[0]-agg-mean n_[0]-agg-var n_[0]-agg-min \\\n",
294-
"0 0.0 0.0 0.0 0.0 0.0 \n",
295-
"1 0.0 0.0 0.0 0.0 0.0 \n",
329+
"0 0.0 0.0 0.0 0.000000 0.0 \n",
330+
"1 0.0 0.0 0.0 0.000000 0.0 \n",
331+
"2 1.0 0.0 1.0 0.666667 0.0 \n",
296332
"\n",
297333
" n_[0]-agg-max n_[0]-agg-median n_[0]-agg-sum n_[0]-agg-kurtosis \\\n",
298334
"0 0.0 0.0 0.0 -3.0 \n",
299335
"1 0.0 0.0 0.0 -3.0 \n",
336+
"2 2.0 1.0 3.0 -1.5 \n",
300337
"\n",
301338
" n_[0]-agg-skewness ... par_vec_390 par_vec_391 par_vec_392 \\\n",
302-
"0 0.0 ... -0.115819 0.023961 -0.130739 \n",
303-
"1 0.0 ... -0.054351 0.023650 -0.165681 \n",
339+
"0 0.0 ... -0.116468 0.023982 -0.130867 \n",
340+
"1 0.0 ... -0.054949 0.024502 -0.166001 \n",
341+
"2 0.0 ... -0.022804 0.001741 0.047479 \n",
304342
"\n",
305343
" par_vec_393 par_vec_394 par_vec_395 par_vec_396 par_vec_397 \\\n",
306-
"0 0.006393 -0.135118 -0.071956 -0.051051 -0.068307 \n",
307-
"1 -0.016137 -0.059402 0.008454 -0.044624 0.025160 \n",
344+
"0 0.006825 -0.135098 -0.070616 -0.052172 -0.067250 \n",
345+
"1 -0.014375 -0.058199 0.009978 -0.046423 0.025163 \n",
346+
"2 0.118293 -0.093435 0.036759 -0.004508 -0.087898 \n",
308347
"\n",
309348
" par_vec_398 par_vec_399 \n",
310-
"0 0.087342 -0.145716 \n",
311-
"1 0.037831 -0.086235 \n",
349+
"0 0.086256 -0.144385 \n",
350+
"1 0.036946 -0.086611 \n",
351+
"2 -0.117796 -0.191386 \n",
312352
"\n",
313-
"[2 rows x 1588 columns]"
353+
"[3 rows x 1588 columns]"
314354
]
315355
},
316-
"execution_count": 38,
356+
"execution_count": 9,
317357
"metadata": {},
318358
"output_type": "execute_result"
319359
}
320360
],
321361
"source": [
322-
"feature_vector"
362+
"feature_vectors"
323363
]
324364
},
325-
{
326-
"cell_type": "code",
327-
"execution_count": null,
328-
"id": "52047a6b",
329-
"metadata": {},
330-
"outputs": [],
331-
"source": []
332-
},
333365
{
334366
"cell_type": "code",
335367
"execution_count": null,
@@ -343,18 +375,18 @@
343375
"id": "9027fa4a",
344376
"metadata": {},
345377
"source": [
346-
"## Initialize Sherlock."
378+
"## Initialize Sherlock"
347379
]
348380
},
349381
{
350382
"cell_type": "code",
351-
"execution_count": 39,
383+
"execution_count": 11,
352384
"id": "b9ec13ec",
353385
"metadata": {},
354386
"outputs": [],
355387
"source": [
356388
"model = SherlockModel();\n",
357-
"model.initialize_model_from_json(with_weights=True);"
389+
"model.initialize_model_from_json(with_weights=True, model_id=\"sherlock\");"
358390
]
359391
},
360392
{
@@ -375,27 +407,27 @@
375407
},
376408
{
377409
"cell_type": "code",
378-
"execution_count": 40,
410+
"execution_count": 12,
379411
"id": "fc079fa9",
380412
"metadata": {},
381413
"outputs": [],
382414
"source": [
383-
"predicted_labels = model.predict(feature_vector, \"sherlock\")"
415+
"predicted_labels = model.predict(feature_vectors, \"sherlock\")"
384416
]
385417
},
386418
{
387419
"cell_type": "code",
388-
"execution_count": 41,
420+
"execution_count": 13,
389421
"id": "0feb9584",
390422
"metadata": {},
391423
"outputs": [
392424
{
393425
"data": {
394426
"text/plain": [
395-
"array(['creator', 'city'], dtype=object)"
427+
"array(['person', 'city', 'address'], dtype=object)"
396428
]
397429
},
398-
"execution_count": 41,
430+
"execution_count": 13,
399431
"metadata": {},
400432
"output_type": "execute_result"
401433
}

0 commit comments

Comments
 (0)