Skip to content

Commit 6254a62

Browse files
#21 Factorize word_vectors reading and preparation to speed up feature preparation (#22)
1 parent 35fd7c5 commit 6254a62

File tree

2 files changed

+17
-11
lines changed

2 files changed

+17
-11
lines changed

sherlock/features/preprocessing.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,19 @@ def prepare_feature_extraction():
5252

5353
print("All files for extracting word and paragraph embeddings are present.")
5454

55+
def prepare_word_embeddings(word_vectors_path='../sherlock/features/glove.6B.50d.txt'):
    """Read a word-vector text file into a term -> vector lookup table.

    Each non-blank line is expected to hold a term followed by its vector
    components, all separated by single spaces.

    Parameters
    ----------
    word_vectors_path
        Location of the word-vector file. The default is a relative path,
        so it is resolved against the current working directory.

    Returns
    -------
    dict
        Maps each term (str) to a 1-D numpy float array of its components.
        If a term occurs more than once, the last occurrence wins.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line has no vector part or a non-numeric component.
    """
    word_to_embedding = {}

    # The context manager closes the file even if parsing a line fails.
    with open(word_vectors_path, encoding='utf-8') as word_vectors_f:
        for w in word_vectors_f:
            line = w.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue

            # Split only once: the first token is the term, the rest the vector.
            term, vector = line.split(' ', 1)
            word_to_embedding[term] = np.array(vector.split(' '), dtype=float)

    return word_to_embedding
67+
5568

5669
def convert_string_lists_to_lists(
5770
data: Union[pd.DataFrame, pd.Series],
@@ -116,6 +129,8 @@ def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
116129
"""
117130
prepare_feature_extraction()
118131

132+
word_to_embedding = prepare_word_embeddings()
133+
119134
features_list = []
120135
df_par = pd.DataFrame()
121136
n_samples = 1000
@@ -137,7 +152,7 @@ def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
137152

138153
f = OrderedDict(
139154
list(extract_bag_of_characters_features(raw_sample).items()) +
140-
list(extract_word_embeddings_features(raw_sample).items()) +
155+
list(extract_word_embeddings_features(raw_sample, word_to_embedding).items()) +
141156
list(extract_bag_of_words_features(raw_sample, n_values).items())
142157
)
143158
features_list.append(f)

sherlock/features/word_embeddings.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,12 @@
66

77
# Input: a single column in the form of a pandas series
88
# Output: ordered dictionary holding word embedding features
9-
def extract_word_embeddings_features(values):
9+
def extract_word_embeddings_features(values, word_to_embedding):
1010

1111
num_embeddings = 50
1212
f = OrderedDict()
1313
embeddings = []
1414

15-
word_vectors_f = open('../sherlock/features/glove.6B.50d.txt', encoding='utf-8')
16-
word_to_embedding = {}
17-
18-
for w in word_vectors_f:
19-
20-
term, vector = w.strip().split(' ', 1)
21-
vector = np.array(vector.split(' '), dtype=float)
22-
word_to_embedding[term] = vector
23-
2415
values = values.dropna()
2516

2617
for v in values:

0 commit comments

Comments
 (0)