@@ -52,6 +52,19 @@ def prepare_feature_extraction():
52
52
53
53
print ("All files for extracting word and paragraph embeddings are present." )
54
54
55
def prepare_word_embeddings(
    embeddings_path: str = '../sherlock/features/glove.6B.50d.txt',
) -> dict:
    """Load a term -> vector lookup table from a text embeddings file.

    Each non-blank line of the file is expected to hold a term followed by
    its vector components, all separated by single spaces, e.g.::

        the 0.418 0.24968 -0.41242

    Parameters
    ----------
    embeddings_path : str
        Path of the UTF-8 encoded embeddings file. Defaults to the location
        that was previously hard-coded in this function.

    Returns
    -------
    dict
        Maps each term (str) to a 1-D ``numpy.ndarray`` of floats.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a non-blank line has no vector part or a non-numeric component.
    """
    word_to_embedding = {}

    # The context manager guarantees the handle is closed, even when a
    # malformed line makes the parsing below raise.
    with open(embeddings_path, encoding='utf-8') as word_vectors_f:
        for line in word_vectors_f:
            entry = line.strip()
            if not entry:
                # Tolerate blank lines (typically a trailing newline at EOF).
                continue

            # Split only once: everything after the first space is the vector.
            term, components = entry.split(' ', 1)
            word_to_embedding[term] = np.array(
                components.split(' '), dtype=float
            )

    return word_to_embedding
67
+
55
68
56
69
def convert_string_lists_to_lists (
57
70
data : Union [pd .DataFrame , pd .Series ],
@@ -116,6 +129,8 @@ def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
116
129
"""
117
130
prepare_feature_extraction ()
118
131
132
+ word_to_embedding = prepare_word_embeddings ()
133
+
119
134
features_list = []
120
135
df_par = pd .DataFrame ()
121
136
n_samples = 1000
@@ -137,7 +152,7 @@ def extract_features(data: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
137
152
138
153
f = OrderedDict (
139
154
list (extract_bag_of_characters_features (raw_sample ).items ()) +
140
- list (extract_word_embeddings_features (raw_sample ).items ()) +
155
+ list (extract_word_embeddings_features (raw_sample , word_to_embedding ).items ()) +
141
156
list (extract_bag_of_words_features (raw_sample , n_values ).items ())
142
157
)
143
158
features_list .append (f )
0 commit comments