README.md (10 changes: 5 additions & 5 deletions)

@@ -6,15 +6,15 @@ You can find the pre-processed datasets and the pre-trained word embeddings in [
 
 You can also download the original datasets of Restaurant domain and Beer domain in [[Download]](https://drive.google.com/open?id=1qzbTiJ2IL5ATZYNMp2DRkHvbFYsnOVAQ). For preprocessing, put the decompressed zip file in the main folder and run
 ```
-python word2vec.py
-python preprocess.py
+python3 word2vec.py
+python3 preprocess.py
 ```
 respectively in code/. The preprocessed files and trained word embeddings for each domain will be saved in a folder preprocessed_data/.
 
 ## Train
 Under code/, type the following command for training:
 ```
-THEANO_FLAGS="device=gpu0,floatX=float32" python train.py \
+KERAS_BACKEND=theano THEANO_FLAGS="device=gpu0,floatX=float32" python3 train.py \
 --emb ../preprocessed_data/$domain/w2v_embedding \
 --domain $domain \
 -o output_dir \
@@ -26,7 +26,7 @@ After training, two output files will be saved in code/output_dir/$domain/: 1) *
 ## Evaluation
 Under code/, type the following command:
 ```
-THEANO_FLAGS="device=gpu0,floatX=float32" python evaluation.py \
+KERAS_BACKEND=theano THEANO_FLAGS="device=gpu0,floatX=float32" python3 evaluation.py \
 --domain $domain \
 -o output_dir \
 ```
@@ -40,7 +40,7 @@ One example of trained model for the restaurant domain has been put in pre_train
 
 ## Dependencies
 
-python 2
+python 3
 
 * keras 1.2.1
 * theano 0.9.0
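A side note on the KERAS_BACKEND=theano prefix introduced above: Keras 1.x otherwise reads its backend from ~/.keras/keras.json, so the prefix is what guarantees Theano is used even on machines configured for TensorFlow. A minimal sanity check, assuming the pinned keras==1.2.1 is installed (this script is illustrative, not part of the repository):

```python
# check_backend.py -- hypothetical helper, run as:
#   KERAS_BACKEND=theano python3 check_backend.py
from keras import backend as K

# keras.backend.backend() returns the name of the active backend.
print(K.backend())  # expected output: 'theano'
```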
code/preprocess.py (6 changes: 3 additions & 3 deletions)

@@ -39,12 +39,12 @@ def preprocess_test(domain):
         out2.write(label+'\n')
 
 def preprocess(domain):
-    print '\t'+domain+' train set ...'
+    print('\t'+domain+' train set ...')
     preprocess_train(domain)
-    print '\t'+domain+' test set ...'
+    print('\t'+domain+' test set ...')
     preprocess_test(domain)
 
-print 'Preprocessing raw review sentences ...'
+print('Preprocessing raw review sentences ...')
 preprocess('restaurant')
 preprocess('beer')
 
code/reader.py (22 changes: 11 additions & 11 deletions)

@@ -30,7 +30,7 @@ def create_vocab(domain, maxlen=0, vocab_size=0):
                 word_freqs[w] = 1
             total_words += 1
 
-    print (' %i total words, %i unique words' % (total_words, unique_words))
+    print(' %i total words, %i unique words' % (total_words, unique_words))
     sorted_word_freqs = sorted(word_freqs.items(), key=operator.itemgetter(1), reverse=True)
 
     vocab = {'<pad>':0, '<unk>':1, '<num>':2}
@@ -41,7 +41,7 @@ def create_vocab(domain, maxlen=0, vocab_size=0):
         if vocab_size > 0 and index > vocab_size + 2:
             break
     if vocab_size > 0:
-        print (' keep the top %i words' % vocab_size)
+        print(' keep the top %i words' % vocab_size)
 
     #Write (vocab, frequence) to a txt file
     vocab_file = codecs.open('../preprocessed_data/%s/vocab' % domain, mode='w', encoding='utf8')
@@ -86,19 +86,19 @@ def read_dataset(domain, phase, vocab, maxlen):
         if maxlen_x < len(indices):
             maxlen_x = len(indices)
 
-    print ' <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (100*num_hit/total, 100*unk_hit/total)
+    print(' <num> hit rate: %.2f%%, <unk> hit rate: %.2f%%' % (100*num_hit/total, 100*unk_hit/total))
     return data_x, maxlen_x
 
 
 
 def get_data(domain, vocab_size=0, maxlen=0):
-    print 'Reading data from', domain
-    print ' Creating vocab ...'
+    print('Reading data from', domain)
+    print(' Creating vocab ...')
     vocab = create_vocab(domain, maxlen, vocab_size)
-    print ' Reading dataset ...'
-    print ' train set'
+    print(' Reading dataset ...')
+    print(' train set')
     train_x, train_maxlen = read_dataset(domain, 'train', vocab, maxlen)
-    print ' test set'
+    print(' test set')
     test_x, test_maxlen = read_dataset(domain, 'test', vocab, maxlen)
     maxlen = max(train_maxlen, test_maxlen)
     return vocab, train_x, test_x, maxlen
@@ -107,7 +107,7 @@ def get_data(domain, vocab_size=0, maxlen=0):
 
 if __name__ == "__main__":
     vocab, train_x, test_x, maxlen = get_data('restaurant')
-    print len(train_x)
-    print len(test_x)
-    print maxlen
+    print(len(train_x))
+    print(len(test_x))
+    print(maxlen)
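A nuance worth noting about the hit-rate line in read_dataset above: in Python 3, / is always true division, so 100*num_hit/total produces a float even when both operands are ints, whereas Python 2 would silently floor it for int operands. A minimal sketch of the difference, assuming integer counters (the actual counters in reader.py may already be floats):

```python
# Division semantics across the 2-to-3 port (illustrative values only).
num_hit, total = 7, 50
print(100 * num_hit / total)   # Python 3: 14.0 (true division)
print(100 * num_hit // total)  # 14 -- floors in both Python 2 and 3
```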

code/run_script.sh (4 changes: 1 addition & 3 deletions)

@@ -1,6 +1,4 @@
-
-
-THEANO_FLAGS="device=gpu0,floatX=float32" python train.py \
+KERAS_BACKEND=theano THEANO_FLAGS="device=gpu0,floatX=float32" python3 train.py \
 --emb ../preprocessed_data/restaurant/w2v_embedding \
 --domain restaurant \
 -o output_dir \
code/train.py (16 changes: 8 additions & 8 deletions)

@@ -54,8 +54,8 @@
 train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
 test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)
 
-print 'Number of training examples: ', len(train_x)
-print 'Length of vocab: ', len(vocab)
+print('Number of training examples: ', len(train_x))
+print('Length of vocab: ', len(vocab))
 
 def sentence_batch_generator(data, batch_size):
     n_batch = len(data) / batch_size
@@ -129,13 +129,13 @@ def max_margin_loss(y_true, y_pred):
 batches_per_epoch = 1000
 
 min_loss = float('inf')
-for ii in xrange(args.epochs):
+for ii in range(args.epochs):
     t0 = time()
     loss, max_margin_loss = 0., 0.
 
-    for b in tqdm(xrange(batches_per_epoch)):
-        sen_input = sen_gen.next()
-        neg_input = neg_gen.next()
+    for b in tqdm(range(batches_per_epoch)):
+        sen_input = next(sen_gen)
+        neg_input = next(neg_gen)
 
         batch_loss, batch_max_margin_loss = model.train_on_batch([sen_input, neg_input], np.ones((args.batch_size, 1)))
         loss += batch_loss / batches_per_epoch
@@ -157,8 +157,8 @@ def max_margin_loss(y_true, y_pred):
     sims = word_emb.dot(desc.T)
     ordered_words = np.argsort(sims)[::-1]
     desc_list = [vocab_inv[w] for w in ordered_words[:100]]
-    print 'Aspect %d:' % ind
-    print desc_list
+    print('Aspect %d:' % ind)
+    print(desc_list)
     aspect_file.write('Aspect %d:\n' % ind)
     aspect_file.write(' '.join(desc_list) + '\n\n')
 
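One line this port leaves untouched (visible as context in the first train.py hunk) is n_batch = len(data) / batch_size in sentence_batch_generator: under Python 3 that expression now yields a float rather than an int. If n_batch is used as a loop bound or index later in the generator, it would need floor division; a sketch of the likely fix, without claiming how n_batch is consumed in the rest of the file:

```python
# Python 2: len(data) / batch_size -> int (floored) for int operands.
# Python 3: the same expression    -> float, which breaks range()/indexing.
data_len, batch_size = 4000, 32
n_batch = data_len // batch_size  # 125 -- the usual Python 3 replacement
```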
code/w2vEmbReader.py (2 changes: 1 addition & 1 deletion)

@@ -41,7 +41,7 @@ def get_emb_given_word(self, word):
 
     def get_emb_matrix_given_vocab(self, vocab, emb_matrix):
         counter = 0.
-        for word, index in vocab.iteritems():
+        for word, index in vocab.items():
             try:
                 emb_matrix[index] = self.embeddings[word]
                 counter += 1
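The iteritems() to items() swap is the right one here: Python 3's items() returns a lightweight view, matching the lazy behavior iteritems() had in Python 2, where items() would have built a full list of the whole vocabulary. A small illustration:

```python
# vocab.items() in Python 3 iterates (word, index) pairs without
# materializing a list, just as vocab.iteritems() did in Python 2.
vocab = {'<pad>': 0, '<unk>': 1, '<num>': 2}
for word, index in vocab.items():
    print(word, index)
```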
code/word2vec.py (2 changes: 1 addition & 1 deletion)

@@ -18,7 +18,7 @@ def main(domain):
     model.save(model_file)
 
 
-print 'Pre-training word embeddings ...'
+print('Pre-training word embeddings ...')
 main('restaurant')
 main('beer')
 
requirements.txt (2 changes: 1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 keras==1.2.1
 theano==0.9.0
-numpy==1.13.3
+numpy==1.17.0
 scikit-learn
 gensim==0.12.4
 tqdm
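The numpy bump matches the port: 1.17.0 was the first numpy release to drop Python 2 support entirely, so the pin itself guards against accidentally installing under Python 2. A quick environment check, offered as a sketch rather than part of the change:

```python
# Confirm the interpreter/numpy pairing implied by requirements.txt.
import sys
import numpy

assert sys.version_info[0] == 3, "this codebase now targets Python 3"
print(numpy.__version__)  # expected: 1.17.0 per the pin above
```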