diff --git a/key_value_memory/memn2n_kv.py b/key_value_memory/memn2n_kv.py
index 7521f7e..7ea265f 100644
--- a/key_value_memory/memn2n_kv.py
+++ b/key_value_memory/memn2n_kv.py
@@ -33,7 +33,7 @@ def add_gradient_noise(t, stddev=1e-3, name=None):
 
     0.001 was said to be a good fixed value for memory networks [2].
     """
-    with tf.op_scope([t, stddev], name, "add_gradient_noise") as name:
+    with tf.name_scope(name, "add_gradient_noise", [t, stddev]) as name:
         t = tf.convert_to_tensor(t, name="t")
         gn = tf.random_normal(tf.shape(t), stddev=stddev)
         return tf.add(t, gn, name=name)
@@ -44,11 +44,11 @@ def zero_nil_slot(t, name=None):
     The nil_slot is a dummy slot and should not be trained and influence
     the training algorithm.
     """
-    with tf.op_scope([t], name, "zero_nil_slot") as name:
+    with tf.name_scope(name, "zero_nil_slot", [t]) as name:
         t = tf.convert_to_tensor(t, name="t")
         s = tf.shape(t)[1]
-        z = tf.zeros(tf.pack([1, s]))
-        return tf.concat(0, [z, tf.slice(t, [1, 0], [-1, -1])], name=name)
+        z = tf.zeros(tf.stack([1, s]))
+        return tf.concat([z, tf.slice(t, [1, 0], [-1, -1])], 0, name=name)
 
 class MemN2N_KV(object):
     """Key Value Memory Network."""
@@ -120,10 +120,10 @@ def __init__(self, batch_size, vocab_size,
         # Embedding layer
         with tf.device('/cpu:0'), tf.name_scope("embedding"):
             nil_word_slot = tf.zeros([1, embedding_size])
-            self.W = tf.concat(0, [nil_word_slot, tf.get_variable('W', shape=[vocab_size-1, embedding_size],
-                                                                  initializer=tf.contrib.layers.xavier_initializer())])
-            self.W_memory = tf.concat(0, [nil_word_slot, tf.get_variable('W_memory', shape=[vocab_size-1, embedding_size],
-                                                                         initializer=tf.contrib.layers.xavier_initializer())])
+            self.W = tf.concat([nil_word_slot, tf.get_variable('W', shape=[vocab_size-1, embedding_size],
+                                                                  initializer=tf.contrib.layers.xavier_initializer())], 0)
+            self.W_memory = tf.concat([nil_word_slot, tf.get_variable('W_memory', shape=[vocab_size-1, embedding_size],
+                                                                         initializer=tf.contrib.layers.xavier_initializer())], 0)
             # self.W_memory = self.W
             self._nil_vars = set([self.W.name, self.W_memory.name])
             # shape: [batch_size, query_size, embedding_size]
@@ -186,7 +186,7 @@ def __init__(self, batch_size, vocab_size,
             #logits = tf.nn.dropout(tf.matmul(o, self.B) + logits_bias, self.keep_prob)
             probs = tf.nn.softmax(tf.cast(logits, tf.float32))
             
-            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, tf.cast(self._labels, tf.float32), name='cross_entropy')
+            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf.cast(self._labels, tf.float32), name='cross_entropy')
             cross_entropy_sum = tf.reduce_sum(cross_entropy, name="cross_entropy_sum")
 
             # loss op
diff --git a/key_value_memory/single.py b/key_value_memory/single.py
index 739920b..2d674d8 100644
--- a/key_value_memory/single.py
+++ b/key_value_memory/single.py
@@ -4,10 +4,11 @@
 from __future__ import print_function
 
 from data_utils import load_task, vectorize_data
-from sklearn import cross_validation, metrics
+from sklearn import model_selection, metrics
 from memn2n_kv import MemN2N_KV
 from itertools import chain
 from six.moves import range
+from functools import reduce
 
 import tensorflow as tf
 import numpy as np
@@ -44,7 +45,7 @@
 word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
 
 max_story_size = max(map(len, (s for s, _, _ in data)))
-mean_story_size = int(np.mean(map(len, (s for s, _, _ in data))))
+mean_story_size = int(np.mean(list(map(len, (s for s, _, _ in data)))))
 sentence_size = max(map(len, chain.from_iterable(s for s, _, _ in data)))
 query_size = max(map(len, (q for _, q, _ in data)))
 memory_size = min(FLAGS.memory_size, max_story_size)
@@ -57,7 +58,7 @@
 
 # train/validation/test sets
 S, Q, A = vectorize_data(train, word_idx, sentence_size, memory_size)
-trainS, valS, trainQ, valQ, trainA, valA = cross_validation.train_test_split(S, Q, A, test_size=.1)
+trainS, valS, trainQ, valQ, trainA, valA = model_selection.train_test_split(S, Q, A, test_size=.1)
 testS, testQ, testA = vectorize_data(test, word_idx, sentence_size, memory_size)
 
 print("Training set shape", trainS.shape)
@@ -76,7 +77,7 @@
 val_labels = np.argmax(valA, axis=1)
 
 batch_size = FLAGS.batch_size
-batches = zip(range(0, n_train-batch_size, batch_size), range(batch_size, n_train, batch_size))
+batches = list(zip(range(0, n_train-batch_size, batch_size), range(batch_size, n_train, batch_size)))
 
 with tf.Graph().as_default():
     session_conf = tf.ConfigProto(
@@ -110,7 +111,7 @@
                 nil_grads_and_vars.append((g, v))
 
         train_op = optimizer.apply_gradients(nil_grads_and_vars, name="train_op", global_step=global_step)
-        sess.run(tf.initialize_all_variables())
+        sess.run(tf.global_variables_initializer())
 
         def train_step(s, q, a):
             feed_dict = {