This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit ffe2386

nshazeer authored and Ryan Sepassi committed
Added optional memory-efficient versions of conv-hidden-relu and self-attention.
PiperOrigin-RevId: 166915506
1 parent 357c9d4 commit ffe2386
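Both new layers in this commit save memory the same way: the forward computation is wrapped in a function.Defun whose grad_func recomputes the forward activations during backprop instead of keeping them alive. Below is a minimal toy sketch of that pattern, written for illustration only (it is not code from this commit; TF 1.x graph mode is assumed, and the names toy_forward, _toy_forward and _toy_grad are made up):

import tensorflow as tf
from tensorflow.python.framework import function


def _toy_forward(x, w):
  # The hidden activation h is what we want to avoid keeping alive
  # between the forward and backward passes.
  h = tf.nn.relu(tf.matmul(x, w))
  return tf.matmul(h, w, transpose_b=True)


# Gradient function: receives the forward inputs plus the output gradient dy,
# recomputes the forward graph, and differentiates the recomputed copy.
@function.Defun()
def _toy_grad(x, w, dy):
  with tf.control_dependencies([dy]):
    y = _toy_forward(x, w)  # recompute instead of storing activations
    dx, dw = tf.gradients(ys=[y], xs=[x, w], grad_ys=[dy])
  return dx, dw


@function.Defun(grad_func=_toy_grad)
def toy_forward(x, w):
  return _toy_forward(x, w)


x = tf.random_normal([4, 8])
w = tf.get_variable("w", [8, 8])
y = toy_forward(x, w)
# Backprop through the Defun invokes _toy_grad, so only x, w and dy need to
# survive between the forward and backward passes.
dx, dw = tf.gradients(y, [x, w])

The real implementations below additionally pass compiled=True and separate_compiled_gradients=True to Defun, and structure the recomputation head by head (self-attention) or batch-chunk by batch-chunk (conv-hidden-relu) so that only one slice of activations is live at a time.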

File tree

5 files changed: +398 -18 lines changed


tensor2tensor/layers/common_attention.py

Lines changed: 149 additions & 0 deletions
@@ -30,6 +30,8 @@
 
 import tensorflow as tf
 
+from tensorflow.python.framework import function
+
 
 def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
   """Adds a bunch of sinusoids of different frequencies to a Tensor.
@@ -1100,3 +1102,150 @@ def local_expert_attention(
         additional_dispatch_params=additional_dispatch_params,
         pad_remover=pad_remover
     )
+
+
+def scaled_dot_product_attention_simple(q, k, v, bias, name=None):
+  """scaled dot-product attention. One head. One spatial dimension.
+
+  Args:
+    q: a Tensor with shape [batch, length_q, depth_k]
+    k: a Tensor with shape [batch, length_kv, depth_k]
+    v: a Tensor with shape [batch, length_kv, depth_v]
+    bias: optional Tensor broadcastable to [batch, length_q, length_kv]
+    name: an optional string
+
+  Returns:
+    A Tensor.
+  """
+  with tf.variable_scope(
+      name, default_name="scaled_dot_product_attention_simple"):
+    scalar = tf.rsqrt(tf.to_float(tf.shape(q)[2]))
+    logits = tf.matmul(q * scalar, k, transpose_b=True)
+    if bias is not None:
+      logits += bias
+    weights = tf.nn.softmax(logits, name="attention_weights")
+    return tf.matmul(weights, v)
+
+
+_function_cache = {}
+
+
+def multihead_self_attention_memory_efficient(x,
+                                              bias,
+                                              num_heads,
+                                              head_size=None,
+                                              epsilon=1e-6,
+                                              forget=True,
+                                              test_vars=None,
+                                              name=None):
+  """Multihead scaled-dot-product self-attention.
+
+  Includes layer norm.
+
+  Returns multihead-self-attention(layer_norm(x))
+
+  Computes one attention head at a time to avoid exhausting memory.
+
+  If forget=True, then forget all forwards activations and recompute on
+  the backwards pass.
+
+  Args:
+    x: a Tensor with shape [batch, length, input_size]
+    bias: an attention bias tensor broadcastable to [batch, 1, length, length]
+    num_heads: an integer
+    head_size: an optional integer - defaults to input_size/num_heads
+    epsilon: a float, for layer norm
+    forget: a boolean - forget forwards activations and recompute on backprop
+    test_vars: optional tuple of variables for testing purposes
+    name: an optional string
+
+  Returns:
+    A Tensor.
+  """
+  io_size = x.get_shape().as_list()[-1]
+  if head_size is None:
+    assert io_size % num_heads == 0
+    head_size = io_size / num_heads
+
+  def forward_internal(x, wqkv, wo, attention_bias, norm_scale, norm_bias):
+    """Forward function."""
+    n = common_layers.layer_norm_compute_python(
+        x, epsilon, norm_scale, norm_bias)
+    wqkv_split = tf.unstack(wqkv, num=num_heads)
+    wo_split = tf.unstack(wo, num=num_heads)
+    y = 0
+    for h in xrange(num_heads):
+      with tf.control_dependencies([y] if h > 0 else []):
+        combined = tf.nn.conv1d(n, wqkv_split[h], 1, "SAME")
+        q, k, v = tf.split(combined, 3, axis=2)
+        o = scaled_dot_product_attention_simple(q, k, v, attention_bias)
+        y += tf.nn.conv1d(o, wo_split[h], 1, "SAME")
+    return y
+
+  key = ("multihead_self_attention_memory_efficient %s %s" %
+         (num_heads, epsilon))
+  if not forget:
+    forward_fn = forward_internal
+  elif key in _function_cache:
+    forward_fn = _function_cache[key]
+  else:
+    @function.Defun(compiled=True)
+    def grad_fn(x, wqkv, wo, attention_bias, norm_scale, norm_bias, dy):
+      with tf.control_dependencies([dy]):
+        n = common_layers.layer_norm_compute_python(
+            x, epsilon, norm_scale, norm_bias)
+        wqkv_split = tf.unstack(wqkv, num=num_heads)
+        wo_split = tf.unstack(wo, num=num_heads)
+        deps = []
+        dwqkvs = []
+        dwos = []
+        dn = 0
+        for h in xrange(num_heads):
+          with tf.control_dependencies(deps):
+            combined = tf.nn.conv1d(n, wqkv_split[h], 1, "SAME")
+            q, k, v = tf.split(combined, 3, axis=2)
+            o = scaled_dot_product_attention_simple(q, k, v, attention_bias)
+            partial_y = tf.nn.conv1d(o, wo_split[h], 1, "SAME")
+            pdn, dwqkvh, dwoh = tf.gradients(
+                ys=[partial_y],
+                xs=[n, wqkv_split[h], wo_split[h]],
+                grad_ys=[dy])
+            dn += pdn
+            dwqkvs.append(dwqkvh)
+            dwos.append(dwoh)
+            deps = [dn, dwqkvh, dwoh]
+        dwqkv = tf.stack(dwqkvs)
+        dwo = tf.stack(dwos)
+        with tf.control_dependencies(deps):
+          dx, dnorm_scale, dnorm_bias = tf.gradients(
+              ys=[n], xs=[x, norm_scale, norm_bias], grad_ys=[dn])
+        return (dx, dwqkv, dwo, tf.zeros_like(attention_bias),
+                dnorm_scale, dnorm_bias)
+
+    @function.Defun(grad_func=grad_fn, compiled=True,
+                    separate_compiled_gradients=True)
+    def forward_fn(x, wqkv, wo, attention_bias, norm_scale, norm_bias):
+      return forward_internal(
+          x, wqkv, wo, attention_bias, norm_scale, norm_bias)
+    _function_cache[key] = forward_fn
+
+  if bias is not None:
+    bias = tf.squeeze(bias, 1)
+  with tf.variable_scope(name, default_name="multihead_attention", values=[x]):
+    # TODO(noam): it would be nice to save memory by casting x to float16
+    # here, but this causes problems with the gradients. Figure out if there
+    # is a way to leave the gradients as float32.
+    if test_vars is not None:
+      wqkv, wo, norm_scale, norm_bias = list(test_vars)
+    else:
+      wqkv = tf.get_variable(
+          "wqkv", [num_heads, 1, io_size, 3 * head_size],
+          initializer=tf.random_normal_initializer(stddev=io_size**-0.5))
+      wo = tf.get_variable(
+          "wo", [num_heads, 1, head_size, io_size],
+          initializer=tf.random_normal_initializer(
+              stddev=(head_size * num_heads)**-0.5))
+      norm_scale, norm_bias = common_layers.layer_norm_vars(io_size)
+    y = forward_fn(x, wqkv, wo, bias, norm_scale, norm_bias)
+    y.set_shape(x.get_shape())
+    return y
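For reference, an illustrative usage sketch of the new attention layer (not part of the commit; shapes and the name argument are arbitrary, TF 1.x graph mode assumed):

import tensorflow as tf
from tensor2tensor.layers import common_attention

# Illustrative only: batch/length/io_size/num_heads are arbitrary choices.
batch, length, io_size, num_heads = 2, 64, 128, 8
x = tf.random_normal([batch, length, io_size])
# Causal mask, broadcastable to [batch, 1, length, length].
bias = common_attention.attention_bias_lower_triangle(length)
# forget=True discards forward activations and recomputes them head by head
# on the backward pass via the Defun-wrapped forward_fn defined above.
y = common_attention.multihead_self_attention_memory_efficient(
    x, bias, num_heads, forget=True, name="self_attention")
dx = tf.gradients(tf.reduce_sum(y), x)[0]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  y_val, dx_val = sess.run([y, dx])
  print(y_val.shape, dx_val.shape)  # both are (2, 64, 128)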

tensor2tensor/layers/common_attention_test.py

Lines changed: 44 additions & 0 deletions
@@ -23,6 +23,7 @@
 
 import numpy as np
 from tensor2tensor.layers import common_attention
+from tensor2tensor.layers import common_layers
 
 import tensorflow as tf
 
@@ -117,6 +118,49 @@ def testLocalUnmaskedAttention2DMatchingBlockLength(self):
       res = session.run(a)
     self.assertEqual(res.shape, (5, 4, 25, 25, 16))
 
+  def testMultiheadSelfAttentionMemoryEfficient(self):
+    num_heads = 4
+    io_size = 16
+    batch = 2
+    length = 7
+    head_size = 5
+    x = np.random.rand(batch, length, io_size)
+    dy = np.random.rand(batch, length, io_size)
+    with self.test_session() as session:
+      x = tf.to_float(x)
+      dy = tf.to_float(dy)
+      bias = common_attention.attention_bias_lower_triangle(length)
+      wqkv = tf.get_variable(
+          "wqkv", [num_heads, 1, io_size, 3 * head_size],
+          initializer=tf.random_normal_initializer(stddev=io_size**-0.5))
+      wo = tf.get_variable(
+          "wo", [num_heads, 1, head_size, io_size],
+          initializer=tf.random_normal_initializer(
+              stddev=(head_size * num_heads)**-0.5))
+      norm_scale, norm_bias = common_layers.layer_norm_vars(io_size)
+      y = common_attention.multihead_self_attention_memory_efficient(
+          x, bias, num_heads, head_size=head_size, forget=False,
+          test_vars=(wqkv, wo, norm_scale, norm_bias))
+      y_forget = common_attention.multihead_self_attention_memory_efficient(
+          x, bias, num_heads, head_size=head_size, forget=True,
+          test_vars=(wqkv, wo, norm_scale, norm_bias))
+      dx, dwqkv, dwo, dnorm_scale, dnorm_bias = tf.gradients(
+          ys=[y], xs=[x, wqkv, wo, norm_scale, norm_bias], grad_ys=[dy])
+      dx_f, dwqkv_f, dwo_f, dnorm_scale_f, dnorm_bias_f = tf.gradients(
+          ys=[y_forget], xs=[x, wqkv, wo, norm_scale, norm_bias], grad_ys=[dy])
+      session.run(tf.global_variables_initializer())
+      (y, y_forget,
+       dx, dwqkv, dwo, dnorm_scale, dnorm_bias,
+       dx_f, dwqkv_f, dwo_f, dnorm_scale_f, dnorm_bias_f) = session.run(
+           [y, y_forget,
+            dx, dwqkv, dwo, dnorm_scale, dnorm_bias,
+            dx_f, dwqkv_f, dwo_f, dnorm_scale_f, dnorm_bias_f])
+    self.assertAllClose(y, y_forget)
+    self.assertAllClose(dwo, dwo_f)
+    self.assertAllClose(dwqkv, dwqkv_f)
+    self.assertAllClose(dnorm_scale, dnorm_scale_f)
+    self.assertAllClose(dnorm_bias, dnorm_bias_f)
+    self.assertAllClose(dx, dx_f)
 
 
 if __name__ == "__main__":
   tf.test.main()

tensor2tensor/layers/common_layers.py

Lines changed: 121 additions & 2 deletions
@@ -425,6 +425,15 @@ def conv_fn(inputs, filters, kernel_size, **kwargs):
   return conv_internal(conv_fn, inputs, filters, kernel_size, **kwargs)
 
 
+def layer_norm_vars(filters):
+  """Create Variables for layer norm."""
+  scale = tf.get_variable(
+      "layer_norm_scale", [filters], initializer=tf.ones_initializer())
+  bias = tf.get_variable(
+      "layer_norm_bias", [filters], initializer=tf.zeros_initializer())
+  return scale, bias
+
+
 def layer_norm_compute_python(x, epsilon, scale, bias):
   """Layer norm raw computation."""
   mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
@@ -1773,7 +1782,7 @@ def smoothing_cross_entropy_factored_grad(op, dy):
   b = op.inputs[1]
   labels = op.inputs[2]
   confidence = op.inputs[3]
-  num_splits = 32
+  num_splits = 16
   vocab_size = tf.shape(b)[0]
   labels = approximate_split(labels, num_splits)
   a = approximate_split(a, num_splits)
@@ -1817,7 +1826,7 @@ def smoothing_cross_entropy_factored(a, b, labels, confidence):
   Returns:
     A Tensor with shape [batch]
   """
-  num_splits = 32
+  num_splits = 16
   vocab_size = tf.shape(b)[0]
   labels = approximate_split(labels, num_splits)
   a = approximate_split(a, num_splits)
@@ -1957,3 +1966,113 @@ def identity(*args):
 
   id_out = identity(*(inputs + train_vars + outputs))
   return id_out
+
+
+_function_cache = {}
+
+
+def conv_hidden_relu_memory_efficient(x,
+                                      filter_size,
+                                      epsilon=1e-6,
+                                      forget=True,
+                                      test_vars=None,
+                                      name=None):
+  """LayerNorm, Conv, ReLU, Conv.
+
+  All convolutions have kernel size 1.
+
+  returns conv(relu(conv(layer_norm(x))))
+
+  Args:
+    x: input Tensor with shape [batch, length, io_size]
+    filter_size: an integer - size of the hidden layer.
+    epsilon: a float (for layer norm)
+    forget: a boolean - forget forwards activations and recompute on backprop
+    test_vars: optional tuple of variables for testing purposes
+    name: an optional string
+
+  Returns:
+    a Tensor with shape [batch, length, io_size]
+  """
+  io_size = x.get_shape().as_list()[-1]
+
+  def forward_internal(x, f1, f2, scale, bias):
+    """Forward function."""
+    # split batch-wise to avoid exhausting memory in case the batch is large
+    # and the hidden layer is large.
+    num_splits = 4
+    x_flat = tf.reshape(x, [-1, 1, tf.shape(x)[2]])
+    xs = approximate_split(x_flat, num_splits)
+    ys = []
+    for i in xrange(num_splits):
+      with tf.control_dependencies(ys[-1:]):
+        n = layer_norm_compute_python(xs[i], epsilon, scale, bias)
+        y = tf.nn.conv1d(n, f1, 1, "SAME")
+        y = tf.nn.relu(y)
+        y = tf.nn.conv1d(y, f2, 1, "SAME")
+        ys.append(y)
+    y = tf.concat(ys, 0)
+    y = tf.reshape(y, tf.shape(x))
+    return y
+  key = ("conv_hidden_relu_memory_efficient %s" % epsilon)
+  if not forget:
+    forward_fn = forward_internal
+  elif key in _function_cache:
+    forward_fn = _function_cache[key]
+  else:
+    @function.Defun(compiled=True)
+    def grad_fn(x, f1, f2, scale, bias, dy):
+      with tf.control_dependencies([dy]):
+        num_splits = 4
+        x_shape = tf.shape(x)
+        flat_shape = [-1, 1, x_shape[2]]
+        x = tf.reshape(x, flat_shape)
+        dy = tf.reshape(dy, flat_shape)
+        xs = approximate_split(x, num_splits)
+        dys = approximate_split(dy, num_splits)
+        dxs = []
+        df1 = 0
+        df2 = 0
+        dscale = 0
+        dbias = 0
+        deps = []
+        for i in xrange(num_splits):
+          with tf.control_dependencies(deps):
+            n = layer_norm_compute_python(xs[i], epsilon, scale, bias)
+            y = tf.nn.conv1d(n, f1, 1, "SAME")
+            y = tf.nn.relu(y)
+            y = tf.nn.conv1d(y, f2, 1, "SAME")
+            dxi, pdf1, pdf2, pdscale, pdbias = tf.gradients(
+                ys=[y], xs=[xs[i], f1, f2, scale, bias], grad_ys=[dys[i]])
+            df1 += pdf1
+            df2 += pdf2
+            dscale += pdscale
+            dbias += pdbias
+            dxs.append(dxi)
+            deps = [dxi, df1, df2, dscale, dbias]
+        with tf.control_dependencies(deps):
+          dx = tf.concat(dxs, 0)
+          dx = tf.reshape(dx, x_shape)
+          return dx, df1, df2, dscale, dbias
+
+    @function.Defun(grad_func=grad_fn, compiled=True,
+                    separate_compiled_gradients=True)
+    def forward_fn(x, f1, f2, scale, bias):
+      return forward_internal(x, f1, f2, scale, bias)
+
+  with tf.variable_scope(name, default_name="ffn2", values=[x]):
+    # TODO(noam): it would be nice to save memory by casting x to float16
+    # here, but this causes problems with the gradients. Figure out if there
+    # is a way to leave the gradients as float32.
+    if test_vars is not None:
+      f1, f2, scale, bias = list(test_vars)
+    else:
+      f1 = tf.get_variable("f1", [1, io_size, filter_size])
+      f2 = tf.get_variable("f2", [1, filter_size, io_size])
+      scale, bias = layer_norm_vars(io_size)
+    if forget:
+      y = forward_fn(x, f1, f2, scale, bias)
+    else:
+      y = forward_internal(x, f1, f2, scale, bias)
+    y.set_shape(x.get_shape())
+    return y
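Likewise, an illustrative usage sketch for the new feed-forward layer (not part of the commit; sizes and the name argument are arbitrary, TF 1.x graph mode assumed):

import tensorflow as tf
from tensor2tensor.layers import common_layers

# Illustrative only: batch/length/io_size/filter_size are arbitrary choices.
batch, length, io_size, filter_size = 2, 64, 128, 512
x = tf.random_normal([batch, length, io_size])
# Computes conv(relu(conv(layer_norm(x)))) with kernel-size-1 convolutions;
# forget=True splits the flattened batch into chunks and recomputes the
# hidden activations chunk by chunk during backprop.
y = common_layers.conv_hidden_relu_memory_efficient(
    x, filter_size, forget=True, name="memory_efficient_ffn")
dx = tf.gradients(tf.reduce_sum(y), x)[0]

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  y_val, dx_val = sess.run([y, dx])
  print(y_val.shape, dx_val.shape)  # both are (2, 64, 128)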

0 commit comments
