This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 4af78a7

T2T Team authored and Ryan Sepassi committed
s/keep_dims/keepdims since keep_dims in tf.reduce_mean is deprecated.
PiperOrigin-RevId: 197200583
1 parent 2b2b46d commit 4af78a7
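
For context on the rename: `keepdims=True` retains each reduced axis as size 1, so the result still broadcasts against the input; only the argument name changed, not the behavior. A minimal sketch of the difference (assumes TF 2.x, where only `keepdims` is accepted):

import tensorflow as tf

x = tf.random.uniform([4, 32, 32, 3])                    # [batch, h, w, c]
mean = tf.reduce_mean(x, axis=[1, 2, 3], keepdims=True)  # shape [4, 1, 1, 1]
flat = tf.reduce_mean(x, axis=[1, 2, 3])                 # shape [4]
centered = x - mean  # broadcasts; `x - flat` would raise a shape error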

7 files changed: +17 −17 lines changed


tensor2tensor/layers/common_layers.py

Lines changed: 8 additions & 8 deletions
@@ -228,9 +228,9 @@ def standardize_images(x):
   """Image standardization on batches."""
   with tf.name_scope("standardize_images", [x]):
     x = tf.to_float(x)
-    x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keep_dims=True)
+    x_mean = tf.reduce_mean(x, axis=[1, 2, 3], keepdims=True)
     x_variance = tf.reduce_mean(
-        tf.square(x - x_mean), axis=[1, 2, 3], keep_dims=True)
+        tf.square(x - x_mean), axis=[1, 2, 3], keepdims=True)
     x_shape = shape_list(x)
     num_pixels = tf.to_float(x_shape[1] * x_shape[2] * x_shape[3])
     x = (x - x_mean) / tf.maximum(tf.sqrt(x_variance), tf.rsqrt(num_pixels))
@@ -604,8 +604,8 @@ def layer_norm_vars(filters):
 def layer_norm_compute_python(x, epsilon, scale, bias):
   """Layer norm raw computation."""
   epsilon, scale, bias = [tf.cast(t, x.dtype) for t in [epsilon, scale, bias]]
-  mean = tf.reduce_mean(x, axis=[-1], keep_dims=True)
-  variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keep_dims=True)
+  mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
+  variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
   norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
   return norm_x * scale + bias

@@ -1289,7 +1289,7 @@ def mask_from_embedding(emb):
   Returns:
     a 0.0/1.0 Tensor with shape [batch, width, height, 1].
   """
-  return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keep_dims=True))
+  return weights_nonzero(tf.reduce_sum(tf.abs(emb), axis=3, keepdims=True))


 def mask_leq(target_length, source_length):
@@ -1913,7 +1913,7 @@ def global_pool_1d(inputs, pooling_type="MAX", mask=None):
       if mask is not None:
         # Some elems are dummy elems so we can't just reduce the average.
         output = tf.reduce_sum(inputs, axis=1)
-        num_elems = tf.reduce_sum(mask, axis=1, keep_dims=True)
+        num_elems = tf.reduce_sum(mask, axis=1, keepdims=True)
         output = tf.div(output, tf.maximum(num_elems, 1))
       else:
         output = tf.reduce_mean(inputs, axis=1)
@@ -2977,7 +2977,7 @@ def argmax_with_score(logits, axis=None):


 def log_prob_from_logits(logits, reduce_axis=-1):
-  return logits - tf.reduce_logsumexp(logits, axis=reduce_axis, keep_dims=True)
+  return logits - tf.reduce_logsumexp(logits, axis=reduce_axis, keepdims=True)


 def top_1_tpu(inputs):
@@ -2992,7 +2992,7 @@ def top_1_tpu(inputs):
     values: a Tensor with shape [...]
     indices: a Tensor with shape [...]
   """
-  inputs_max = tf.reduce_max(inputs, axis=-1, keep_dims=True)
+  inputs_max = tf.reduce_max(inputs, axis=-1, keepdims=True)
   mask = tf.to_int32(tf.equal(inputs_max, inputs))
   index = tf.range(tf.shape(inputs)[-1]) * mask
   return tf.squeeze(inputs_max, -1), tf.reduce_max(index, axis=-1)
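
The layer-norm reductions above are a good illustration of why keepdims matters: mean and variance come out shaped [..., 1] and broadcast back against x. A standalone sketch of the same computation (TF 2.x; omits the trained scale/bias variables the original pulls in via layer_norm_vars):

import tensorflow as tf

def layer_norm_sketch(x, epsilon=1e-6):
  # keepdims keeps the last axis as size 1, so both stats broadcast against x.
  mean = tf.reduce_mean(x, axis=[-1], keepdims=True)
  variance = tf.reduce_mean(tf.square(x - mean), axis=[-1], keepdims=True)
  return (x - mean) * tf.math.rsqrt(variance + epsilon)

x = tf.random.normal([2, 5, 8])
y = layer_norm_sketch(x)
print(tf.reduce_mean(y, axis=-1))  # approximately 0 at every position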

tensor2tensor/models/research/basic_conv_gen.py

Lines changed: 1 addition & 1 deletion
@@ -113,7 +113,7 @@ def body(self, features):
     x = x[:, :inputs_shape[1], :inputs_shape[2], :]

     # Reward prediction.
-    reward_pred = tf.reduce_mean(x, axis=[1, 2], keep_dims=True)
+    reward_pred = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
     return {"targets": x, "target_reward": reward_pred}

   def infer(self, features, *args, **kwargs):
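
Here keepdims keeps the pooled reward prediction rank-4 ([batch, 1, 1, channels] rather than [batch, channels]), presumably matching the 4-D layout of the frame tensor it is returned alongside. A quick shape check (TF 2.x, illustrative shapes only):

import tensorflow as tf

x = tf.random.normal([8, 16, 16, 256])  # [batch, h, w, channels]
reward_pred = tf.reduce_mean(x, axis=[1, 2], keepdims=True)
print(reward_pred.shape)                # (8, 1, 1, 256)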

tensor2tensor/models/research/transformer_vae.py

Lines changed: 2 additions & 2 deletions
@@ -92,9 +92,9 @@ def top_k_softmax(x, k):
   """Calculate softmax(x), select top-k and rescale to sum to 1."""
   x = tf.nn.softmax(x)
   top_x, _ = tf.nn.top_k(x, k=k+1)
-  min_top = tf.reduce_min(top_x, axis=-1, keep_dims=True)
+  min_top = tf.reduce_min(top_x, axis=-1, keepdims=True)
   x = tf.nn.relu((x - min_top) + 1e-12)
-  x /= tf.reduce_sum(x, axis=-1, keep_dims=True)
+  x /= tf.reduce_sum(x, axis=-1, keepdims=True)
   return x, tf.reduce_max(top_x, axis=-1)

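The two keepdims reductions in top_k_softmax let min_top and the normalizer broadcast along the last axis. A self-contained TF 2.x version of the same routine, for experimentation:

import tensorflow as tf

def top_k_softmax(x, k):
  """softmax(x), zero everything below the top-k, renormalize to sum to 1."""
  x = tf.nn.softmax(x)
  top_x, _ = tf.nn.top_k(x, k=k + 1)
  min_top = tf.reduce_min(top_x, axis=-1, keepdims=True)  # the (k+1)-th value
  x = tf.nn.relu((x - min_top) + 1e-12)                   # keep only the top-k
  x /= tf.reduce_sum(x, axis=-1, keepdims=True)           # rescale to sum to 1
  return x, tf.reduce_max(top_x, axis=-1)

probs, top1 = top_k_softmax(tf.constant([[2.0, 1.0, 0.1, -1.0]]), k=2)
print(probs)  # essentially all mass on the top-2 logits, summing to 1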

tensor2tensor/utils/adafactor.py

Lines changed: 1 addition & 1 deletion
@@ -237,7 +237,7 @@ def _resource_apply_dense(self, grad, var):
       vr_update = tf.assign(vr, new_vr, use_locking=self._use_locking)
       vc_update = tf.assign(vc, new_vc, use_locking=self._use_locking)
       updates = [vr_update, vc_update]
-      long_term_mean = tf.reduce_mean(new_vr, -1, keep_dims=True)
+      long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
       r_factor = tf.rsqrt(new_vr / long_term_mean)
       c_factor = tf.rsqrt(new_vc)
       x = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)
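
long_term_mean is the mean of the row statistics, and keepdims keeps it shaped [..., 1] so new_vr / long_term_mean stays elementwise. A rough sketch of how the factored second moment rescales a gradient (hand-rolled, TF 2.x, skipping the optimizer's decay and state; names mirror the snippet above):

import tensorflow as tf

grad = tf.random.normal([6, 4])
grad_squared = tf.square(grad)
new_vr = tf.reduce_mean(grad_squared, axis=-1)       # row stats, shape [6]
new_vc = tf.reduce_mean(grad_squared, axis=-2)       # column stats, shape [4]
long_term_mean = tf.reduce_mean(new_vr, -1, keepdims=True)
r_factor = tf.math.rsqrt(new_vr / long_term_mean)
c_factor = tf.math.rsqrt(new_vc)
x = grad * tf.expand_dims(r_factor, -1) * tf.expand_dims(c_factor, -2)
print(x.shape)  # (6, 4): gradient rescaled by row and column factors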

tensor2tensor/utils/diet.py

Lines changed: 2 additions & 2 deletions
@@ -192,10 +192,10 @@ def update_variable(self, var, grad_var):
     beta2_pow = tf.pow(params.beta2, global_step)
     if params.factored_second_moment_accumulator and len(var.shape) == 2:
       vr_update = tf.assign(slots["adam_vr"], slots["adam_vr"] * params.beta2 +
-                            tf.reduce_mean(grad_squared, 1, keep_dims=True) *
+                            tf.reduce_mean(grad_squared, 1, keepdims=True) *
                             (1.0 - params.beta2))
       vc_update = tf.assign(slots["adam_vc"], slots["adam_vc"] * params.beta2 +
-                            tf.reduce_mean(grad_squared, 0, keep_dims=True) *
+                            tf.reduce_mean(grad_squared, 0, keepdims=True) *
                             (1.0 - params.beta2))
       with tf.control_dependencies([vr_update, vc_update]):
         vr = tf.sqrt(slots["adam_vr"] / (1.0 - beta2_pow)) + params.epsilon
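
Same factored pattern as Adafactor above: the two keepdims reductions give a row accumulator of shape [rows, 1] and a column accumulator of shape [1, cols], and beta2_pow supplies the usual Adam bias correction. A toy version of one accumulator step (TF 2.x, made-up hyperparameters rather than diet.py's params object):

import tensorflow as tf

beta2, epsilon, step = 0.999, 1e-8, 10.0
grad = tf.random.normal([6, 4])
grad_squared = tf.square(grad)
adam_vr = tf.zeros([6, 1])
adam_vc = tf.zeros([1, 4])
# One decayed update of each factored accumulator.
adam_vr = (adam_vr * beta2 +
           tf.reduce_mean(grad_squared, 1, keepdims=True) * (1.0 - beta2))
adam_vc = (adam_vc * beta2 +
           tf.reduce_mean(grad_squared, 0, keepdims=True) * (1.0 - beta2))
beta2_pow = tf.pow(beta2, step)
vr = tf.sqrt(adam_vr / (1.0 - beta2_pow)) + epsilon  # bias-corrected, [6, 1]
vc = tf.sqrt(adam_vc / (1.0 - beta2_pow)) + epsilon  # bias-corrected, [1, 4]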

tensor2tensor/utils/expert_utils.py

Lines changed: 2 additions & 2 deletions
@@ -1257,7 +1257,7 @@ def local_moe_tpu(inputs,
   mask_1 *= tf.to_float(tf.less(position_in_expert_1, expert_capacity_f))
   # [batch, 1, num_experts]
   # How many examples in this sequence go to this expert
-  mask_1_count = tf.reduce_sum(mask_1, axis=1, keep_dims=True)
+  mask_1_count = tf.reduce_sum(mask_1, axis=1, keepdims=True)
   # [batch, length] - mostly ones, but zeros where something didn't fit
   mask_1_flat = tf.reduce_sum(mask_1, axis=2)
   position_in_expert_1 = tf.reduce_sum(position_in_expert_1, axis=2)
@@ -1284,7 +1284,7 @@ def local_moe_tpu(inputs,
       common_layers.cumsum(mask_2, axis=1, exclusive=True) + mask_1_count)
   position_in_expert_2 *= mask_2
   mask_2 *= tf.to_float(tf.less(position_in_expert_2, expert_capacity_f))
-  mask_2_count = tf.reduce_sum(mask_2, axis=1, keep_dims=True)
+  mask_2_count = tf.reduce_sum(mask_2, axis=1, keepdims=True)
   mask_2_flat = tf.reduce_sum(mask_2, axis=2)
   position_in_expert_2 = tf.reduce_sum(position_in_expert_2, axis=2)
   gate_2 *= mask_2_flat
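
In local_moe_tpu, the keepdims count is what lets the second-choice expert's positions start where the first left off: position_in_expert_2 begins at mask_1_count. A toy illustration of the position-in-expert bookkeeping for the first choice (TF 2.x; uses tf.cumsum where the original uses common_layers.cumsum):

import tensorflow as tf

# mask_1: [batch, length, num_experts], one-hot expert choice per position.
mask_1 = tf.constant([[[1., 0.], [1., 0.], [0., 1.], [1., 0.]]])
expert_capacity_f = 2.0

# Running index of each example within its chosen expert.
position_in_expert_1 = tf.cumsum(mask_1, axis=1, exclusive=True) * mask_1
# Drop anything past the expert's capacity.
mask_1 *= tf.cast(tf.less(position_in_expert_1, expert_capacity_f), tf.float32)
# [batch, 1, num_experts]: how many examples went to each expert.
mask_1_count = tf.reduce_sum(mask_1, axis=1, keepdims=True)
print(mask_1_count)  # [[[2., 1.]]]: the third token sent to expert 0 overflowed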

tensor2tensor/utils/quantization.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def simulated_quantize(x, num_bits, noise):
   shape = x.get_shape().as_list()
   if not (len(shape) >= 2 and shape[-1] > 1):
     return x
-  max_abs = tf.reduce_max(tf.abs(x), -1, keep_dims=True) + 1e-9
+  max_abs = tf.reduce_max(tf.abs(x), -1, keepdims=True) + 1e-9
   max_int = 2 ** (num_bits - 1) - 1
   scale = max_abs / max_int
   x /= scale
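
max_abs with keepdims gives a per-row scale of shape [..., 1], so every element of a row is divided by the same scale. A stripped-down sketch of the symmetric quantize/dequantize round trip (TF 2.x, deterministic rounding; the original uses its `noise` argument for stochastic rounding):

import tensorflow as tf

def simulated_quantize_sketch(x, num_bits):
  max_abs = tf.reduce_max(tf.abs(x), -1, keepdims=True) + 1e-9
  max_int = 2 ** (num_bits - 1) - 1
  scale = max_abs / max_int  # one scale per row, broadcasts over it
  q = tf.round(x / scale)    # integers in [-max_int, max_int]
  return q * scale           # dequantize back to float

x = tf.constant([[0.03, -1.2, 0.77], [2.0, -0.5, 0.01]])
print(simulated_quantize_sketch(x, num_bits=8))  # close to x, on an 8-bit grid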
