Fix inference so that it runs with RealModality (real-valued targets, e.g. time series problems).

Lukasz Kaiser · Copybara-Service · commit 3704b1f674c0 · 2018-06-14T21:23:55.000-07:00
PiperOrigin-RevId: 200667028
diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py
@@ -1051,7 +1051,7 @@ def decode(self, ids, strip_extraneous=False):
       ValueError: if the ids are not of the appropriate size.
     """
     del strip_extraneous
-    return " ".join(ids)
+    return " ".join([str(i) for i in ids])
 
 
 def strip_ids(ids, ids_to_strip):
diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
@@ -739,6 +739,10 @@ class RealModality(modality.Modality):
   * Top is a linear projection layer to vocab_size.
   """
 
+  @property
+  def top_is_pointwise(self):
+    return True
+
   def bottom(self, x):
     with tf.variable_scope("real"):
       return tf.layers.dense(tf.to_float(x), self._body_input_depth,
diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py
@@ -213,6 +213,9 @@ def _greedy_infer(self, features, decode_length):
     Raises:
       NotImplementedError: If there are multiple data shards.
     """
+    # For real-valued modalities use the slow decode path for now.
+    if self._target_modality_is_real:
+      return  super(Transformer, self)._greedy_infer(features, decode_length)
     with tf.variable_scope(self.name):
       return self._fast_decode(features, decode_length)
 
diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
@@ -141,6 +141,12 @@ def _custom_getter(self):
     else:
       return None
 
+  @property
+  def _target_modality_is_real(self):
+    """Whether the target modality is real-valued."""
+    target_modality = self._problem_hparams.target_modality
+    return target_modality.name.startswith("real_")
+
   def call(self, inputs, **kwargs):
     del kwargs
     features = inputs
@@ -732,7 +738,11 @@ def _slow_greedy_infer(self, features, decode_length):
     def infer_step(recent_output, recent_logits, unused_loss):
       """Inference step."""
       if not tf.contrib.eager.in_eager_mode():
-        recent_output.set_shape([None, None, None, 1])
+        if self._target_modality_is_real:
+          dim = self._problem_hparams.target_modality.top_dimensionality
+          recent_output.set_shape([None, None, None, dim])
+        else:
+          recent_output.set_shape([None, None, None, 1])
       padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]])
       features["targets"] = padded
       # This is inefficient in that it generates samples at all timesteps,
@@ -745,10 +755,14 @@ def infer_step(recent_output, recent_logits, unused_loss):
       else:
         cur_sample = samples[:,
                              common_layers.shape_list(recent_output)[1], :, :]
-      cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1))
-      samples = tf.concat([recent_output, cur_sample], axis=1)
-      if not tf.contrib.eager.in_eager_mode():
-        samples.set_shape([None, None, None, 1])
+      if self._target_modality_is_real:
+        cur_sample = tf.expand_dims(cur_sample, axis=1)
+        samples = tf.concat([recent_output, cur_sample], axis=1)
+      else:
+        cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1))
+        samples = tf.concat([recent_output, cur_sample], axis=1)
+        if not tf.contrib.eager.in_eager_mode():
+          samples.set_shape([None, None, None, 1])
 
       # Assuming we have one shard for logits.
       logits = tf.concat([recent_logits, logits[:, -1:]], 1)
@@ -764,7 +778,11 @@ def infer_step(recent_output, recent_logits, unused_loss):
       batch_size = common_layers.shape_list(initial_output)[0]
     else:
       batch_size = common_layers.shape_list(features["inputs"])[0]
-      initial_output = tf.zeros((batch_size, 0, 1, 1), dtype=tf.int64)
+      if self._target_modality_is_real:
+        dim = self._problem_hparams.target_modality.top_dimensionality
+        initial_output = tf.zeros((batch_size, 0, 1, dim), dtype=tf.float32)
+      else:
+        initial_output = tf.zeros((batch_size, 0, 1, 1), dtype=tf.int64)
     # Hack: foldl complains when the output shape is less specified than the
     # input shape, so we confuse it about the input shape.
     initial_output = tf.slice(initial_output, [0, 0, 0, 0],
@@ -783,10 +801,17 @@ def infer_step(recent_output, recent_logits, unused_loss):
 
     # Initial values of result, logits and loss.
     result = initial_output
-    # tensor of shape [batch_size, time, 1, 1, vocab_size]
-    logits = tf.zeros((batch_size, 0, 1, 1, target_modality.top_dimensionality))
+    if self._target_modality_is_real:
+      logits = tf.zeros((batch_size, 0, 1, target_modality.top_dimensionality))
+      logits_shape_inv = [None, None, None, None]
+    else:
+      # tensor of shape [batch_size, time, 1, 1, vocab_size]
+      logits = tf.zeros((batch_size, 0, 1, 1,
+                         target_modality.top_dimensionality))
+      logits_shape_inv = [None, None, None, None, None]
     if not tf.contrib.eager.in_eager_mode():
-      logits.set_shape([None, None, None, None, None])
+      logits.set_shape(logits_shape_inv)
+
     loss = 0.0
 
     def while_exit_cond(result, logits, loss):  # pylint: disable=unused-argument
@@ -822,7 +847,7 @@ def fn_not_eos():
         infer_step, [result, logits, loss],
         shape_invariants=[
             tf.TensorShape([None, None, None, None]),
-            tf.TensorShape([None, None, None, None, None]),
+            tf.TensorShape(logits_shape_inv),
             tf.TensorShape([]),
         ],
         back_prop=False,
@@ -857,6 +882,8 @@ def sample(self, features):
        losses: a dictionary: {loss-name (string): floating point `Scalar`}.
     """
     logits, losses = self(features)  # pylint: disable=not-callable
+    if self._target_modality_is_real:
+      return logits, logits, losses  # Raw numbers returned from real modality.
     if self.hparams.sampling_method == "argmax":
       samples = tf.argmax(logits, axis=-1)
     else: