Introducing StackWrapper.

blazejosinski · Copybara-Service · commit 36e144600f31 · 2018-08-16T21:16:53.000-07:00
PiperOrigin-RevId: 209098352
diff --git a/tensor2tensor/data_generators/gym_problems.py b/tensor2tensor/data_generators/gym_problems.py
@@ -48,7 +48,7 @@
 
 def standard_atari_env_spec(env):
   """Parameters of environment specification."""
-  standard_wrappers = [[tf_atari_wrappers.StackAndSkipWrapper, {"skip": 4}]]
+  standard_wrappers = [[tf_atari_wrappers.StackWrapper, {"history": 4}]]
   env_lambda = None
   if isinstance(env, str):
     env_lambda = lambda: gym.make(env)
diff --git a/tensor2tensor/rl/envs/tf_atari_wrappers.py b/tensor2tensor/rl/envs/tf_atari_wrappers.py
@@ -151,6 +151,47 @@ def _reset_non_empty(self, indices):
       return tf.gather(self.observ, indices)
 
 
+class StackWrapper(WrapperBase):
+  """Stacks the last `history` observations along the last axis."""
+
+  def __init__(self, batch_env, history=4):
+    super(StackWrapper, self).__init__(batch_env)
+    self.history = history  # Number of observations kept in the stack.
+    self.old_shape = batch_env.observ.shape.as_list()  # Unstacked shape.
+    observs_shape = self.old_shape[:-1] + [self.old_shape[-1] * self.history]
+    observ_dtype = tf.float32  # NOTE(review): assumes float32 observ; confirm.
+    self._observ = tf.Variable(tf.zeros(observs_shape, observ_dtype),
+                               trainable=False)  # Zero-filled stack buffer.
+
+  def simulate(self, action):
+    reward, done = self._batch_env.simulate(action)  # Step the wrapped env.
+    with tf.control_dependencies([reward, done]):  # Read observ post-step.
+      new_observ = self._batch_env.observ + 0  # `+ 0` likely forces a fresh op.
+      old_observ = tf.gather(  # Everything but the oldest observation.
+          self._observ.read_value(),
+          range(self.old_shape[-1], self.old_shape[-1] * self.history),
+          axis=-1)
+      with tf.control_dependencies([new_observ, old_observ]):
+        with tf.control_dependencies([self._observ.assign(  # Newest goes last.
+            tf.concat([old_observ, new_observ], axis=-1))]):
+          return tf.identity(reward), tf.identity(done)  # After the assign.
+
+  def _reset_non_empty(self, indices):
+    # pylint: disable=protected-access
+    new_values = self._batch_env._reset_non_empty(indices)  # Inner reset.
+    # pylint: enable=protected-access
+    inx = tf.concat(  # Tile multiples: 1 per axis, `history` on the last.
+        [
+            tf.ones(tf.size(tf.shape(new_values)), dtype=tf.int32)[:-1],
+            [self.history]
+        ],
+        axis=0)
+    assign_op = tf.scatter_update(self._observ, indices, tf.tile(
+        new_values, inx))  # Each reset row holds its frame `history` times.
+    with tf.control_dependencies([assign_op]):
+      return tf.gather(self.observ, indices)  # Gather runs after the update.
+
+
 class AutoencoderWrapper(WrapperBase):
   """ Transforms the observations taking the bottleneck
       state of an autoencoder"""