This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 5ac81b4

Merge pull request #720 from deepsense-ai/master
Attempt to fit GymDiscreteProblemWithAgent into the GymDiscreteProblem interface
2 parents c669cda + ef5bd6e commit 5ac81b4

File tree: 6 files changed, +115 −24 lines

  tensor2tensor/data_generators/gym.py
  tensor2tensor/data_generators/video_utils.py
  tensor2tensor/models/research/basic_conv_gen.py
  tensor2tensor/rl/envs/utils.py
  tensor2tensor/rl/model_rl_experiment.py
  tensor2tensor/utils/t2t_model.py

tensor2tensor/data_generators/gym.py

Lines changed: 86 additions & 1 deletion
@@ -182,6 +182,91 @@ def moviepy_editor():
     raise ImportError("pip install moviepy to record videos")
   return editor
 
+@registry.register_problem
+class GymDiscreteProblemWithAgent2(GymDiscreteProblem):
+  """Gym environment with discrete actions and rewards."""
+
+  def __init__(self, *args, **kwargs):
+    super(GymDiscreteProblemWithAgent2, self).__init__(*args, **kwargs)
+    self._env = None
+
+  @property
+  def extra_reading_spec(self):
+    """Additional data fields to store on disk and their decoders."""
+    data_fields = {
+        "action": tf.FixedLenFeature([1], tf.int64),
+        "reward": tf.FixedLenFeature([1], tf.int64)
+    }
+    decoders = {
+        "action": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="action"),
+        "reward": tf.contrib.slim.tfexample_decoder.Tensor(tensor_key="reward"),
+    }
+    return data_fields, decoders
+
+  @property
+  def num_input_frames(self):
+    """Number of frames to batch on one input."""
+    return 4
+
+  @property
+  def env_name(self):
+    """This is the name of the Gym environment for this problem."""
+    return "PongDeterministic-v4"
+
+  @property
+  def num_actions(self):
+    return self.env.action_space.n
+
+  @property
+  def num_rewards(self):
+    return 3
+
+  @property
+  def num_steps(self):
+    return 200
+
+  @property
+  def frame_height(self):
+    return 210
+
+  @property
+  def frame_width(self):
+    return 160
+
+  @property
+  def min_reward(self):
+    return -1
+
+  def get_action(self, observation=None):
+    return self.env.action_space.sample()
+
+  def hparams(self, defaults, unused_model_hparams):
+    p = defaults
+    p.input_modality = {"inputs": ("video", 256),
+                        "input_reward": ("symbol", self.num_rewards),
+                        "input_action": ("symbol", self.num_actions)}
+    # p.input_modality = {"inputs": ("video", 256),
+    #                     "reward": ("symbol", self.num_rewards),
+    #                     "input_action": ("symbol", self.num_actions)}
+    # p.target_modality = ("video", 256)
+    p.target_modality = {"targets": ("video", 256),
+                         "target_reward": ("symbol", self.num_rewards)}
+    # p.target_modality = {"targets": ("image", 256),
+    #                      "reward": ("symbol", self.num_rewards + 1)}  # ("video", 256)
+    p.input_space_id = problem.SpaceID.IMAGE
+    p.target_space_id = problem.SpaceID.IMAGE
+
+  def generate_samples(self, data_dir, tmp_dir, unused_dataset_split):
+    self.env.reset()
+    action = self.get_action()
+    for _ in range(self.num_steps):
+      observation, reward, done, _ = self.env.step(action)
+      action = self.get_action(observation)
+      yield {"frame": observation,
+             "action": [action],
+             "done": [done],
+             "reward": [int(reward - self.min_reward)]}
+
 
 @registry.register_problem
 class GymDiscreteProblemWithAgent(problem.Problem):

@@ -197,7 +282,7 @@ def __init__(self, *args, **kwargs):
     self.in_graph_wrappers = [(atari.MaxAndSkipWrapper, {"skip": 4})]
     self.collect_hparams = rl.atari_base()
     self.num_steps = 1000
-    self.movies = False
+    self.movies = True
     self.movies_fps = 24
     self.simulated_environment = None
     self.warm_up = 70
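For context, a minimal usage sketch (not part of this commit) of how the newly registered problem could be exercised on its own. The name "gym_discrete_problem_with_agent2" is the snake_case registration name derived from the class name and matches the name set in tensor2tensor/rl/model_rl_experiment.py below; the directories are placeholder paths.

from tensor2tensor import problems

# Assumes a working tensor2tensor + gym installation; paths are placeholders.
gym_problem = problems.problem("gym_discrete_problem_with_agent2")
# Plays PongDeterministic-v4 with random actions for num_steps (200) steps and
# writes TFRecords with the "frame", "action", "done" and "reward" fields
# yielded by generate_samples above.
gym_problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp")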

tensor2tensor/data_generators/video_utils.py

Lines changed: 2 additions & 2 deletions
@@ -157,7 +157,7 @@ def features_from_batch(batched_prefeatures):
       Features dictionary with joint features per-frame.
     """
     features = {}
-    for k, v in batched_prefeatures.iteritems():
+    for k, v in batched_prefeatures.items():
      if k == "frame":  # We rename past frames to inputs and targets.
        s1, s2 = split_on_batch(v)
        # Reshape just to make sure shapes are right and set.

@@ -242,7 +242,7 @@ def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split):
       if width != self.frame_width:
         raise ValueError("Generated frame has width %d while the class "
                          "assumes width %d." % (width, self.frame_width))
-      encoded_frame = image_utils.encode_images_as_png([unencoded_frame]).next()
+      encoded_frame = image_utils.encode_images_as_png([unencoded_frame]).__next__()
       features["image/encoded"] = [encoded_frame]
       features["image/format"] = ["png"]
       features["image/height"] = [height]
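Both hunks in this file are Python 3 compatibility fixes: dict.iteritems() and the generator method .next() exist only in Python 2. A minimal sketch of the difference, in plain Python with made-up data:

d = {"frame": 0, "action": 1}
for k, v in d.items():  # .iteritems() raises AttributeError on Python 3
  print(k, v)

gen = (x * x for x in range(3))
print(next(gen))  # the next() builtin works on both Python 2 and 3;
                  # .next() is Python-2-only and .__next__() is Python-3-only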

tensor2tensor/models/research/basic_conv_gen.py

Lines changed: 8 additions & 6 deletions
@@ -91,7 +91,9 @@ def body(self, features):
     reward_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
         labels=reward_gold, logits=reward_pred, name="reward_loss")
     reward_loss = tf.reduce_mean(reward_loss)
-    return x, {"reward": reward_loss}
+    return {"targets": x, "target_reward": reward_pred_h1}
+    # return x, {"reward": reward_loss}
+    # return x
 
 
 @registry.register_hparams

@@ -147,11 +149,11 @@ def deconv2d(cur, i, kernel_size, output_filters, activation=tf.nn.relu):
                              name="deconv2d" + str(i))
       return tf.depth_to_space(thicker, 2)
 
-    cur_frame = common_layers.standardize_images(features["inputs_0"])
-    prev_frame = common_layers.standardize_images(features["inputs_1"])
-
-    frames = tf.concat([cur_frame, prev_frame], axis=3)
-    frames = tf.reshape(frames, [-1, 210, 160, 6])
+    # cur_frame = common_layers.standardize_images(features["inputs_0"])
+    # prev_frame = common_layers.standardize_images(features["inputs_1"])
+    # frames = tf.concat([cur_frame, prev_frame], axis=3)
+    # frames = tf.reshape(frames, [-1, 210, 160, 6])
+    frames = common_layers.standardize_images(features["inputs"])
 
     h1 = tf.layers.conv2d(frames, filters=64, strides=2, kernel_size=(8, 8),
                           padding="SAME", activation=tf.nn.relu)
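With this change the model body returns a dict of outputs instead of a (logits, losses) pair. A minimal sketch (with placeholder strings standing in for the real tensors) of the contract this appears to target: the keys of the body's output dict line up with the dict-valued target_modality defined in GymDiscreteProblemWithAgent2.hparams above, which is exactly the check that t2t_model.top() used to assert (commented out later in this commit).

# Placeholders standing in for the real frame-prediction and reward tensors.
body_output = {"targets": "frame_prediction_tensor",
               "target_reward": "reward_logits_tensor"}
target_modality = {"targets": ("video", 256),
                   "target_reward": ("symbol", 3)}
assert set(body_output.keys()) == set(target_modality.keys())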

tensor2tensor/rl/envs/utils.py

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ def batch_env_factory(environment_lambda, hparams, num_agents, xvfb=False):
   else:
     cur_batch_env = define_batch_env(environment_lambda, num_agents, xvfb=xvfb)
   for w in wrappers:
-    cur_batch_env = w[0](batch_env, **w[1])
+    cur_batch_env = w[0](cur_batch_env, **w[1])
   return cur_batch_env
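This one-line fix makes the wrappers compose: each wrapper now wraps the environment produced by the previous loop iteration rather than a fixed environment, so every wrapper in the list takes effect. A minimal sketch with a hypothetical no-op wrapper (the real wrappers, e.g. MaxAndSkipWrapper, take a batched environment):

class NoOpWrapper(object):  # hypothetical stand-in for a real wrapper class
  def __init__(self, env, **kwargs):
    self.env = env

wrappers = [(NoOpWrapper, {}), (NoOpWrapper, {})]
cur_batch_env = "raw_batch_env"  # placeholder for the real batched environment
for w in wrappers:
  # Each iteration wraps the previously wrapped env, building a wrapper chain.
  cur_batch_env = w[0](cur_batch_env, **w[1])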
tensor2tensor/rl/model_rl_experiment.py

Lines changed: 12 additions & 8 deletions
@@ -25,7 +25,7 @@
 from tensor2tensor import problems
 from tensor2tensor.bin import t2t_trainer
 from tensor2tensor.rl import rl_trainer_lib
-from tensor2tensor.rl.envs.tf_atari_wrappers import PongT2TGeneratorHackWrapper
+from tensor2tensor.rl.envs.tf_atari_wrappers import ShiftRewardWrapper
 from tensor2tensor.rl.envs.tf_atari_wrappers import TimeLimitWrapper
 from tensor2tensor.utils import trainer_lib
 

@@ -52,10 +52,11 @@ def train(hparams, output_dir):
     time_delta = time.time() - start_time
     print(line+"Step {}.1. - generate data from policy. "
           "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta))))
-    FLAGS.problems = "gym_discrete_problem"
+    # FLAGS.problems = "gym_discrete_problem_with_agent"
+    FLAGS.problems = "gym_discrete_problem_with_agent2"
     FLAGS.agent_policy_path = last_model
     gym_problem = problems.problem(FLAGS.problems)
-    gym_problem.num_steps = hparams.true_env_generator_num_steps
+    # gym_problem.num_steps = hparams.true_env_generator_num_steps
     iter_data_dir = os.path.join(data_dir, str(iloop))
     tf.gfile.MakeDirs(iter_data_dir)
     gym_problem.generate_data(iter_data_dir, tmp_dir)

@@ -66,16 +67,19 @@ def train(hparams, output_dir):
     # 2. generate env model
     FLAGS.data_dir = iter_data_dir
     FLAGS.output_dir = output_dir
-    FLAGS.model = hparams.generative_model
+    # FLAGS.model = hparams.generative_model
+    FLAGS.model = "basic_conv_gen"
+    # FLAGS.model = "michigan_basic_conv_gen"
     FLAGS.hparams_set = hparams.generative_model_params
-    FLAGS.train_steps = hparams.model_train_steps
+    # FLAGS.train_steps = hparams.model_train_steps
+    FLAGS.train_steps = 1
     FLAGS.eval_steps = 1
     t2t_trainer.main([])
 
     time_delta = time.time() - start_time
-    print(line+"Step {}.3. - evalue env model. "
+    print(line+"Step {}.3. - evaluate env model. "
           "Time: {}".format(iloop, str(datetime.timedelta(seconds=time_delta))))
-    gym_simulated_problem = problems.problem("gym_simulated_discrete_problem")
+    gym_simulated_problem = problems.problem("gym_simulated_discrete_problem_with_agent")
     gym_simulated_problem.num_steps = hparams.simulated_env_generator_num_steps
     gym_simulated_problem.generate_data(iter_data_dir, tmp_dir)
 

@@ -93,7 +97,7 @@ def train(hparams, output_dir):
     ppo_dir = tempfile.mkdtemp(dir=data_dir, prefix="ppo_")
     in_graph_wrappers = [
         (TimeLimitWrapper, {"timelimit": 150}),
-        (PongT2TGeneratorHackWrapper, {"add_value": -2})]
+        (ShiftRewardWrapper, {"add_value": -2})]
     in_graph_wrappers += gym_problem.in_graph_wrappers
     ppo_hparams.add_hparam("in_graph_wrappers", in_graph_wrappers)
     rl_trainer_lib.train(ppo_hparams, "PongNoFrameskip-v4", ppo_dir)

tensor2tensor/utils/t2t_model.py

Lines changed: 6 additions & 6 deletions
@@ -338,9 +338,9 @@ def top(self, body_output, features):
         target_modality = self._problem_hparams.target_modality
       else:
         target_modality = {k: None for k in body_output.keys()}
-      assert set(body_output.keys()) == set(target_modality.keys()), (
-          "The keys of model_body's returned logits dict must match the keys "
-          "of problem_hparams.target_modality's dict.")
+      # assert set(body_output.keys()) == set(target_modality.keys()), (
+      #     "The keys of model_body's returned logits dict must match the keys "
+      #     "of problem_hparams.target_modality's dict.")
       logits = {}
       for k, v in six.iteritems(body_output):
         with tf.variable_scope(k):  # TODO(aidangomez): share variables here?

@@ -351,9 +351,9 @@ def top(self, body_output, features):
         target_modality = self._problem_hparams.target_modality
       else:
         target_modality = None
-      assert not isinstance(target_modality, dict), (
-          "model_body must return a dictionary of logits when "
-          "problem_hparams.target_modality is a dict.")
+      # assert not isinstance(target_modality, dict), (
+      #     "model_body must return a dictionary of logits when "
+      #     "problem_hparams.target_modality is a dict.")
       return self._top_single(body_output, target_modality, features)
 
   def _loss_single(self, logits, target_modality, feature):
