@@ -151,8 +151,8 @@ def train_agent(problem_name, agent_model_dir,
   ppo_hparams.num_agents = hparams.ppo_num_agents
   ppo_hparams.problem = gym_problem
   ppo_hparams.world_model_dir = world_model_dir
-  # 4x for the StackAndSkipWrapper
-  ppo_time_limit = max(ppo_hparams.epoch_length * 4 + 20, 250)
+  # 4x for the StackAndSkipWrapper minus one to always finish for reporting.
+  ppo_time_limit = (ppo_hparams.epoch_length - 1) * 4
 
   in_graph_wrappers = [
       (TimeLimitWrapper, {"timelimit": ppo_time_limit}),
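The old formula enforced a floor of 250 steps regardless of the epoch length; the new one ties the rollout limit strictly to the epoch length, minus one step so the episode always finishes inside the reported window. A quick arithmetic check, assuming the ppo_epoch_length of 60 introduced later in this commit (illustrative only, not part of the change):

# Illustrative comparison of the old and new time-limit formulas for a
# hypothetical epoch_length of 60 (the default set in rl_modelrl_base below).
epoch_length = 60
old_limit = max(epoch_length * 4 + 20, 250)  # 260: 4x frame skip plus slack, floored at 250
new_limit = (epoch_length - 1) * 4           # 236: always below epoch_length * 4
print(old_limit, new_limit)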
@@ -456,15 +456,15 @@ def rl_modelrl_base():
       simulated_env_generator_num_steps=2000,
       simulation_random_starts=True,
       intrinsic_reward_scale=0.,
-      ppo_epochs_num=250,  # This should be enough to see something
+      ppo_epochs_num=200,  # This should be enough to see something
       # Our simulated envs do not know how to reset.
       # You should set ppo_time_limit to the value you believe that
       # the simulated env produces a reasonable output.
       ppo_time_limit=200,  # TODO(blazej): this param is unused
       # It makes sense to have ppo_time_limit=ppo_epoch_length,
       # though it is not necessary.
-      ppo_epoch_length=40,
-      ppo_num_agents=20,
+      ppo_epoch_length=60,
+      ppo_num_agents=16,
       # Whether the PPO agent should be restored from the previous iteration, or
       # should start fresh each time.
       ppo_continue_training=True,
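The agent count drops from 20 to 16 while the per-agent epoch length grows from 40 to 60, and the number of PPO epochs falls from 250 to 200. Assuming the experience gathered per PPO epoch scales as num_agents * epoch_length (an assumption made here for illustration, not something the diff states), data per epoch actually increases slightly:

# Hypothetical bookkeeping only: transitions per PPO epoch, assuming each
# of the parallel agents contributes epoch_length steps.
old_per_epoch = 20 * 40  # 800 steps with the previous settings
new_per_epoch = 16 * 60  # 960 steps with the updated settings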
@@ -579,7 +579,7 @@ def rl_modelrl_ae_base():
   hparams = rl_modelrl_base()
   hparams.ppo_params = "ppo_pong_ae_base"
   hparams.generative_model_params = "basic_conv_ae"
-  hparams.autoencoder_train_steps = 100000
+  hparams.autoencoder_train_steps = 30000
   return hparams
 
 
@@ -603,10 +603,7 @@ def rl_modelrl_ae_l2_base():
 def rl_modelrl_ae_medium():
   """Medium parameter set for autoencoders."""
   hparams = rl_modelrl_ae_base()
-  hparams.autoencoder_train_steps //= 2
   hparams.true_env_generator_num_steps //= 2
-  hparams.model_train_steps //= 2
-  hparams.ppo_epochs_num //= 2
   return hparams
 
 
@@ -730,8 +727,11 @@ def rl_modelrl_freeway_ae_medium():
 @registry.register_hparams
 def rl_modelrl_freeway_short():
   """Short set for testing Freeway."""
-  hparams = rl_modelrl_short()
-  hparams.game = "freeway"
+  hparams = rl_modelrl_freeway_medium()
+  hparams.true_env_generator_num_steps //= 5
+  hparams.model_train_steps //= 2
+  hparams.ppo_epochs_num //= 2
+  hparams.intrinsic_reward_scale = 0.1
   return hparams
 
 
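rl_modelrl_freeway_short is now derived from the Freeway medium set rather than the generic short set, so the game selection and any other Freeway-specific overrides are inherited from rl_modelrl_freeway_medium, and the short variant only scales the schedules down and adds a small intrinsic reward. A minimal sketch of the new derivation chain, using a plain dict as a stand-in for the real HParams object:

# Stand-in sketch (plain dict, not the actual HParams class) of what the
# rewritten rl_modelrl_freeway_short overrides on top of the medium set.
def freeway_short_sketch(freeway_medium_hparams):
  hparams = dict(freeway_medium_hparams)
  hparams["true_env_generator_num_steps"] //= 5  # far less real-env data
  hparams["model_train_steps"] //= 2             # shorter world-model training
  hparams["ppo_epochs_num"] //= 2                # fewer PPO epochs
  hparams["intrinsic_reward_scale"] = 0.1        # small exploration bonus
  return hparams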