import tensorflow as tf


+def get_optimiser(config):
+
+  if config.optimizer == 'Adam':
+    return tf.train.AdamOptimizer(config.learning_rate)
+
+  return config.optimizer(config.learning_rate)
+
+
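For reference, a minimal sketch of how this helper might be called. The Config namedtuple below is a stand-in assumption for the real hparams object; only the optimizer and learning_rate fields used above are assumed.

# Hypothetical usage sketch, not part of this change. config.optimizer is
# either the string 'Adam' or an optimizer constructor taking a learning rate.
import collections

Config = collections.namedtuple('Config', ['optimizer', 'learning_rate'])

adam = get_optimiser(Config(optimizer='Adam', learning_rate=1e-4))
sgd = get_optimiser(Config(optimizer=tf.train.GradientDescentOptimizer,
                           learning_rate=1e-4))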
def define_ppo_step(observation, action, reward, done, value, old_pdf,
                    policy_factory, config):
-  """A step of PPO."""
+
  new_policy_dist, new_value, _ = policy_factory(observation)
  new_pdf = new_policy_dist.prob(action)

@@ -43,27 +51,30 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf,
                                   ratio * advantage_normalized)
  policy_loss = -tf.reduce_mean(surrogate_objective)

-  value_error = calculate_discounted_return(
-      reward, new_value, done, config.gae_gamma, config.gae_lambda) - value
+  value_error = calculate_generalized_advantage_estimator(
+      reward, new_value, done, config.gae_gamma, config.gae_lambda)
  value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2)

  entropy = new_policy_dist.entropy()
  entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy)

-  total_loss = policy_loss + value_loss + entropy_loss
+  optimizer = get_optimiser(config)
+  losses = [policy_loss, value_loss, entropy_loss]

-  optimization_op = tf.contrib.layers.optimize_loss(
-      loss=total_loss,
-      global_step=tf.train.get_or_create_global_step(),
-      optimizer=config.optimizer,
-      learning_rate=config.learning_rate)
+  gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses]

-  with tf.control_dependencies([optimization_op]):
-    return [tf.identity(x) for x in (policy_loss, value_loss, entropy_loss)]
+  gradients_norms = [tf.global_norm(gradient[0]) for gradient in gradients]
+
+  gradients_flat = sum([gradient[0] for gradient in gradients], ())
+  gradients_variables_flat = sum([gradient[1] for gradient in gradients], ())
+
+  optimize_op = optimizer.apply_gradients(
+      zip(gradients_flat, gradients_variables_flat))
+
+  with tf.control_dependencies([optimize_op]):
+    return [tf.identity(x) for x in losses + gradients_norms]


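A note on the gradient bookkeeping above: compute_gradients returns a list of (gradient, variable) pairs, zip(*...) transposes that list into a (gradients, variables) pair of tuples, and sum(..., ()) concatenates those tuples across the three losses before apply_gradients. A small pure-Python sketch of just that reshaping, with placeholder strings standing in for tensors:

# Illustrative sketch of the list manipulation only; 'g0', 'v0', etc. are
# placeholders for gradient tensors and variables.
pairs = [('g0', 'v0'), ('g1', 'v1')]            # what compute_gradients returns
grads_and_vars = list(zip(*pairs))              # [('g0', 'g1'), ('v0', 'v1')]
per_loss = [grads_and_vars, grads_and_vars]     # one entry per loss
flat_grads = sum([g[0] for g in per_loss], ())  # ('g0', 'g1', 'g0', 'g1')
flat_vars = sum([g[1] for g in per_loss], ())   # ('v0', 'v1', 'v0', 'v1')
assert list(zip(flat_grads, flat_vars))[0] == ('g0', 'v0')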
def define_ppo_epoch(memory, policy_factory, config):
-  """An epoch of PPO."""
  observation, reward, done, action, old_pdf, value = memory

  # This is to avoid propagating gradients through simulation of simulation
@@ -74,59 +85,39 @@ def define_ppo_epoch(memory, policy_factory, config):
  value = tf.stop_gradient(value)
  old_pdf = tf.stop_gradient(old_pdf)

-  policy_loss, value_loss, entropy_loss = tf.scan(
-      lambda _1, _2: define_ppo_step(  # pylint: disable=g-long-lambda
-          observation, action, reward, done, value,
-          old_pdf, policy_factory, config),
+  ppo_step_rets = tf.scan(
+      lambda _1, _2: define_ppo_step(observation, action, reward, done, value,
+                                     old_pdf, policy_factory, config),
      tf.range(config.optimization_epochs),
-      [0., 0., 0.],
+      [0., 0., 0., 0., 0., 0.],
      parallel_iterations=1)

-  summaries = [tf.summary.scalar("policy loss", tf.reduce_mean(policy_loss)),
-               tf.summary.scalar("value loss", tf.reduce_mean(value_loss)),
-               tf.summary.scalar("entropy loss", tf.reduce_mean(entropy_loss))]
+  ppo_summaries = [tf.reduce_mean(ret) for ret in ppo_step_rets]
+  summaries_names = ["policy_loss", "value_loss", "entropy_loss",
+                     "policy_gradient", "value_gradient", "entropy_gradient"]

+  summaries = [tf.summary.scalar(summary_name, summary)
+               for summary_name, summary in zip(summaries_names, ppo_summaries)]
  losses_summary = tf.summary.merge(summaries)

-  losses_summary = tf.Print(losses_summary,
-                            [tf.reduce_mean(policy_loss)], "policy loss: ")
-  losses_summary = tf.Print(losses_summary,
-                            [tf.reduce_mean(value_loss)], "value loss: ")
-  losses_summary = tf.Print(losses_summary,
-                            [tf.reduce_mean(entropy_loss)], "entropy loss: ")
+  for summary_name, summary in zip(summaries_names, ppo_summaries):
+    losses_summary = tf.Print(losses_summary, [summary], summary_name + ": ")

  return losses_summary

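As a rough usage sketch (an assumption about the surrounding training loop, not part of this change): the merged summary returned here can be evaluated in a session and written out with a tf.summary.FileWriter. The names memory, policy_factory, config, and num_iterations are assumed to be defined elsewhere, and the log directory is a placeholder.

# Hypothetical driver code; memory, policy_factory and config are assumed to
# be built by the surrounding training setup.
summary_op = define_ppo_epoch(memory, policy_factory, config)
writer = tf.summary.FileWriter('/tmp/ppo_logs')
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for step in range(num_iterations):  # num_iterations: assumed outer-loop bound
    writer.add_summary(sess.run(summary_op), global_step=step)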
+def calculate_generalized_advantage_estimator(reward, value, done,
+                                              gae_gamma, gae_lambda):
+  """Generalized advantage estimator."""

-def calculate_discounted_return(reward, value, done, discount, unused_lambda):
-  """Discounted Monte-Carlo returns."""
-  done = tf.cast(done, tf.float32)
-  reward2 = done[-1, :] * reward[-1, :] + (1 - done[-1, :]) * value[-1, :]
-  reward = tf.concat([reward[:-1,], reward2[None, ...]], axis=0)
-  return_ = tf.reverse(tf.scan(
-      lambda agg, cur: cur[0] + (1 - cur[1]) * discount * agg,  # fn
-      [tf.reverse(reward, [0]),  # elem
-       tf.reverse(done, [0])],
-      tf.zeros_like(reward[0, :]),  # initializer
-      1,
-      False), [0])
-  return tf.check_numerics(return_, "return")
-
-
-def calculate_generalized_advantage_estimator(
-    reward, value, done, gae_gamma, gae_lambda):
-  """Generalized advantage estimator."""
-  # Below is slight weirdness: we set the last reward to 0.
-  # This makes the advantage be 0 in the last timestep.
-  reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0)
-  next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0)
-  next_not_done = 1 - tf.cast(tf.concat(
-      [done[1:, :], tf.zeros_like(done[-1:, :])], axis=0), tf.float32)
+  # Below is slight weirdness: we set the last reward to 0.
+  # This makes the advantage be 0 in the last timestep.
+  reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0)
+  next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0)
+  next_not_done = 1 - tf.cast(
+      tf.concat([done[1:, :], tf.zeros_like(done[-1:, :])], axis=0), tf.float32)
  delta = reward + gae_gamma * next_value * next_not_done - value

  return_ = tf.reverse(tf.scan(
      lambda agg, cur: cur[0] + cur[1] * gae_gamma * gae_lambda * agg,
      [tf.reverse(delta, [0]), tf.reverse(next_not_done, [0])],
      tf.zeros_like(delta[0, :]),
-      1, False), [0])
-  return tf.check_numerics(tf.stop_gradient(return_), "return")
+      parallel_iterations=1), [0])
+  return tf.check_numerics(return_, "return")
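To make the scan above easier to check, here is an illustrative NumPy reference for the same recursion, advantage[t] = delta[t] + gae_gamma * gae_lambda * next_not_done[t] * advantage[t + 1]; it is a sketch for cross-checking, not part of the change.

# Illustrative NumPy cross-check of the recursion above (not part of the diff).
# reward, value, done are arrays of shape [num_steps, batch].
import numpy as np

def gae_reference(reward, value, done, gae_gamma, gae_lambda):
  num_steps = reward.shape[0]
  done = done.astype(np.float32)
  advantage = np.zeros_like(value)
  running = np.zeros_like(value[0])
  for t in reversed(range(num_steps)):
    if t == num_steps - 1:
      # Same trick as above: last reward := value, so the last advantage is 0.
      r, next_value, next_not_done = value[t], 0.0, 1.0
    else:
      r, next_value, next_not_done = reward[t], value[t + 1], 1.0 - done[t + 1]
    delta = r + gae_gamma * next_value * next_not_done - value[t]
    running = delta + gae_gamma * gae_lambda * next_not_done * running
    advantage[t] = running
  return advantage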