 
 import tensorflow as tf
 
-def get_otimiser(config):
+
+def get_optimiser(config):
 
   if config.optimizer=='Adam':
     return tf.train.AdamOptimizer(config.learning_rate)
 
   return config.optimizer(config.learning_rate)
 
 
-
 def define_ppo_step(observation, action, reward, done, value, old_pdf,
                     policy_factory, config):
 
@@ -58,7 +58,7 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf,
   entropy = new_policy_dist.entropy()
   entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy)
 
-  optimizer = get_otimiser(config)
+  optimizer = get_optimiser(config)
   losses = [policy_loss, value_loss, entropy_loss]
 
   gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses]
@@ -86,8 +86,8 @@ def define_ppo_epoch(memory, policy_factory, config):
   old_pdf = tf.stop_gradient(old_pdf)
 
   ppo_step_rets = tf.scan(
-      lambda a, x: define_ppo_step(observation, action, reward, done, value,
-                                   old_pdf, policy_factory, config),
+      lambda _1, _2: define_ppo_step(observation, action, reward, done, value,
+                                     old_pdf, policy_factory, config),
       tf.range(config.optimization_epochs),
       [0., 0., 0., 0., 0., 0.],
       parallel_iterations=1)
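
For context (not part of the patch): `tf.scan` calls its step function with the running accumulator and the current element of `elems`, and its third positional argument is the initializer. `define_ppo_epoch` deliberately ignores both callback arguments and simply re-runs the PPO step once per optimization epoch, so renaming them to `_1, _2` only documents that they are unused. A minimal sketch of that callback contract, assuming TensorFlow 1.x:

```python
import tensorflow as tf

def step(accumulator, element):
  # tf.scan passes (previous accumulator, current element); this toy step
  # uses the element, whereas the PPO lambda above intentionally ignores both.
  return accumulator + element

# Running sum over 0..4; the third positional argument is the initializer,
# playing the same role as [0., 0., 0., 0., 0., 0.] in define_ppo_epoch.
totals = tf.scan(step, tf.range(5), tf.constant(0))

with tf.Session() as sess:
  print(sess.run(totals))  # [ 0  1  3  6 10]
```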