@@ -109,7 +109,7 @@ def encode(x, x_space, hparams, name):
   with tf.variable_scope(name):
     (encoder_input, encoder_self_attention_bias,
      _) = transformer.transformer_prepare_encoder(x, x_space, hparams)
-    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout)
+    encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout)
     return transformer.transformer_encoder(
         encoder_input, encoder_self_attention_bias, hparams)

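The first hunk switches the encoder's input dropout to read the renamed hyperparameter `hparams.dropout`. Note that the TF1 `tf.nn.dropout` takes a *keep* probability, hence the `1.0 - hparams.dropout` conversion from a dropout rate. A minimal sketch, using a hypothetical rate in place of the hparam:

```python
import tensorflow as tf  # TF1-style API, matching the code above

dropout_rate = 0.1  # hypothetical stand-in for hparams.dropout
x = tf.random_uniform([2, 4, 8])
# tf.nn.dropout expects the probability of *keeping* a unit, so a
# dropout rate of 0.1 becomes keep_prob = 0.9.
y = tf.nn.dropout(x, keep_prob=1.0 - dropout_rate)
```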
@@ -143,7 +143,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams):
   max_prestep = hparams.kl_warmup_steps
   prob_targets = 0.95 if is_training else 1.0
   targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01
-  targets = dropmask(targets, targets_dropout_max, is_training)
+  targets = dropmask(targets, targets_dropout_max * 0.7, is_training)
   targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets),
                     lambda: targets, lambda: tf.zeros_like(targets))

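Context for the hunk above: `dropmask` (presumably defined elsewhere in this file) zeroes out a random fraction of target positions, and this commit scales that fraction by 0.7. A minimal sketch of the resulting schedule, assuming `common_layers.inverse_lin_decay` rises linearly from about 0.01 to 1.0 at `max_step` (its usual tensor2tensor behavior) and a hypothetical `kl_warmup_steps` of 10000; the real code computes this in-graph from the TF global step:

```python
# Sketch of the target-dropout schedule, not the in-graph implementation.
def inverse_lin_decay(step, max_step, min_value=0.01):
  # Rises linearly from min_value at step 0 to 1.0 at max_step, then stays.
  progress = min(step / float(max_step), 1.0)
  return progress * (1.0 - min_value) + min_value

kl_warmup_steps = 10000  # hypothetical value of hparams.kl_warmup_steps
for step in (0, 2500, 5000, 10000):
  targets_dropout_max = inverse_lin_decay(step, kl_warmup_steps) - 0.01
  # The 0.7 factor from this commit caps target dropout near 69% at the
  # end of warm-up, instead of the previous ~99%.
  print(step, targets_dropout_max * 0.7)
```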
@@ -168,7 +168,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams):
   # ret = tf.squeeze(to_decode, axis=2)

   # Randomize decoder inputs..
-  kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 3.0
+  kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 10.0
   return tf.expand_dims(ret, axis=2), kl_loss

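The last hunk raises the fully warmed-up KL multiplier from 3.0 to 10.0, so the KL term weighs more heavily in the loss once warm-up completes. A minimal sketch of that schedule, assuming `common_layers.inverse_exp_decay` rises exponentially from about 0.01 to 1.0 at `max_step` (its usual tensor2tensor behavior) and the same hypothetical `kl_warmup_steps`:

```python
import math

# Sketch of the KL warm-up weight, not the in-graph implementation.
def inverse_exp_decay(step, max_step, min_value=0.01):
  # Rises exponentially from min_value at step 0 to 1.0 at max_step.
  inv_base = math.exp(math.log(min_value) / float(max_step))
  return inv_base ** max(float(max_step) - step, 0.0)

kl_warmup_steps = 10000  # hypothetical value of hparams.kl_warmup_steps
for step in (0, 5000, 10000):
  # After this commit the fully warmed-up KL weight is 10.0 rather than 3.0.
  print(step, inverse_exp_decay(step, kl_warmup_steps) * 10.0)
```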