Train the value function (the critic network). :param states: state batch :param rewards_to_go: rewards-to-go batch :return: None
(self, states, rewards_to_go)
| 267 | return gradient, loss |
| 268 | |
def train_vf(self, states, rewards_to_go):
    """
    Fit the value function by taking one optimizer step on the critic.

    The loss is the mean squared difference between the rewards-to-go
    and column 0 of the critic's output for the given states.

    :param states: state batch
    :param rewards_to_go: rewards-to-go batch
    :return: None
    """
    critic = self.critic
    # Record the forward pass so the loss can be differentiated
    # with respect to the critic's weights.
    with tf.GradientTape() as tape:
        predicted = critic(states)[:, 0]
        residual = rewards_to_go - predicted
        loss = tf.reduce_mean(residual ** 2)
    weights = critic.trainable_weights
    gradients = tape.gradient(loss, weights)
    # Pair each gradient with its variable for the optimizer update.
    self.critic_optimizer.apply_gradients(zip(gradients, weights))
| 281 | |
| 282 | def kl(self, states, old_mean, old_log_std): |
| 283 | """ |