Update critic network :param reward: cumulative reward batch :param state: state batch :return: None
(self, reward, state)
| 139 | return kl_mean |
| 140 | |
| 141 | def train_critic(self, reward, state): |
| 142 | """ |
| 143 | Update critic network by minimizing the mean squared difference between reward and self.critic(state) |
| 144 | :param reward: cumulative reward batch (converted to a float32 numpy array before use) |
| 145 | :param state: state batch passed to self.critic |
| 146 | :return: None |
| 147 | """ |
| 148 | reward = np.array(reward, dtype=np.float32) |
| 149 | with tf.GradientTape() as tape: |
| 150 | advantage = reward - self.critic(state) |
| 151 | loss = tf.reduce_mean(tf.square(advantage)) |
| 152 | grad = tape.gradient(loss, self.critic.trainable_weights) |
| 153 | self.critic_opt.apply_gradients(zip(grad, self.critic.trainable_weights)) |
| 154 | |
| 155 | def update(self): |
| 156 | """ |