update all networks in SAC
(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2)
| 273 | return target_net |
| 274 | |
| 275 | def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2): |
| 276 | """ update all networks in SAC """ |
| 277 | state, action, reward, next_state, done = self.replay_buffer.sample(batch_size) |
| 278 | |
| 279 | reward = reward[:, np.newaxis] # expand dim |
| 280 | done = done[:, np.newaxis] |
| 281 | |
| 282 | reward = reward_scale * (reward - np.mean(reward, axis=0)) / ( |
| 283 | np.std(reward, axis=0) + 1e-6 |
| 284 | ) # normalize with batch mean and std; plus a small number to prevent numerical problem |
| 285 | |
| 286 | # Training Q Function |
| 287 | new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state) |
| 288 | target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples |
| 289 | target_q_min = tf.minimum( |
| 290 | self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input) |
| 291 | ) - self.alpha * next_log_prob |
| 292 | target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward |
| 293 | q_input = tf.concat([state, action], 1) # the dim 0 is number of samples |
| 294 | |
| 295 | with tf.GradientTape() as q1_tape: |
| 296 | predicted_q_value1 = self.soft_q_net1(q_input) |
| 297 | q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value)) |
| 298 | q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights) |
| 299 | self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights)) |
| 300 | |
| 301 | with tf.GradientTape() as q2_tape: |
| 302 | predicted_q_value2 = self.soft_q_net2(q_input) |
| 303 | q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value)) |
| 304 | q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights) |
| 305 | self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights)) |
| 306 | |
| 307 | # Training Policy Function |
| 308 | with tf.GradientTape() as p_tape: |
| 309 | new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state) |
| 310 | new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples |
| 311 | """ implementation 1 """ |
| 312 | predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input)) |
| 313 | # """ implementation 2 """ |
| 314 | # predicted_new_q_value = self.soft_q_net1(new_q_input) |
| 315 | policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value) |
| 316 | p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights) |
| 317 | self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights)) |
| 318 | |
| 319 | # Updating alpha w.r.t entropy |
| 320 | # alpha: trade-off between exploration (max entropy) and exploitation (max Q) |
| 321 | if auto_entropy is True: |
| 322 | with tf.GradientTape() as alpha_tape: |
| 323 | alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy))) |
| 324 | alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha]) |
| 325 | self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha])) |
| 326 | self.alpha = tf.math.exp(self.log_alpha) |
| 327 | else: # fixed alpha |
| 328 | self.alpha = 1. |
| 329 | alpha_loss = 0 |
| 330 | |
| 331 | # Soft update the target value nets |
| 332 | self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau) |
no test coverage detected