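Update all networks in TD3: both Q-networks on every call; the policy network and the target networks every `policy_target_update_interval` calls.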
(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2)
| 260 | return target_net |
| 261 | |
def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    """Run one TD3 training step on a batch drawn from the replay buffer.

    Both Q-networks are fitted on every call. The policy network and the
    three target networks are only updated when ``self.update_cnt`` is a
    multiple of ``self.policy_target_update_interval``.

    :param batch_size: number of transitions requested from the replay buffer
    :param eval_noise_scale: noise scale handed to the target policy when it
        proposes the next action
    :param reward_scale: factor applied to the batch-normalized rewards
    :param gamma: discount factor applied to the bootstrapped target Q-value
    :param soft_tau: coefficient forwarded to ``target_soft_update``
    """
    self.update_cnt += 1
    state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

    # Give reward and done a trailing axis (presumably to line up with the
    # per-sample critic outputs -- TODO confirm against the replay buffer).
    reward = reward[:, np.newaxis]
    done = done[:, np.newaxis]

    # Standardize rewards with the batch statistics, then rescale; the small
    # constant keeps the division safe when the batch std is zero.
    centered_reward = reward - np.mean(reward, axis=0)
    reward_spread = np.std(reward, axis=0) + 1e-6
    reward = reward_scale * centered_reward / reward_spread

    # Next action proposed by the target policy, with evaluation noise
    # (the original comment describes it as clipped normal noise).
    new_next_action = self.target_policy_net.evaluate(next_state, eval_noise_scale=eval_noise_scale)

    # Bootstrapped target: take the smaller of the two target critics.
    # Inputs are concatenated along axis 1; axis 0 is the sample axis.
    target_q_input = tf.concat([next_state, new_next_action], 1)
    target_q_min = tf.minimum(self.target_q_net1(target_q_input), self.target_q_net2(target_q_input))
    not_done = 1 - done  # when done == 1 the target is the reward alone
    target_q_value = reward + not_done * gamma * target_q_min

    # Fit each critic to the shared target with a mean-squared error loss.
    q_input = tf.concat([state, action], 1)
    critics = (
        (self.q_net1, self.q_optimizer1),
        (self.q_net2, self.q_optimizer2),
    )
    for critic, optimizer in critics:
        with tf.GradientTape() as critic_tape:
            td_error = critic(q_input) - target_q_value
            critic_loss = tf.reduce_mean(tf.square(td_error))
        critic_weights = critic.trainable_weights
        critic_grads = critic_tape.gradient(critic_loss, critic_weights)
        optimizer.apply_gradients(zip(critic_grads, critic_weights))

    # Delayed update: skip the policy and target networks on other steps.
    if self.update_cnt % self.policy_target_update_interval != 0:
        return

    with tf.GradientTape() as policy_tape:
        # No noise here: deterministic policy gradient.
        new_action = self.policy_net.evaluate(state, eval_noise_scale=0.0)
        new_q_input = tf.concat([state, new_action], 1)
        # Only the first critic scores the policy's actions ("implementation 2"
        # in the original); the alternative noted there is the minimum of
        # both critics.
        policy_loss = -tf.reduce_mean(self.q_net1(new_q_input))
    policy_weights = self.policy_net.trainable_weights
    policy_grads = policy_tape.gradient(policy_loss, policy_weights)
    self.policy_optimizer.apply_gradients(zip(policy_grads, policy_weights))

    # Soft-update every target network from its online counterpart.
    self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau)
    self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau)
    self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)
| 315 | |
| 316 | def save(self): # save trained weights |
| 317 | path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID])) |
no test coverage detected