MCPcopy
hub / github.com/tensorlayer/TensorLayer / update

Method update

examples/reinforcement_learning/tutorial_TD3.py:262–314  ·  view source on GitHub ↗

update all networks in TD3

(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2)

Source from the content-addressed store, hash-verified

260 return target_net
261
def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    """Run one TD3 training step on a batch drawn from the replay buffer.

    Both Q-networks are regressed toward a shared target on every call.
    The policy network and the three target networks are only refreshed on
    calls where ``self.update_cnt`` is a multiple of
    ``self.policy_target_update_interval``.

    :param batch_size: number of transitions requested from ``self.replay_buffer``
    :param eval_noise_scale: noise scale forwarded to the target policy's ``evaluate``
    :param reward_scale: multiplier applied to the batch-standardized rewards
    :param gamma: discount applied to the bootstrapped target Q-value
    :param soft_tau: coefficient forwarded to ``self.target_soft_update``
    """
    self.update_cnt += 1
    state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)

    # Append a trailing axis to the per-sample reward and done arrays.
    reward = reward[:, np.newaxis]
    done = done[:, np.newaxis]

    # Standardize rewards with the batch mean and std; the small epsilon
    # keeps the division well-defined when the batch std is zero.
    batch_mean = np.mean(reward, axis=0)
    batch_std = np.std(reward, axis=0)
    reward = reward_scale * (reward - batch_mean) / (batch_std + 1e-6)

    # Next action proposed by the target policy, evaluated with noise.
    noisy_next_action = self.target_policy_net.evaluate(next_state, eval_noise_scale=eval_noise_scale)

    # Bootstrapped target: element-wise minimum of the two target critics.
    next_pair = tf.concat([next_state, noisy_next_action], 1)  # join along axis 1
    min_target_q = tf.minimum(self.target_q_net1(next_pair), self.target_q_net2(next_pair))
    td_target = reward + (1 - done) * gamma * min_target_q  # done == 1 leaves only the reward

    # Fit each critic to the shared target: q_net1 first, then q_net2.
    current_pair = tf.concat([state, action], 1)
    critics = (
        (self.q_net1, self.q_optimizer1),
        (self.q_net2, self.q_optimizer2),
    )
    for critic, critic_optimizer in critics:
        with tf.GradientTape() as critic_tape:
            critic_loss = tf.reduce_mean(tf.square(critic(current_pair) - td_target))
        critic_weights = critic.trainable_weights
        critic_grads = critic_tape.gradient(critic_loss, critic_weights)
        critic_optimizer.apply_gradients(zip(critic_grads, critic_weights))

    # Delayed update: the policy and the target networks are touched only
    # every ``policy_target_update_interval`` calls.
    if self.update_cnt % self.policy_target_update_interval != 0:
        return

    with tf.GradientTape() as actor_tape:
        # Zero noise: the policy gradient uses the deterministic action.
        actor_action = self.policy_net.evaluate(state, eval_noise_scale=0.0)
        actor_pair = tf.concat([state, actor_action], 1)
        # Alternative ("implementation 1"): score the action with
        # tf.minimum(self.q_net1(actor_pair), self.q_net2(actor_pair)).
        """ implementation 2 """
        actor_loss = -tf.reduce_mean(self.q_net1(actor_pair))
    actor_weights = self.policy_net.trainable_weights
    actor_grads = actor_tape.gradient(actor_loss, actor_weights)
    self.policy_optimizer.apply_gradients(zip(actor_grads, actor_weights))

    # Move every target network toward its online counterpart.
    soft_update = self.target_soft_update
    self.target_q_net1 = soft_update(self.q_net1, self.target_q_net1, soft_tau)
    self.target_q_net2 = soft_update(self.q_net2, self.target_q_net2, soft_tau)
    self.target_policy_net = soft_update(self.policy_net, self.target_policy_net, soft_tau)
315
316 def save(self): # save trained weights
317 path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))

Callers 1

tutorial_TD3.pyFile · 0.45

Calls 4

target_soft_updateMethod · 0.95
gradientMethod · 0.80
sampleMethod · 0.45
evaluateMethod · 0.45

Tested by

no test coverage detected