hub / github.com/tensorlayer/TensorLayer / update

Method update

examples/reinforcement_learning/tutorial_TD3.py:262–314 · view source on GitHub ↗

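Update all networks in TD3: both Q-networks on every call, and the policy network plus the target networks once every `policy_target_update_interval` calls.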

(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2)

Source from the content-addressed store, hash-verified

260	return target_net
261
def update(self, batch_size, eval_noise_scale, reward_scale=10., gamma=0.9, soft_tau=1e-2):
    """Run one TD3 training step on a batch drawn from the replay buffer.

    Both Q-networks are trained on every call. The policy network and the
    three target networks are only touched on calls where ``self.update_cnt``
    is a multiple of ``self.policy_target_update_interval``.

    :param batch_size: number of transitions requested from ``self.replay_buffer.sample``.
    :param eval_noise_scale: noise scale forwarded to the target policy's ``evaluate``.
    :param reward_scale: factor applied to the batch-normalized rewards.
    :param gamma: discount applied to the target Q-value of the next state.
    :param soft_tau: forwarded to ``self.target_soft_update`` (presumably the
        interpolation coefficient -- confirm against that method).
    """
    self.update_cnt += 1
    states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

    # Add a second axis so rewards/dones line up with the Q-network outputs
    # (assumes the sampled rewards and dones are 1-D -- confirm against the buffer).
    rewards = rewards[:, np.newaxis]
    dones = dones[:, np.newaxis]

    # Action for the next state from the target policy; the original code notes
    # that ``evaluate`` applies clipped normal noise scaled by eval_noise_scale.
    next_actions = self.target_policy_net.evaluate(next_states, eval_noise_scale=eval_noise_scale)

    # Normalize rewards with the batch mean and std; the small constant guards
    # against division by zero when every reward in the batch is equal.
    batch_mean = np.mean(rewards, axis=0)
    batch_std = np.std(rewards, axis=0)
    rewards = reward_scale * (rewards - batch_mean) / (batch_std + 1e-6)

    # Target value: take the smaller of the two target critics' estimates.
    next_q_input = tf.concat([next_states, next_actions], axis=1)  # axis 0 is the sample axis
    min_next_q = tf.minimum(self.target_q_net1(next_q_input), self.target_q_net2(next_q_input))
    # When done == 1 the bootstrap term vanishes and only the reward remains.
    td_target = rewards + (1 - dones) * gamma * min_next_q

    # Train each critic in turn (q_net1 first, then q_net2) on the same target.
    critic_input = tf.concat([states, actions], axis=1)
    critics = (
        (self.q_net1, self.q_optimizer1),
        (self.q_net2, self.q_optimizer2),
    )
    for critic, optimizer in critics:
        with tf.GradientTape() as critic_tape:
            q_pred = critic(critic_input)
            critic_loss = tf.reduce_mean(tf.square(q_pred - td_target))
        critic_grads = critic_tape.gradient(critic_loss, critic.trainable_weights)
        optimizer.apply_gradients(zip(critic_grads, critic.trainable_weights))

    # Delayed update: the policy and target networks are skipped on most calls.
    if self.update_cnt % self.policy_target_update_interval != 0:
        return

    with tf.GradientTape() as actor_tape:
        # No exploration noise here: deterministic policy gradient.
        det_actions = self.policy_net.evaluate(states, eval_noise_scale=0.0)
        # The policy is scored by q_net1 alone; the alternative left commented out
        # in the original used the minimum of q_net1 and q_net2 instead.
        actor_q = self.q_net1(tf.concat([states, det_actions], axis=1))
        actor_loss = -tf.reduce_mean(actor_q)
    actor_grads = actor_tape.gradient(actor_loss, self.policy_net.trainable_weights)
    self.policy_optimizer.apply_gradients(zip(actor_grads, self.policy_net.trainable_weights))

    # Soft update the target nets from their online counterparts.
    self.target_q_net1 = self.target_soft_update(self.q_net1, self.target_q_net1, soft_tau)
    self.target_q_net2 = self.target_soft_update(self.q_net2, self.target_q_net2, soft_tau)
    self.target_policy_net = self.target_soft_update(self.policy_net, self.target_policy_net, soft_tau)
315
316	def save(self): # save trained weights
317	path = os.path.join('model', '_'.join([ALG_NAME, ENV_ID]))

Callers 1

tutorial_TD3.pyFile · 0.45

Calls 4

target_soft_updateMethod · 0.95

gradientMethod · 0.80

sampleMethod · 0.45

evaluateMethod · 0.45

Tested by

no test coverage detected