hub / github.com/tensorlayer/TensorLayer / update

Method update

examples/reinforcement_learning/tutorial_SAC.py:275–333 · view source on GitHub ↗

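Update all networks in SAC: trains both soft Q-networks and the policy network, adjusts the entropy coefficient alpha when auto_entropy is enabled (otherwise fixes alpha at 1), and then soft-updates the target Q-network(s).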

(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2)

Source from the content-addressed store, hash-verified

273	return target_net
274
275	def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2):
276	""" update all networks in SAC """
277	state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
278
279	reward = reward[:, np.newaxis] # expand dim
280	done = done[:, np.newaxis]
281
282	reward = reward_scale * (reward - np.mean(reward, axis=0)) / (
283	np.std(reward, axis=0) + 1e-6
284	) # normalize with batch mean and std; plus a small number to prevent numerical problem
285
286	# Training Q Function
287	new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state)
288	target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples
289	target_q_min = tf.minimum(
290	self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input)
291	) - self.alpha * next_log_prob
292	target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward
293	q_input = tf.concat([state, action], 1) # the dim 0 is number of samples
294
295	with tf.GradientTape() as q1_tape:
296	predicted_q_value1 = self.soft_q_net1(q_input)
297	q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value))
298	q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights)
299	self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights))
300
301	with tf.GradientTape() as q2_tape:
302	predicted_q_value2 = self.soft_q_net2(q_input)
303	q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value))
304	q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights)
305	self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights))
306
307	# Training Policy Function
308	with tf.GradientTape() as p_tape:
309	new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
310	new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples
311	""" implementation 1 """
312	predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input))
313	# """ implementation 2 """
314	# predicted_new_q_value = self.soft_q_net1(new_q_input)
315	policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value)
316	p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights)
317	self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights))
318
319	# Updating alpha w.r.t entropy
320	# alpha: trade-off between exploration (max entropy) and exploitation (max Q)
321	if auto_entropy is True:
322	with tf.GradientTape() as alpha_tape:
323	alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy)))
324	alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha])
325	self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha]))
326	self.alpha = tf.math.exp(self.log_alpha)
327	else: # fixed alpha
328	self.alpha = 1.
329	alpha_loss = 0
330
331	# Soft update the target value nets
332	self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau)

Callers 1

tutorial_SAC.py · File · 0.45

Calls 4

target_soft_update · Method · 0.95

gradient · Method · 0.80

sample · Method · 0.45

evaluate · Method · 0.45

Tested by

no test coverage detected