MCPcopy
hub / github.com/tensorlayer/TensorLayer / update

Method update

examples/reinforcement_learning/tutorial_SAC.py:275–333  ·  view source on GitHub ↗

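Update all networks in SAC for one sampled batch: train both soft Q-networks and the policy network, set the entropy temperature alpha (learned when auto_entropy is True, otherwise fixed at 1), and soft-update the target Q-network(s).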

(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2)

Source from the content-addressed store, hash-verified

273 return target_net
274
275 def update(self, batch_size, reward_scale=10., auto_entropy=True, target_entropy=-2, gamma=0.99, soft_tau=1e-2):
276 """ update all networks in SAC """
277 state, action, reward, next_state, done = self.replay_buffer.sample(batch_size)
278
279 reward = reward[:, np.newaxis] # expand dim
280 done = done[:, np.newaxis]
281
282 reward = reward_scale * (reward - np.mean(reward, axis=0)) / (
283 np.std(reward, axis=0) + 1e-6
284 ) # normalize with batch mean and std; plus a small number to prevent numerical problem
285
286 # Training Q Function
287 new_next_action, next_log_prob, _, _, _ = self.policy_net.evaluate(next_state)
288 target_q_input = tf.concat([next_state, new_next_action], 1) # the dim 0 is number of samples
289 target_q_min = tf.minimum(
290 self.target_soft_q_net1(target_q_input), self.target_soft_q_net2(target_q_input)
291 ) - self.alpha * next_log_prob
292 target_q_value = reward + (1 - done) * gamma * target_q_min # if done==1, only reward
293 q_input = tf.concat([state, action], 1) # the dim 0 is number of samples
294
295 with tf.GradientTape() as q1_tape:
296 predicted_q_value1 = self.soft_q_net1(q_input)
297 q_value_loss1 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value1, target_q_value))
298 q1_grad = q1_tape.gradient(q_value_loss1, self.soft_q_net1.trainable_weights)
299 self.soft_q_optimizer1.apply_gradients(zip(q1_grad, self.soft_q_net1.trainable_weights))
300
301 with tf.GradientTape() as q2_tape:
302 predicted_q_value2 = self.soft_q_net2(q_input)
303 q_value_loss2 = tf.reduce_mean(tf.losses.mean_squared_error(predicted_q_value2, target_q_value))
304 q2_grad = q2_tape.gradient(q_value_loss2, self.soft_q_net2.trainable_weights)
305 self.soft_q_optimizer2.apply_gradients(zip(q2_grad, self.soft_q_net2.trainable_weights))
306
307 # Training Policy Function
308 with tf.GradientTape() as p_tape:
309 new_action, log_prob, z, mean, log_std = self.policy_net.evaluate(state)
310 new_q_input = tf.concat([state, new_action], 1) # the dim 0 is number of samples
311 """ implementation 1 """
312 predicted_new_q_value = tf.minimum(self.soft_q_net1(new_q_input), self.soft_q_net2(new_q_input))
313 # """ implementation 2 """
314 # predicted_new_q_value = self.soft_q_net1(new_q_input)
315 policy_loss = tf.reduce_mean(self.alpha * log_prob - predicted_new_q_value)
316 p_grad = p_tape.gradient(policy_loss, self.policy_net.trainable_weights)
317 self.policy_optimizer.apply_gradients(zip(p_grad, self.policy_net.trainable_weights))
318
319 # Updating alpha w.r.t entropy
320 # alpha: trade-off between exploration (max entropy) and exploitation (max Q)
321 if auto_entropy is True:
322 with tf.GradientTape() as alpha_tape:
323 alpha_loss = -tf.reduce_mean((self.log_alpha * (log_prob + target_entropy)))
324 alpha_grad = alpha_tape.gradient(alpha_loss, [self.log_alpha])
325 self.alpha_optimizer.apply_gradients(zip(alpha_grad, [self.log_alpha]))
326 self.alpha = tf.math.exp(self.log_alpha)
327 else: # fixed alpha
328 self.alpha = 1.
329 alpha_loss = 0
330
331 # Soft update the target value nets
332 self.target_soft_q_net1 = self.target_soft_update(self.soft_q_net1, self.target_soft_q_net1, soft_tau)

Callers 1

tutorial_SAC.py · File · 0.45

Calls 4

target_soft_update · Method · 0.95
gradient · Method · 0.80
sample · Method · 0.45
evaluate · Method · 0.45

Tested by

no test coverage detected