MCPcopy
hub / github.com/tensorlayer/TensorLayer / train_actor

Method train_actor

examples/reinforcement_learning/tutorial_PPO.py:111–139  ·  view source on GitHub ↗

Update policy network :param state: state batch :param action: action batch :param adv: advantage batch :param old_pi: old pi distribution :return: kl_mean or None

(self, state, action, adv, old_pi)

Source from the content-addressed store, hash-verified

109 self.action_bound = action_bound
110
111 def train_actor(self, state, action, adv, old_pi):
112 """
113 Update policy network
114 :param state: state batch
115 :param action: action batch
116 :param adv: advantage batch
117 :param old_pi: old pi distribution
118 :return: kl_mean or None
119 """
120 with tf.GradientTape() as tape:
121 mean, std = self.actor(state), tf.exp(self.actor.logstd)
122 pi = tfp.distributions.Normal(mean, std)
123
124 ratio = tf.exp(pi.log_prob(action) - old_pi.log_prob(action))
125 surr = ratio * adv
126 if self.method == 'penalty': # ppo penalty
127 kl = tfp.distributions.kl_divergence(old_pi, pi)
128 kl_mean = tf.reduce_mean(kl)
129 loss = -(tf.reduce_mean(surr - self.lam * kl))
130 else: # ppo clip
131 loss = -tf.reduce_mean(
132 tf.minimum(surr,
133 tf.clip_by_value(ratio, 1. - self.epsilon, 1. + self.epsilon) * adv)
134 )
135 a_gard = tape.gradient(loss, self.actor.trainable_weights)
136 self.actor_opt.apply_gradients(zip(a_gard, self.actor.trainable_weights))
137
138 if self.method == 'kl_pen':
139 return kl_mean
140
141 def train_critic(self, reward, state):
142 """

Callers 1

updateMethod · 0.95

Calls 1

gradientMethod · 0.80

Tested by

no test coverage detected