Update parameters :return: None
(self)
| 163 | ) # add randomness to action selection for exploration |
| 164 | |
def learn(self):
    """
    Run one training step on a random mini-batch drawn from the replay memory.

    The step decays ``self.var``, fits the critic to a one-step TD target
    built from the target networks, moves the actor in the direction that
    raises the critic's value estimate, and finally calls
    ``self.ema_update()``.

    :return: None
    """
    # Decay self.var on every learning step
    # (presumably the exploration-noise scale -- confirm against the action-selection code).
    self.var *= .9995

    # Sample BATCH_SIZE row indices over the whole buffer; np.random.choice
    # samples with replacement by default, so rows may repeat within a batch.
    batch_idx = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
    batch = self.memory[batch_idx, :]

    # Each memory row is sliced as [state | action | ... reward | next state]:
    # state and action are taken from the front, reward and next state from the back.
    state_end = self.state_dim
    action_end = state_end + self.action_dim
    states = batch[:, :state_end]
    actions = batch[:, state_end:action_end]
    rewards = batch[:, -self.state_dim - 1:-self.state_dim]
    next_states = batch[:, -self.state_dim:]

    # Critic step: regress Q(s, a) onto r + GAMMA * Q_target(s', actor_target(s')).
    with tf.GradientTape() as critic_tape:
        next_actions = self.actor_target(next_states)
        next_q = self.critic_target([next_states, next_actions])
        target_q = rewards + GAMMA * next_q
        current_q = self.critic([states, actions])
        critic_loss = tf.losses.mean_squared_error(target_q, current_q)
    critic_weights = self.critic.trainable_weights
    critic_grads = critic_tape.gradient(critic_loss, critic_weights)
    self.critic_opt.apply_gradients(zip(critic_grads, critic_weights))

    # Actor step: minimising -mean(Q(s, actor(s))) maximises the critic's estimate.
    # Only the actor's weights receive gradients here.
    with tf.GradientTape() as actor_tape:
        proposed_actions = self.actor(states)
        proposed_q = self.critic([states, proposed_actions])
        actor_loss = -tf.reduce_mean(proposed_q)
    actor_weights = self.actor.trainable_weights
    actor_grads = actor_tape.gradient(actor_loss, actor_weights)
    self.actor_opt.apply_gradients(zip(actor_grads, actor_weights))

    # Defined elsewhere in the class; presumably the soft (EMA) update of the
    # target networks -- confirm against its definition.
    self.ema_update()
| 194 | |
| 195 | def store_transition(self, s, a, r, s_): |
| 196 | """ |
no test coverage detected