( env, sess, total_t, experience_replay_buffer, model, target_model, image_transformer, gamma, batch_size, epsilon, epsilon_change, epsilon_min)
| 276 | |
| 277 | |
| 278 | def play_one( |
| 279 | env, |
| 280 | sess, |
| 281 | total_t, |
| 282 | experience_replay_buffer, |
| 283 | model, |
| 284 | target_model, |
| 285 | image_transformer, |
| 286 | gamma, |
| 287 | batch_size, |
| 288 | epsilon, |
| 289 | epsilon_change, |
| 290 | epsilon_min): |
| 291 | |
| 292 | t0 = datetime.now() |
| 293 | |
| 294 | # Reset the environment |
| 295 | obs = env.reset() |
| 296 | obs_small = image_transformer.transform(obs, sess) |
| 297 | state = np.stack([obs_small] * 4, axis=2) |
| 298 | loss = None |
| 299 | |
| 300 | |
| 301 | total_time_training = 0 |
| 302 | num_steps_in_episode = 0 |
| 303 | episode_reward = 0 |
| 304 | |
| 305 | done = False |
| 306 | while not done: |
| 307 | |
| 308 | # Update target network |
| 309 | if total_t % TARGET_UPDATE_PERIOD == 0: |
| 310 | target_model.copy_from(model) |
| 311 | print("Copied model parameters to target network. total_t = %s, period = %s" % (total_t, TARGET_UPDATE_PERIOD)) |
| 312 | |
| 313 | |
| 314 | # Take action |
| 315 | action = model.sample_action(state, epsilon) |
| 316 | obs, reward, done, _ = env.step(action) |
| 317 | obs_small = image_transformer.transform(obs, sess) |
| 318 | next_state = update_state(state, obs_small) |
| 319 | |
| 320 | # Compute total reward |
| 321 | episode_reward += reward |
| 322 | |
| 323 | # Save the latest experience |
| 324 | experience_replay_buffer.add_experience(action, obs_small, reward, done) |
| 325 | |
| 326 | # Train the model, keep track of time |
| 327 | t0_2 = datetime.now() |
| 328 | loss = learn(model, target_model, experience_replay_buffer, gamma, batch_size) |
| 329 | dt = datetime.now() - t0_2 |
| 330 | |
| 331 | # More debugging info |
| 332 | total_time_training += dt.total_seconds() |
| 333 | num_steps_in_episode += 1 |
| 334 | |
| 335 |
no test coverage detected