(
env_fn,
ac_kwargs=dict(),
seed=0,
save_folder=None,
num_train_episodes=100,
test_agent_every=25,
replay_size=int(1e6),
gamma=0.99,
decay=0.995,
mu_lr=1e-3,
q_lr=1e-3,
batch_size=100,
start_steps=10000,
action_noise=0.1,
max_episode_length=1000)
| 85 | |
| 86 | ### Implement the DDPG algorithm ### |
| 87 | def ddpg( |
| 88 | env_fn, |
| 89 | ac_kwargs=dict(), |
| 90 | seed=0, |
| 91 | save_folder=None, |
| 92 | num_train_episodes=100, |
| 93 | test_agent_every=25, |
| 94 | replay_size=int(1e6), |
| 95 | gamma=0.99, |
| 96 | decay=0.995, |
| 97 | mu_lr=1e-3, |
| 98 | q_lr=1e-3, |
| 99 | batch_size=100, |
| 100 | start_steps=10000, |
| 101 | action_noise=0.1, |
| 102 | max_episode_length=1000): |
| 103 | |
| 104 | tf.set_random_seed(seed) |
| 105 | np.random.seed(seed) |
| 106 | |
| 107 | env, test_env = env_fn(), env_fn() |
| 108 | |
| 109 | # Record a video of the test agent only when save_folder is given (pass save_folder=None to skip recording) |
| 110 | if save_folder is not None: |
| 111 | test_env = gym.wrappers.Monitor(test_env, save_folder) |
| 112 | |
| 113 | # get size of state space and action space |
| 114 | num_states = env.observation_space.shape[0] |
| 115 | num_actions = env.action_space.shape[0] |
| 116 | |
| 117 | # Maximum value of action |
| 118 | # Assumes the low and high bounds have the same magnitude (i.e. low == -high) |
| 119 | # Assumes all actions have the same bounds |
| 120 | # May NOT be the case for all environments |
| 121 | action_max = env.action_space.high[0] |
| 122 | |
| 123 | # Create Tensorflow placeholders (neural network inputs) |
| 124 | X = tf.placeholder(dtype=tf.float32, shape=(None, num_states)) # state |
| 125 | A = tf.placeholder(dtype=tf.float32, shape=(None, num_actions)) # action |
| 126 | X2 = tf.placeholder(dtype=tf.float32, shape=(None, num_states)) # next state |
| 127 | R = tf.placeholder(dtype=tf.float32, shape=(None,)) # reward |
| 128 | D = tf.placeholder(dtype=tf.float32, shape=(None,)) # done |
| 129 | |
| 130 | # Main network outputs |
| 131 | with tf.variable_scope('main'): |
| 132 | mu, q, q_mu = CreateNetworks(X, A, num_actions, action_max, **ac_kwargs) |
| 133 | |
| 134 | # Target networks |
| 135 | with tf.variable_scope('target'): |
| 136 | # We don't need the Q network output with arbitrary input action A |
| 137 | # because that's not actually used in our loss functions |
| 138 | # NOTE 1: The state input is X2, NOT X |
| 139 | # We only care about max_a{ Q(s', a) } |
| 140 | # Where this is equal to Q(s', mu(s')) |
| 141 | # This is because it's used in the target calculation: r + gamma * max_a{ Q(s',a) } |
| 142 | # Where s' = X2 |
| 143 | # NOTE 2: We ignore the first 2 returned outputs (mu and q in the main network call) for the same reason |
| 144 | _, _, q_mu_targ = CreateNetworks(X2, A, num_actions, action_max, **ac_kwargs) |
no test coverage detected