| 36 | ### Create both the actor and critic networks at once ### |
| 37 | ### Q(s, mu(s)) returns the maximum Q for a given state s ### |
def CreateNetworks(
        s, a,
        num_actions,
        action_max,
        hidden_sizes=(300,),
        hidden_activation=tf.nn.relu,
        output_activation=tf.tanh):
    """Build the actor network and two heads of the critic network.

    The actor lives in variable scope 'mu' and the critic in scope 'q'.
    The critic is instantiated twice: once on (s, a) and once on
    (s, mu(s)); the second instantiation re-enters scope 'q' with
    reuse=True so both heads share the same weights.

    Args:
        s: State input; concatenated with the action along the last axis.
        a: Action input fed to the first critic head.
        num_actions: Size of the actor's output layer.
        action_max: Multiplier applied to the actor's output.
        hidden_sizes: Hidden layer sizes used by both actor and critic.
        hidden_activation: Activation passed to ANN for hidden layers.
        output_activation: Output activation passed to ANN for the actor
            (the critic is built with None as its output activation).

    Returns:
        A tuple (mu, q, q_mu): the scaled actor output, the critic on
        (s, a), and the critic on (s, mu).
    """

    def critic_head(action_input):
        # Must be called inside the 'q' variable scope so that the ops and
        # variables land under the same names as the two critic heads.
        state_action = tf.concat([s, action_input], axis=-1)
        # A fresh layer-size list is built for every ANN call.
        layer_sizes = list(hidden_sizes) + [1]
        raw_q = ANN(state_action, layer_sizes, hidden_activation, None)
        # Drop axis 1 of the ANN output (the final layer has a single unit).
        return tf.squeeze(raw_q, axis=1)

    with tf.variable_scope('mu'):
        actor_layers = list(hidden_sizes) + [num_actions]
        actor_out = ANN(s, actor_layers, hidden_activation, output_activation)
        mu = action_max * actor_out

    with tf.variable_scope('q'):
        # Critic evaluated on (state, action).
        q = critic_head(a)

    with tf.variable_scope('q', reuse=True):
        # Critic evaluated on (state, mu(state)), reusing the weights
        # created in the 'q' scope above.
        q_mu = critic_head(mu)

    return mu, q, q_mu
| 56 | |
| 57 | |
| 58 | ### The experience replay memory ### |