(self, num_outputs, reg=0.01)
| 37 | |
class PolicyNetwork:
    """Policy head of an actor-critic model, built as a TensorFlow 1.x graph.

    Constructing an instance adds placeholders, the shared feature extractor
    (variable scope ``"shared"``, ``reuse=False``), a linear policy layer, the
    entropy-regularised policy-gradient loss and its gradients to the current
    default graph.

    Because the shared scope is opened with ``reuse=False``, the
    PolicyNetwork MUST be created before the ValueNetwork, which is expected
    to open the same scope with ``reuse=True``.

    Args:
        num_outputs: Number of discrete actions (width of the logits layer).
        reg: Weight of the entropy bonus added to the objective to encourage
            exploration.

    Attributes:
        states: uint8 placeholder, shape ``[None, 84, 84, 4]``.
        advantage: float32 placeholder, shape ``[None]``.
        actions: int32 placeholder, shape ``[None]``.
        logits / probs: Policy layer output and its softmax.
        sample_action: Op sampling an action from the categorical policy.
        entropy: Per-sample policy entropy.
        selected_action_probs: Probability of each chosen action.
        loss: Scalar loss (negated, summed over the batch).
        optimizer: The RMSProp optimizer instance.
        grads_and_vars: ``[grad, var]`` pairs with ``None`` gradients dropped.
    """

    def __init__(self, num_outputs, reg=0.01):
        self.num_outputs = num_outputs

        # Graph inputs
        # After resizing we have 4 consecutive frames of size 84 x 84
        self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # Advantage = G - V(s)
        self.advantage = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Selected actions
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        # Since we set reuse=False here, that means we MUST
        # create the PolicyNetwork before creating the ValueNetwork
        # ValueNetwork will use reuse=True
        with tf.variable_scope("shared", reuse=False):
            fc1 = build_feature_extractor(self.states)

        # Use a separate scope for output and loss
        # NOTE(review): everything below is assumed to live inside this
        # scope (per the comment above) - confirm against the original layout.
        with tf.variable_scope("policy_network"):
            self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
            self.probs = tf.nn.softmax(self.logits)

            # Log-probabilities are taken straight from the logits.
            # log(softmax(x)) yields -inf once a probability underflows to 0,
            # and 0 * -inf then turns the entropy (and the loss) into NaN.
            log_probs = tf.nn.log_softmax(self.logits)

            # Sample an action
            cdist = tf.distributions.Categorical(logits=self.logits)
            self.sample_action = cdist.sample()

            # Add regularization to increase exploration
            self.entropy = -tf.reduce_sum(self.probs * log_probs, axis=1)

            # Get the predictions for the chosen actions only:
            # flatten [batch, num_actions] and index row * num_actions + action.
            batch_size = tf.shape(self.states)[0]
            gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
            self.selected_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)
            selected_action_log_probs = tf.gather(tf.reshape(log_probs, [-1]), gather_indices)

            # Objective to maximise: log pi(a|s) * advantage + reg * entropy.
            # Negate it so that an optimizer can minimise.
            objective = selected_action_log_probs * self.advantage + reg * self.entropy
            self.loss = -tf.reduce_sum(objective, name="loss")

            # training
            self.optimizer = tf.train.RMSPropOptimizer(
                learning_rate=0.00025,
                decay=0.99,
                momentum=0.0,
                epsilon=1e-6,
            )

            # we'll need these later for running gradient descent steps;
            # variables that do not influence the loss have a None gradient.
            self.grads_and_vars = [
                [grad, var]
                for grad, var in self.optimizer.compute_gradients(self.loss)
                if grad is not None
            ]
| 83 | |
| 84 | class ValueNetwork: |
nothing calls this directly
no test coverage detected