MCPcopy Index your code
hub / github.com/lazyprogrammer/machine_learning_examples / __init__

Method __init__

rl2/a3c/nets.py:39–81  ·  view source on GitHub ↗
(self, num_outputs, reg=0.01)

Source from the content-addressed store, hash-verified

37
class PolicyNetwork:
    """Policy head of the A3C model: turns a batch of states into action logits.

    Constructing an instance adds the policy graph (placeholders, logits,
    sampling op, loss and gradients) to the current default TensorFlow graph.

    The feature extractor is created under the "shared" variable scope with
    ``reuse=False``, so the PolicyNetwork MUST be created before the
    ValueNetwork, which re-enters the same scope with ``reuse=True``.
    """

    def __init__(self, num_outputs, reg=0.01):
        """Build the policy graph.

        Args:
            num_outputs: Number of discrete actions (width of the logits layer).
            reg: Weight of the entropy bonus added to the objective.
        """
        self.num_outputs = num_outputs

        # Graph inputs
        # After resizing we have 4 consecutive frames of size 84 x 84
        self.states = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # Advantage = G - V(s)
        self.advantage = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Selected actions
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        # Since we set reuse=False here, that means we MUST
        # create the PolicyNetwork before creating the ValueNetwork
        # ValueNetwork will use reuse=True
        with tf.variable_scope("shared", reuse=False):
            fc1 = build_feature_extractor(self.states)

        # Use a separate scope for output and loss
        with tf.variable_scope("policy_network"):
            self.logits = tf.contrib.layers.fully_connected(fc1, num_outputs, activation_fn=None)
            self.probs = tf.nn.softmax(self.logits)

            # log(softmax(logits)) computed directly from the logits.
            # tf.log(self.probs) yields -inf once a probability underflows
            # to 0, which turns the entropy into NaN (0 * -inf) and the loss
            # into +/-inf; log_softmax stays finite for finite logits.
            log_probs = tf.nn.log_softmax(self.logits)

            # Sample an action
            cdist = tf.distributions.Categorical(logits=self.logits)
            self.sample_action = cdist.sample()

            # Add regularization to increase exploration
            self.entropy = -tf.reduce_sum(self.probs * log_probs, axis=1)

            # Get the predictions for the chosen actions only:
            # flatten the [batch, num_actions] tensors and pick element
            # row * num_actions + action for every row.
            batch_size = tf.shape(self.states)[0]
            gather_indices = tf.range(batch_size) * tf.shape(self.probs)[1] + self.actions
            self.selected_action_probs = tf.gather(tf.reshape(self.probs, [-1]), gather_indices)
            selected_action_log_probs = tf.gather(tf.reshape(log_probs, [-1]), gather_indices)

            # Per-sample objective (policy-gradient term + entropy bonus);
            # negated and summed because the optimizer minimizes.
            objective = selected_action_log_probs * self.advantage + reg * self.entropy
            self.loss = -tf.reduce_sum(objective, name="loss")

            # training
            # Positional args: learning_rate, decay, momentum, epsilon
            self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)

            # we'll need these later for running gradient descent steps
            # Variables that do not influence the loss have a None gradient
            # and are dropped.
            self.grads_and_vars = self.optimizer.compute_gradients(self.loss)
            self.grads_and_vars = [[grad, var] for grad, var in self.grads_and_vars if grad is not None]
82
83
84class ValueNetwork:

Callers

nothing calls this directly

Calls 2

build_feature_extractor — Function · 0.85
sample — Method · 0.45

Tested by

no test coverage detected