(self, D, ft, hidden_layer_sizes=[])
| 49 | # approximates pi(a | s) |
class PolicyModel:
    """Gaussian policy network that approximates pi(a | s).

    The graph maps a batch of feature vectors to the mean and standard
    deviation of a normal distribution over a scalar action. Actions are
    drawn from that distribution and clipped to [-1, 1]. Training minimizes
    the negative sum of advantage-weighted log-probabilities plus an
    entropy bonus weighted by 0.1, using Adam with learning rate 1e-3.
    """

    def __init__(self, D, ft, hidden_layer_sizes=None):
        """Build the policy graph, its placeholders, and its train op.

        Args:
            D: Number of input features; used as the second dimension of
                the ``X`` placeholder and the input size of the first layer.
            ft: Stored unchanged on ``self.ft``; not otherwise used while
                building the graph. NOTE(review): presumably the feature
                transformer applied to raw observations - confirm against
                callers.
            hidden_layer_sizes: Iterable of hidden-layer widths, applied in
                order. ``None`` (the default) or an empty sequence means
                the output layers read the input features directly.
        """
        # A None sentinel replaces the former `[]` default so that no
        # mutable list object is shared between calls.
        if hidden_layer_sizes is None:
            hidden_layer_sizes = []

        self.ft = ft

        ##### hidden layers #####
        M1 = D
        self.hidden_layers = []
        for M2 in hidden_layer_sizes:
            layer = HiddenLayer(M1, M2)
            self.hidden_layers.append(layer)
            M1 = M2

        # final layer mean: identity activation, no bias
        # (zeros=True presumably zero-initializes the weights - confirm in
        # HiddenLayer)
        self.mean_layer = HiddenLayer(M1, 1, lambda x: x, use_bias=False, zeros=True)

        # final layer standard deviation: softplus keeps the output
        # positive. It is passed to Normal() below as the scale, so it is a
        # standard deviation, not a variance.
        self.stdv_layer = HiddenLayer(M1, 1, tf.nn.softplus, use_bias=False, zeros=False)

        # inputs and targets
        self.X = tf.placeholder(tf.float32, shape=(None, D), name='X')
        self.actions = tf.placeholder(tf.float32, shape=(None,), name='actions')
        self.advantages = tf.placeholder(tf.float32, shape=(None,), name='advantages')

        # get final hidden layer
        Z = self.X
        for layer in self.hidden_layers:
            Z = layer.forward(Z)

        # calculate output and cost
        mean = self.mean_layer.forward(Z)
        stdv = self.stdv_layer.forward(Z) + 1e-5  # smoothing: keep scale > 0

        # make them 1-D so they line up with the (None,) shaped placeholders
        mean = tf.reshape(mean, [-1])
        stdv = tf.reshape(stdv, [-1])

        norm = tf.contrib.distributions.Normal(mean, stdv)
        # sampled action, clipped to the range [-1, 1]
        self.predict_op = tf.clip_by_value(norm.sample(), -1, 1)

        log_probs = norm.log_prob(self.actions)
        # policy-gradient objective plus a 0.1-weighted entropy bonus,
        # negated because the optimizer minimizes
        cost = -tf.reduce_sum(self.advantages * log_probs + 0.1*norm.entropy())
        self.train_op = tf.train.AdamOptimizer(1e-3).minimize(cost)

    def set_session(self, session):
        """Store ``session`` on ``self.session`` for later use."""
        self.session = session
nothing calls this directly
no test coverage detected