The network for generating non-deterministic (Gaussian-distributed) actions from the state input.
| 138 | |
| 139 | |
| 140 | class PolicyNetwork(Model): |
| 141 | """ the network for generating non-determinstic (Gaussian distributed) action from the state input """ |
| 142 | |
def __init__(
    self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2
):
    """ build three ReLU hidden layers followed by a mean head and a log-std head

    :param num_inputs: ``in_channels`` of the first hidden layer
    :param num_actions: number of units of the mean head and of the log-std head
    :param hidden_dim: number of units of each hidden layer
    :param action_range: stored as ``self.action_range``
    :param init_w: biases of both heads are drawn uniformly from ``[-init_w, init_w]``
    :param log_std_min: stored as ``self.log_std_min``
    :param log_std_max: stored as ``self.log_std_max``
    """
    super(PolicyNetwork, self).__init__()

    # plain configuration values kept on the instance
    self.num_actions = num_actions
    self.action_range = action_range
    self.log_std_min, self.log_std_max = log_std_min, log_std_max

    # every layer's weight matrix uses the same Glorot-normal initializer
    kernel_init = tf.keras.initializers.glorot_normal(seed=None)

    def hidden_layer(fan_in, layer_name):
        # fully connected ReLU layer with hidden_dim units
        return Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=kernel_init, in_channels=fan_in, name=layer_name)

    def output_head(layer_name):
        # linear head with num_actions units; a fresh uniform bias initializer is created per head
        return Dense(
            n_units=num_actions, W_init=kernel_init, b_init=tf.random_uniform_initializer(-init_w, init_w),
            in_channels=hidden_dim, name=layer_name
        )

    # layers are created in the same order as they are applied
    self.linear1 = hidden_layer(num_inputs, 'policy1')
    self.linear2 = hidden_layer(hidden_dim, 'policy2')
    self.linear3 = hidden_layer(hidden_dim, 'policy3')

    self.mean_linear = output_head('policy_mean')
    self.log_std_linear = output_head('policy_logstd')
| 169 | |
def forward(self, state):
    """ pass the state through the three hidden layers and both heads; returns (mean, clipped log_std) """
    features = self.linear3(self.linear2(self.linear1(state)))

    mean = self.mean_linear(features)
    # the log-std head output is clipped into [self.log_std_min, self.log_std_max]
    log_std = tf.clip_by_value(self.log_std_linear(features), self.log_std_min, self.log_std_max)

    return mean, log_std
| 180 | |
| 181 | def evaluate(self, state, epsilon=1e-6): |
| 182 | """ generate action with state for calculating gradients """ |
| 183 | state = state.astype(np.float32) |
| 184 | mean, log_std = self.forward(state) |
| 185 | std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow |
| 186 | |
| 187 | normal = Normal(0, 1) |
| 188 | z = normal.sample(mean.shape) |
| 189 | action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick |
| 190 | action = self.action_range * action_0 |
| 191 | # according to original paper, with an extra last term for normalizing different action range |
| 192 | log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0**2 + |
| 193 | epsilon) - np.log(self.action_range) |
| 194 | # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action); |
| 195 | # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability, |
| 196 | # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal. |
| 197 | log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced |
no outgoing calls
no test coverage detected
searching dependent graphs…