MCPcopy Index your code
hub / github.com/tensorlayer/TensorLayer / PolicyNetwork

Class PolicyNetwork

examples/reinforcement_learning/tutorial_SAC.py:140–218  ·  view source on GitHub ↗

The network for generating non-deterministic (Gaussian-distributed) actions from the state input.

Source from the content-addressed store, hash-verified

138
139
140class PolicyNetwork(Model):
141 """ the network for generating non-determinstic (Gaussian distributed) action from the state input """
142
def __init__(
    self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2
):
    """ build a three-layer ReLU trunk followed by two linear heads (mean and log-std)

    :param num_inputs: number of input channels of the first hidden layer
    :param num_actions: number of units in each of the two output heads
    :param hidden_dim: number of units in every hidden layer
    :param action_range: factor kept on the instance for scaling the squashed action
    :param init_w: half-width of the uniform range used to initialise the head biases
    :param log_std_min: lower bound applied to the log-std output in forward()
    :param log_std_max: upper bound applied to the log-std output in forward()
    """
    super(PolicyNetwork, self).__init__()

    # plain configuration values kept on the instance
    self.log_std_min, self.log_std_max = log_std_min, log_std_max
    self.action_range = action_range
    self.num_actions = num_actions

    # a single Glorot-normal initializer object is shared by all weight matrices
    glorot = tf.keras.initializers.glorot_normal(seed=None)

    def make_hidden(layer_name, fan_in):
        # hidden layers differ only in their input width and name
        return Dense(n_units=hidden_dim, act=tf.nn.relu, W_init=glorot, in_channels=fan_in, name=layer_name)

    def make_head(layer_name):
        # each head gets its own freshly created uniform bias initializer
        bias_init = tf.random_uniform_initializer(-init_w, init_w)
        return Dense(
            n_units=num_actions, W_init=glorot, b_init=bias_init, in_channels=hidden_dim, name=layer_name
        )

    # layers are created in the same order as before: trunk first, then mean, then log-std
    self.linear1 = make_hidden('policy1', num_inputs)
    self.linear2 = make_hidden('policy2', hidden_dim)
    self.linear3 = make_hidden('policy3', hidden_dim)

    self.mean_linear = make_head('policy_mean')
    self.log_std_linear = make_head('policy_logstd')
169
def forward(self, state):
    """ run the state through the trunk and return (mean, clipped log_std) """
    features = state
    for layer in (self.linear1, self.linear2, self.linear3):
        features = layer(features)

    mean = self.mean_linear(features)
    # keep log_std inside [log_std_min, log_std_max]
    log_std = tf.clip_by_value(self.log_std_linear(features), self.log_std_min, self.log_std_max)

    return mean, log_std
180
181 def evaluate(self, state, epsilon=1e-6):
182 """ generate action with state for calculating gradients """
183 state = state.astype(np.float32)
184 mean, log_std = self.forward(state)
185 std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow
186
187 normal = Normal(0, 1)
188 z = normal.sample(mean.shape)
189 action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick
190 action = self.action_range * action_0
191 # according to original paper, with an extra last term for normalizing different action range
192 log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0**2 +
193 epsilon) - np.log(self.action_range)
194 # both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action);
195 # the Normal.log_prob outputs the same dim of input features instead of 1 dim probability,
196 # needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal.
197 log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced

Callers 1

__init__Method · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…