hub / github.com/tensorlayer/TensorLayer / PolicyNetwork

Class PolicyNetwork

examples/reinforcement_learning/tutorial_SAC.py:140–218 · view source on GitHub ↗

The network for generating non-deterministic (Gaussian-distributed) actions from the state input.

Source from the content-addressed store, hash-verified

138
139
140	class PolicyNetwork(Model):
141	""" the network for generating non-determinstic (Gaussian distributed) action from the state input """
142
def __init__(
    self, num_inputs, num_actions, hidden_dim, action_range=1., init_w=3e-3, log_std_min=-20, log_std_max=2
):
    """Build the policy trunk and its two output heads.

    Three ReLU ``Dense`` layers of width ``hidden_dim`` feed two linear
    heads with ``num_actions`` units each: one for the mean and one for
    the log standard deviation.

    Parameters
    ----------
    num_inputs : in_channels of the first hidden layer.
    num_actions : number of units in each output head.
    hidden_dim : number of units in every hidden layer.
    action_range : stored on the instance as ``self.action_range``.
    init_w : half-width of the uniform bias initializer of both heads.
        Weights of every layer use Glorot-normal initialization instead.
    log_std_min, log_std_max : stored on the instance as the clipping
        bounds for the log standard deviation.
    """
    super(PolicyNetwork, self).__init__()

    # Plain configuration values kept on the instance.
    self.action_range = action_range
    self.num_actions = num_actions
    self.log_std_min = log_std_min
    self.log_std_max = log_std_max

    # One weight initializer object is shared by every layer below.
    w_init = tf.keras.initializers.glorot_normal(seed=None)

    # Hidden trunk: identical settings except for input width and name.
    hidden_kwargs = dict(n_units=hidden_dim, act=tf.nn.relu, W_init=w_init)
    self.linear1 = Dense(in_channels=num_inputs, name='policy1', **hidden_kwargs)
    self.linear2 = Dense(in_channels=hidden_dim, name='policy2', **hidden_kwargs)
    self.linear3 = Dense(in_channels=hidden_dim, name='policy3', **hidden_kwargs)

    # Output heads: no activation; each gets its own small uniform bias initializer.
    mean_bias_init = tf.random_uniform_initializer(-init_w, init_w)
    self.mean_linear = Dense(
        n_units=num_actions,
        W_init=w_init,
        b_init=mean_bias_init,
        in_channels=hidden_dim,
        name='policy_mean',
    )
    log_std_bias_init = tf.random_uniform_initializer(-init_w, init_w)
    self.log_std_linear = Dense(
        n_units=num_actions,
        W_init=w_init,
        b_init=log_std_bias_init,
        in_channels=hidden_dim,
        name='policy_logstd',
    )
169
def forward(self, state):
    """Run the trunk and both heads on ``state``.

    Returns
    -------
    (mean, log_std) : output of the mean head, and output of the log-std
        head clipped to ``[self.log_std_min, self.log_std_max]``.
    """
    hidden = state
    for layer in (self.linear1, self.linear2, self.linear3):
        hidden = layer(hidden)

    mean = self.mean_linear(hidden)
    # Clip the raw log-std head output to the bounds stored on the instance.
    log_std = tf.clip_by_value(self.log_std_linear(hidden), self.log_std_min, self.log_std_max)

    return mean, log_std
180
181	def evaluate(self, state, epsilon=1e-6):
182	""" generate action with state for calculating gradients """
183	state = state.astype(np.float32)
184	mean, log_std = self.forward(state)
185	std = tf.math.exp(log_std) # no clip in evaluation, clip affects gradients flow
186
187	normal = Normal(0, 1)
188	z = normal.sample(mean.shape)
189	action_0 = tf.math.tanh(mean + std * z) # TanhNormal distribution as actions; reparameterization trick
190	action = self.action_range * action_0
191	# according to original paper, with an extra last term for normalizing different action range
192	log_prob = Normal(mean, std).log_prob(mean + std * z) - tf.math.log(1. - action_0**2 +
193	epsilon) - np.log(self.action_range)
194	# both dims of normal.log_prob and -log(1-a**2) are (N,dim_of_action);
195	# the Normal.log_prob outputs the same dim of input features instead of 1 dim probability,
196	# needs sum up across the dim of actions to get 1 dim probability; or else use Multivariate Normal.
197	log_prob = tf.reduce_sum(log_prob, axis=1)[:, np.newaxis] # expand dim as reduce_sum causes 1 dim reduced

Callers 1

__init__Method · 0.70

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…