Class ActorCritic

2-cartpole/3-ppo.py:66–83 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

64
65	# Shared-trunk actor-critic: two-layer MLP with tanh, then policy and value heads.
class ActorCritic(nn.Module):
    """Actor-critic network whose policy and value heads share one trunk.

    The trunk is Linear -> Tanh -> Linear -> Tanh with 64 units per layer.
    Two linear heads read the trunk output: ``policy`` produces
    ``action_size`` values per input and ``value`` produces a single one.

    Every linear layer is passed through ``_ortho`` (defined elsewhere in
    this file; its return value is what gets stored as the layer) with a
    per-layer gain:

    * sqrt(2) for the two tanh trunk layers,
    * 0.01 for the policy head, which keeps the initial action
      distribution close to uniform,
    * 1.0 for the value head.

    These gains are the standard PPO-paper / CleanRL choices.
    """

    def __init__(self, state_size, action_size):
        """Build the trunk and both heads.

        Args:
            state_size: Number of input features fed to the first layer.
            action_size: Number of outputs of the policy head.
        """
        super().__init__()
        hidden_width = 64
        trunk_gain = 2 ** 0.5

        # Layers are created in the order they are applied, so any random
        # draws made during construction/initialisation happen in a fixed
        # sequence: trunk layer 1, trunk layer 2, policy head, value head.
        input_layer = _ortho(nn.Linear(state_size, hidden_width), gain=trunk_gain)
        hidden_layer = _ortho(nn.Linear(hidden_width, hidden_width), gain=trunk_gain)
        self.shared = nn.Sequential(input_layer, nn.Tanh(), hidden_layer, nn.Tanh())

        policy_head = nn.Linear(hidden_width, action_size)
        self.policy = _ortho(policy_head, gain=0.01)

        value_head = nn.Linear(hidden_width, 1)
        self.value = _ortho(value_head, gain=1.0)

    def forward(self, x):
        """Run the trunk once and evaluate both heads on its output.

        Args:
            x: Input whose last dimension is ``state_size``.

        Returns:
            A ``(policy_out, value_out)`` tuple: the raw policy-head output
            (presumably action logits -- confirm against the caller) and the
            value-head output with its trailing size-1 dimension squeezed.
        """
        features = self.shared(x)
        policy_out = self.policy(features)
        value_out = self.value(features).squeeze(-1)
        return policy_out, value_out
84
85
86	# GAE-lambda: backward recursion over the collected rollout.

3-ppo.py · File · 0.70

no outgoing calls

no test coverage detected