| 64 | |
| 65 | # Shared-trunk actor-critic: two-layer MLP with tanh, then policy and value heads. |
class ActorCritic(nn.Module):
    """Shared-trunk actor-critic network.

    A two-layer tanh MLP (``self.shared``) feeds two linear heads:
    ``self.policy`` with ``action_size`` outputs and ``self.value`` with a
    single output.

    Args:
        state_size: Number of input features of the first trunk layer.
        action_size: Number of outputs of the policy head.
        hidden_size: Width of both trunk layers. Defaults to 64, the width
            that used to be hard-coded, so existing callers and checkpoints
            built with the default are unaffected.
    """

    def __init__(self, state_size, action_size, hidden_size=64):
        super().__init__()
        # Every layer is passed through `_ortho` (defined elsewhere in this
        # file; presumably orthogonal initialisation -- confirm) with a
        # per-layer gain:
        #   * sqrt(2) for the tanh trunk,
        #   * 0.01 for the policy head (keeps the initial action
        #     distribution close to uniform),
        #   * 1 for the value head.
        # These are the standard PPO-paper / CleanRL choices.
        #
        # Layers are built in the same order as before (trunk, policy,
        # value) so that seeded initialisation stays reproducible.
        trunk_gain = 2 ** 0.5
        self.shared = nn.Sequential(
            _ortho(nn.Linear(state_size, hidden_size), gain=trunk_gain),
            nn.Tanh(),
            _ortho(nn.Linear(hidden_size, hidden_size), gain=trunk_gain),
            nn.Tanh(),
        )
        self.policy = _ortho(nn.Linear(hidden_size, action_size), gain=0.01)
        self.value = _ortho(nn.Linear(hidden_size, 1), gain=1.0)

    def forward(self, x):
        """Run the trunk once and evaluate both heads on its output.

        Args:
            x: Input tensor whose last dimension is ``state_size``.

        Returns:
            A ``(policy_out, value)`` tuple. ``policy_out`` is the raw
            policy-head output with last dimension ``action_size``
            (presumably action logits -- confirm against the caller).
            ``value`` is the value-head output with its trailing size-1
            dimension squeezed away.
        """
        h = self.shared(x)
        return self.policy(h), self.value(h).squeeze(-1)
| 84 | |
| 85 | |
| 86 | # GAE-lambda: backward recursion over the collected rollout. |