| 58 | |
| 59 | |
def compute_gae(rewards, values, dones, last_value, *, gamma=None, gae_lambda=None):
    """Compute Generalized Advantage Estimation (GAE) advantages and returns.

    The rollout is walked backwards, accumulating the recurrence
    ``gae_t = delta_t + gamma * gae_lambda * (1 - dones[t]) * gae_{t+1}``
    where ``delta_t = rewards[t] + gamma * V_{t+1} * (1 - dones[t]) - values[t]``.

    Args:
        rewards: Per-step rewards, indexed by time step along the first axis.
        values: Per-step value estimates, same length as ``rewards``.
        dones: Per-step termination indicators; a truthy/1 entry at step ``t``
            removes both the bootstrap value and the carried advantage from
            steps after ``t``.
        last_value: Value estimate used as ``V_{t+1}`` for the final step.
        gamma: Discount factor. Defaults to the module-level ``GAMMA``.
        gae_lambda: GAE smoothing factor. Defaults to the module-level
            ``GAE_LAMBDA``.

    Returns:
        A tuple ``(advantages, returns)``. ``advantages`` is a float32 array
        shaped like ``rewards``; ``returns`` is ``advantages + values``.
    """
    # Fall back to the module-wide hyperparameters only when no override is
    # given, so existing positional callers keep their behaviour.
    if gamma is None:
        gamma = GAMMA
    if gae_lambda is None:
        gae_lambda = GAE_LAMBDA

    num_steps = len(rewards)
    advantages = np.zeros_like(rewards, dtype=np.float32)
    # Loop-invariant decay applied to the advantage carried from step t + 1.
    decay = gamma * gae_lambda

    gae = 0.0
    for t in reversed(range(num_steps)):
        # The final step bootstraps from last_value; earlier steps use the
        # next stored value estimate.
        next_value = last_value if t == num_steps - 1 else values[t + 1]
        # Zero when step t is terminal, which blocks any contribution from
        # the steps that follow it.
        next_nonterminal = 1.0 - dones[t]
        delta = rewards[t] + gamma * next_value * next_nonterminal - values[t]
        gae = delta + decay * next_nonterminal * gae
        advantages[t] = gae

    returns = advantages + values
    return advantages, returns
| 71 | |
| 72 | |
| 73 | if __name__ == "__main__": |