()
| 230 | x_data = torch.randn(batch, seqlen, model.d_model, device=device, dtype=dtype) |
| 231 | |
def _run() -> dict[str, torch.Tensor]:
    """Run one forward/backward pass and return float copies of the gradients.

    Uses ``model``, ``x_data`` and ``_set_deterministic`` from the enclosing
    scope. The loss is the mean of the squared model output, computed after
    casting the output with ``.float()``.

    Returns:
        A dict holding the gradient of the input under the key ``"input"``,
        followed by one entry per named parameter whose ``.grad`` is not
        ``None``, keyed by the parameter name.
    """
    # NOTE(review): presumably switches deterministic mode off -- confirm
    # against the definition of _set_deterministic.
    _set_deterministic(False)
    # Drop gradients left over from any previous call so nothing accumulates.
    model.zero_grad(set_to_none=True)

    # Fresh leaf tensor per call, so the input gradient is populated on it
    # rather than on the shared x_data.
    inp = x_data.clone().requires_grad_(True)
    out = model(inp)
    loss = out.float().square().mean()
    loss.backward()
    torch.cuda.synchronize()

    def _snapshot(grad: torch.Tensor) -> torch.Tensor:
        # Detached float copy; clone() guarantees separate storage even when
        # float() is a no-op on the gradient's dtype.
        return grad.detach().float().clone()

    collected = {"input": _snapshot(inp.grad)}
    collected.update(
        (pname, _snapshot(param.grad))
        for pname, param in model.named_parameters()
        if param.grad is not None
    )
    return collected
| 244 | |
| 245 | _run() # warmup |
| 246 | ref = _run() |
no test coverage detected