(tensor, size, start_event, end_event)
| 205 | |
| 206 | |
def timed_allreduce(tensor, size, start_event, end_event):
    """Time a single ``dist.all_reduce`` and return the rank-averaged algbw.

    Args:
        tensor: Tensor passed to ``dist.all_reduce``; the collective reduces
            it in place.
        size: Numerator of the bandwidth figure (presumably the payload size
            of ``tensor`` in bytes -- TODO confirm against the caller).
        start_event: Timing event recorded right before the collective. It
            must provide ``record()`` and ``elapsed_time()``, e.g. a
            ``torch.cuda.Event`` created with ``enable_timing=True``.
        end_event: Timing event recorded right after the collective.

    Returns:
        A one-element tensor holding ``size / duration`` summed over all
        ranks onto rank 0 and divided by the world size. Because the
        reduction targets ``dst=0``, only rank 0 holds the true mean; the
        value on other ranks should not be relied on.
    """
    # Line up all ranks first so the measurement starts from a common point.
    dist.barrier()

    start_event.record()
    dist.all_reduce(tensor)
    end_event.record()

    # The events are only readable once the recorded work has completed.
    torch.cuda.synchronize()

    # elapsed_time() reports milliseconds; convert to seconds.
    duration = start_event.elapsed_time(end_event) / 1000

    # note that this is following the same math as NVIDIA/nccl-tests
    #
    # Allocate the result on the device of the tensor that was just reduced
    # instead of depending on a module-level ``local_rank`` global, which is
    # not a parameter of this function and may not exist when the module is
    # imported rather than executed as a script.
    algbw = torch.tensor([size / duration], device=tensor.device)

    # calculate mean across all ranks
    dist.reduce(algbw, dst=0, op=dist.ReduceOp.SUM)
    algbw /= dist.get_world_size()

    return algbw
| 224 | |
| 225 | def run(local_rank): |
| 226 |
no test coverage detected