(local_rank)
| 50 | |
| 51 | |
def run(local_rank):
    """Time all-reduce on one large tensor and on a 1/``chunks``-sized tensor.

    Creates an ``N x M`` float32 tensor and an ``(N // chunks) x M`` float32
    tensor on CUDA device ``local_rank``. For each of ``TRIALS`` iterations it
    calls ``dist.barrier()`` and then hands each tensor to ``timed_allreduce``
    together with a ``"<hostname>:<local_rank>"`` label and a shared pair of
    CUDA timing events. Global rank 0 prints a size banner before each call.

    ``N``, ``M``, ``TRIALS`` and ``timed_allreduce`` are module-level names;
    ``dist`` is presumably ``torch.distributed`` with an already initialised
    process group -- confirm against the file's imports and launcher.

    Args:
        local_rank: Integer index of the CUDA device used by this process.
    """
    # Label identifying this process; named to avoid shadowing builtin `id`.
    rank_label = f"{socket.gethostname()}:{local_rank}"
    global_rank = dist.get_rank()

    chunks = 1000
    # Allocate directly on the GPU rather than building the tensors on the
    # host and copying them over: the large tensor is N*M*4 bytes, and the
    # host-side staging copy is pure overhead for a benchmark.
    device = torch.device("cuda", local_rank)
    mat1 = torch.rand(N, M, dtype=torch.float32, device=device)
    mat2 = torch.rand(N // chunks, M, dtype=torch.float32, device=device)

    # One pair of timing events is reused for every measurement.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)
    for _ in range(TRIALS):
        dist.barrier()

        # timed_allreduce runs on every rank (presumably it issues a
        # collective -- confirm); only the banner is limited to rank 0.
        # Sizes are in GB, counting 4 bytes per float32 element.
        if global_rank == 0:
            print(f"\n\n\n----------- 1x {N*M*4/1e9}GB ----------------")
        timed_allreduce(mat1, 1, rank_label, start_event, end_event)

        if global_rank == 0:
            print(f"\n\n\n----------- {chunks}x {(N*M*4/chunks)/1e9}GB ----------------")
        timed_allreduce(mat2, chunks, rank_label, start_event, end_event)
| 73 | |
| 74 | def init_processes(local_rank, fn, backend='nccl'): |
| 75 | torch.cuda.set_device(local_rank) |
nothing calls this directly
no test coverage detected