Function run

network/benchmarks/all_reduce_latency_comp.py:52–72 · view source on GitHub ↗

(local_rank)

Source from the content-addressed store, hash-verified

50
51
def run(local_rank):
    """Time one large all-reduce against many small ones, once per trial.

    Two random float32 tensors are created on CUDA device ``local_rank``:
    a full one of shape ``(N, M)`` and a small one of shape
    ``(N // chunks, M)``. Each of the ``TRIALS`` iterations first waits on
    ``dist.barrier()``, then passes the full tensor to ``timed_allreduce``
    with a count of 1, followed by the small tensor with a count of
    ``chunks``. Global rank 0 prints a header line before each call.

    ``dist.get_rank()`` and ``dist.barrier()`` are called here, so the
    default process group must already be initialized by the caller.

    Args:
        local_rank: Index of the CUDA device this process allocates its
            tensors on; also used in the ``hostname:local_rank`` label
            handed to ``timed_allreduce``.
    """
    # Label identifying this process; named so it does not shadow the
    # builtin ``id``.
    proc_label = f"{socket.gethostname()}:{local_rank}"
    global_rank = dist.get_rank()

    chunks = 1000
    # Payload size reported in the headers: N * M elements, 4 bytes each
    # (float32).
    payload_bytes = N * M * 4

    mat1 = torch.rand(N, M, dtype=torch.float32).cuda(local_rank)
    # Integer floor division avoids the float round-trip of int(N / chunks).
    mat2 = torch.rand(N // chunks, M, dtype=torch.float32).cuda(local_rank)

    # The same pair of timing events is reused for every measurement.
    start_event = torch.cuda.Event(enable_timing=True)
    end_event = torch.cuda.Event(enable_timing=True)

    for _ in range(TRIALS):
        # Line all ranks up so each trial starts from a common point.
        dist.barrier()

        if global_rank == 0:
            print(f"\n\n\n----------- 1x {payload_bytes/1e9}GB ----------------")
        timed_allreduce(mat1, 1, proc_label, start_event, end_event)

        if global_rank == 0:
            print(f"\n\n\n----------- {chunks}x {(payload_bytes/chunks)/1e9}GB ----------------")
        # NOTE(review): the second argument is presumably the number of
        # repeated all-reduce calls -- confirm against timed_allreduce.
        timed_allreduce(mat2, chunks, proc_label, start_event, end_event)
73
74	def init_processes(local_rank, fn, backend='nccl'):
75	torch.cuda.set_device(local_rank)

nothing calls this directly

print — Function · 0.85

timed_allreduce — Function · 0.70

no test coverage detected