Function timed_allreduce

network/benchmarks/all_reduce_bench.py:207–223 · view source on GitHub ↗

(tensor, size, start_event, end_event)

Source from the content-addressed store, hash-verified

205
206
def timed_allreduce(tensor, size, start_event, end_event):
    """Time a single ``dist.all_reduce`` of *tensor* and return the mean algbw.

    Args:
        tensor: Tensor handed to ``dist.all_reduce``; it is reduced in place.
            The result tensor is created on ``tensor.device``.
        size: Payload size used as the numerator of the bandwidth figure
            (presumably in bytes -- confirm against the caller).
        start_event: Event recorded immediately before the all-reduce.
        end_event: Event recorded immediately after the all-reduce.
            Both events must support ``record()`` and ``elapsed_time()``
            (presumably ``torch.cuda.Event(enable_timing=True)`` -- confirm
            against the caller).

    Returns:
        A one-element tensor holding ``size / duration`` summed over all
        ranks and divided by the world size. Because the sum is gathered
        with ``dist.reduce(..., dst=0)``, only rank 0 holds the true mean;
        the value on other ranks should not be relied upon.
    """
    # Line every rank up so the timed region starts together.
    dist.barrier()
    start_event.record()
    dist.all_reduce(tensor)
    end_event.record()
    # Both events must have completed before elapsed_time() can be read.
    torch.cuda.synchronize()
    # elapsed_time() reports milliseconds; convert to seconds.
    duration = start_event.elapsed_time(end_event) / 1000

    world_size = dist.get_world_size()
    # note that this is following the same math as NVIDIA/nccl-tests
    # Place the result next to the benchmarked tensor instead of relying on
    # an implicit module-level `local_rank` global, which is not guaranteed
    # to exist when this function is called from outside the script's main.
    algbw = torch.tensor([size / duration], device=tensor.device)

    # calculate mean across all ranks (sum lands on rank 0, then divide)
    dist.reduce(algbw, dst=0, op=dist.ReduceOp.SUM)
    algbw /= world_size

    return algbw
224
225	def run(local_rank):
226

runFunction · 0.70

recordMethod · 0.80

elapsed_timeMethod · 0.80

synchronizeMethod · 0.45

no test coverage detected