Method _profile_bandwidth

colossalai/auto_parallel/offload/solver.py:164–200 · view source on GitHub ↗

Profile the bidirectional communication bandwidth between CPU and GPU using data volumes ranging from 1KB to 1GB.

(self)

Source from the content-addressed store, hash-verified

162	raise TypeError(f"Unknown NVIDIA GPU device name {device_name}")
163
def _profile_bandwidth(self):
    """
    Measure the CPU<->GPU copy bandwidth in both directions.

    For each link ("h2d" first, then "d2h") a source/destination pair of
    int8 tensors is allocated at 21 sizes, starting at 1KB and doubling up
    to 1GB. The copy between them is timed through ``benchmark_func`` and a
    progress line is printed for every size.

    Returns:
        dict: ``{link: {size: bandwidth}}`` where ``size`` is the number of
        int8 elements (i.e. bytes) copied and ``bandwidth`` is that size
        divided by the value returned by ``benchmark_func``.
    """

    print("profiling bandwidth ......")

    # Host-side buffers are pinned; device-side buffers are placed on "cuda".
    host_kwargs = {"pin_memory": True}
    device_kwargs = {"device": "cuda"}
    # link name -> (source placement, destination placement)
    endpoints = {
        "h2d": (host_kwargs, device_kwargs),
        "d2h": (device_kwargs, host_kwargs),
    }

    link_to_bandwidth = {}
    for link, (src_kwargs, dst_kwargs) in endpoints.items():
        size_to_bandwidth = {}

        # 1024 * 2**k for k = 0..20, i.e. from 1KB to 1GB
        for n_bytes in (1024 << k for k in range(21)):
            src_tensor = torch.ones(n_bytes, dtype=torch.int8, **src_kwargs)
            dst_tensor = torch.ones(n_bytes, dtype=torch.int8, **dst_kwargs)

            # Invoked by benchmark_func within this same iteration, so the
            # closure always sees the tensors allocated just above.
            def copy_once():
                dst_tensor.copy_(src_tensor)

            elapsed = benchmark_func(copy_once, number=5, repeat=3)
            bandwidth = n_bytes / elapsed
            size_to_bandwidth[n_bytes] = bandwidth

            print(
                f"size: {n_bytes / 1024 ** 2:.3f} MB, "
                f"{src_tensor.device.type}-to-{dst_tensor.device.type} "
                f"bandwidth: {bandwidth / 1024 ** 3:.3f} GB/s"
            )

        link_to_bandwidth[link] = size_to_bandwidth

    return link_to_bandwidth
201
202
203	class SynGreedySolver(Solver):

__init__ · Method · 0.95

benchmark_func · Function · 0.85

no test coverage detected