MCPcopy Index your code
hub / github.com/tensorpack/tensorpack / allreduce_grads_hierarchical

Function allreduce_grads_hierarchical

tensorpack/graph_builder/utils.py:202–260  ·  view source on GitHub ↗

Hierarchical allreduce for DGX-1 system. Args: all_grads (K x N): List of list of gradients. N is the number of variables. devices ([str]): K str for the K devices. average (bool): average gradients or not. Returns: (K x N): same as input, but each grad is replaced by the sum (or, if average is True, the average) over the K lists.

(all_grads, devices, average=False)

Source from the content-addressed store, hash-verified

200
@under_name_scope('AllReduceGradsHierachical')
def allreduce_grads_hierarchical(all_grads, devices, average=False):
    """
    Hierarchical allreduce for DGX-1 system.

    The 8 devices are treated as two halves of 4 (indices 0-3 and 4-7).
    For every variable a "main" device is picked in each half, rotating with
    the variable index. Gradients are first summed inside each half on that
    half's main device, the two partial sums are then added on one main
    device, the total is copied to the other main device, and finally every
    device reads the total from the main device of its own half.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        devices ([str]): K str for the K devices. K must be 8.
        average (bool): average gradients or not.

    Returns:
        (K x N): same as input, but each grad is replaced by the sum over
        the K lists, or by the average over the K lists if ``average`` is True.
        The result is a list of K tuples, each holding N tensors.
    """
    num_gpu = len(devices)
    assert num_gpu == 8, num_gpu
    assert len(all_grads) == num_gpu, len(all_grads)
    group_size = num_gpu // 2
    # Loop-invariant scaling factor, only used when `average` is True.
    scale = 1.0 / num_gpu

    agg_all_grads = []  # N x K
    for varid, grads in enumerate(zip(*all_grads)):
        # grads: K gradients
        # Rotate the main devices with the variable index so that the
        # aggregation work is not always placed on the same device.
        g0_main_gpu = varid % num_gpu
        g1_main_gpu = (g0_main_gpu + group_size) % num_gpu
        # Start index of the half each main device belongs to.
        g0_start = 0 if g0_main_gpu < group_size else group_size
        g1_start = 0 if g1_main_gpu < group_size else group_size
        assert g0_start != g1_start
        g0_grads = grads[g0_start: g0_start + group_size]
        g1_grads = grads[g1_start: g1_start + group_size]

        # Stage 1: partial sum inside the first half.
        with tf.device(devices[g0_main_gpu]):
            g0_agg = tf.add_n(g0_grads, name='group0_agg')

        # Stage 2: partial sum inside the second half, then the full total.
        with tf.device(devices[g1_main_gpu]):
            g1_agg = tf.add_n(g1_grads, name='group1_agg')
            g1_total_agg = tf.add(g0_agg, g1_agg, name='group1_total_agg')

        # Stage 3: copy the total back to the first half's main device.
        with tf.device(devices[g0_main_gpu]):
            g0_total_agg = tf.identity(g1_total_agg, name='group0_total_agg')

        # Stage 4: every device fetches the total from the main device
        # that lives in the same half as itself.
        agg_grads = []  # K aggregated grads
        for k in range(num_gpu):
            if (k < group_size) == (g0_main_gpu < group_size):
                main_gpu = g0_total_agg
            else:
                main_gpu = g1_total_agg
            with tf.device(devices[k]):
                if not average:
                    device_total_agg = tf.identity(
                        main_gpu, name='device{}_total_agg'.format(k))
                else:
                    # TODO where to put average?
                    device_total_agg = tf.multiply(
                        main_gpu, scale, name='device{}_total_agg'.format(k))
                agg_grads.append(device_total_agg)

        agg_all_grads.append(agg_grads)

    # transpose
    agg_all_grads = list(zip(*agg_all_grads))  # K x Nvar
    return agg_all_grads

Callers 1

do_allreduceMethod · 0.85

Calls 4

deviceMethod · 0.80
formatMethod · 0.80
appendMethod · 0.80
addMethod · 0.45

Tested by

no test coverage detected