Hierarchical allreduce for DGX-1 system. Args: all_grads (K x N): List of list of gradients. N is the number of variables. devices ([str]): K str for the K devices. average (bool): average gradients or not. Returns: (K x N): same as input, but each grad is replaced by the sum over the K lists, or by their average when `average` is True.
(all_grads, devices, average=False)
| 200 | |
@under_name_scope('AllReduceGradsHierachical')
def allreduce_grads_hierarchical(all_grads, devices, average=False):
    """
    Hierarchical allreduce for DGX-1 system.

    The K devices are split into two halves (devices ``[0, K/2)`` and
    ``[K/2, K)``).  For every variable, one "main" device is picked in each
    half; each main device sums the gradients of its own half, the two partial
    sums are added together, and the total is then copied to every device
    from the main device that lives in the same half.  The pair of main
    devices rotates with the variable index so the reduction work is spread
    over all devices.

    Args:
        all_grads (K x N): List of list of gradients. N is the number of variables.
        devices ([str]): K str for the K devices. K must be 8.
        average (bool): average gradients or not.

    Returns:
        (K x N): a list of K tuples of N tensors, laid out like the input.
        Each grad is replaced by the sum over the K lists, or by the
        average over the K lists if ``average`` is True.

    Raises:
        AssertionError: if ``len(devices) != 8`` or
            ``len(all_grads) != len(devices)``.
    """
    num_gpu = len(devices)
    # The grouping below is hard-coded for two groups of four devices.
    assert num_gpu == 8, num_gpu
    assert len(all_grads) == num_gpu, len(all_grads)
    group_size = num_gpu // 2
    # Loop-invariant scale used only when averaging.
    avg_scale = 1.0 / num_gpu

    agg_all_grads = []  # N x K
    for varid, grads in enumerate(zip(*all_grads)):
        # grads: K gradients of the same variable, one per device.
        # Rotate the main devices with the variable index; the two main
        # devices are always `group_size` apart, hence in different halves.
        g0_main_gpu = varid % num_gpu
        g1_main_gpu = (g0_main_gpu + group_size) % num_gpu
        g0_start = 0 if g0_main_gpu < group_size else group_size
        g1_start = 0 if g1_main_gpu < group_size else group_size
        assert g0_start != g1_start
        g0_grads = grads[g0_start: g0_start + group_size]
        g1_grads = grads[g1_start: g1_start + group_size]

        # Step 1: each main device sums the gradients of its own half.
        with tf.device(devices[g0_main_gpu]):
            g0_agg = tf.add_n(g0_grads, name='group0_agg')

        with tf.device(devices[g1_main_gpu]):
            g1_agg = tf.add_n(g1_grads, name='group1_agg')
            # Step 2: combine both partial sums on group 1's main device ...
            g1_total_agg = tf.add(g0_agg, g1_agg, name='group1_total_agg')

        # ... and copy the total back to group 0's main device.
        with tf.device(devices[g0_main_gpu]):
            g0_total_agg = tf.identity(g1_total_agg, name='group0_total_agg')

        # Step 3: every device fetches the total from the main device that
        # is in the same half as itself.
        agg_grads = []  # K aggregated grads
        for k in range(num_gpu):
            if (k < group_size) == (g0_main_gpu < group_size):
                source_agg = g0_total_agg
            else:
                source_agg = g1_total_agg
            with tf.device(devices[k]):
                if not average:
                    device_total_agg = tf.identity(
                        source_agg, name='device{}_total_agg'.format(k))
                else:
                    # TODO where to put average?
                    device_total_agg = tf.multiply(
                        source_agg, avg_scale,
                        name='device{}_total_agg'.format(k))
                agg_grads.append(device_total_agg)

        agg_all_grads.append(agg_grads)

    # transpose
    agg_all_grads = list(zip(*agg_all_grads))  # K x Nvar
    return agg_all_grads
no test coverage detected