Initialize model data parallel groups. Arguments: model_parallel_size: number of GPUs used to parallelize model. Returns: Tuple of data parallel group and model parallel group. Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we use 2 GPUs to parallelize the model.
(model_parallel_size_)
| 189 | |
| 190 | # Not currently used. Helper function to create a model (tensor) parallel group. |
def _create_model_parallel(model_parallel_size_):
    """
    Build the data parallel and model parallel groups for the calling rank.

    Arguments:
        model_parallel_size_: requested number of ranks per model parallel
            group. It is clamped to the world size before use, and the
            (world size, clamped size) pair is passed through
            ``_ensure_divisibility``.

    Returns:
        Tuple ``(data_parallel_group, model_parallel_group)`` holding the
        groups whose index matches the calling rank.

    Layout, for 8 ranks g0 ... g7 and a model parallel size of 2:
        model parallel groups (consecutive ranks):
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        data parallel groups (ranks strided by the model parallel size):
            [g0, g2, g4, g6], [g1, g3, g5, g7]

    Because model parallel groups are made of adjacent ranks, the launcher
    should place adjacent ranks on the same node (e.g. with two 8-GPU boxes,
    ranks 0-7 on the first and ranks 8-15 on the second).

    ``dist.new_group`` is invoked for every group, not only the ones that
    contain the calling rank, and always in the same order: all data
    parallel groups first, then all model parallel groups.
    """
    log_dist(f'Creating model parallel group with size {model_parallel_size_}', ranks=[0])

    # Sanity checks on the distributed state before any group is created.
    assert dist.is_initialized()
    world_size = dist.get_world_size()
    # A model parallel group can never span more ranks than exist.
    mp_size = min(model_parallel_size_, world_size)
    _ensure_divisibility(world_size, mp_size)
    rank = dist.get_rank()

    # Position of this rank: which model parallel group it sits in (quotient)
    # and which data parallel group it sits in (remainder).
    own_mp_index, own_dp_index = divmod(rank, mp_size)

    # Data parallel groups: one per offset inside a model parallel group.
    data_parallel_group = None
    for dp_index in range(mp_size):
        candidate = dist.new_group(range(dp_index, world_size, mp_size))
        if dp_index == own_dp_index:
            data_parallel_group = candidate

    # Model parallel groups: consecutive blocks of mp_size ranks.
    model_parallel_group = None
    for mp_index in range(world_size // mp_size):
        first_rank = mp_index * mp_size
        candidate = dist.new_group(range(first_rank, first_rank + mp_size))
        if mp_index == own_mp_index:
            model_parallel_group = candidate

    return data_parallel_group, model_parallel_group
| 238 | |
| 239 | |
| 240 | def _create_expert_and_data_parallel(expert_parallel_size_, |
nothing calls this directly
no test coverage detected
searching dependent graphs…