Get the norm. Arguments: gradients (Iterable[Tensor]): The gradient value. parameters (Iterable[Tensor]): The parameter each gradient corresponds to. norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. Returns: Total norm of the parameters, need total_norm**(1/norm) before using.
(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2)
| 213 | |
| 214 | |
| 215 | def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2): |
| 216 | """Get the norm |
| 217 | Arguments: |
| 218 | gradients (Iterable[Tensor]): The gradient value. |
| 219 | parameters (Iterable[Tensor]): The parameter each gradient corresponds to. |
| 220 | norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for |
| 221 | infinity norm. |
| 222 | |
| 223 | Returns: |
| 224 | Total norm of the parameters, need total_norm**(1/norm) before using. |
| 225 | """ |
| 226 | |
| 227 | enable_cuda_kernels = gradients[0].device.type == "cuda" |
| 228 | # Norm parameters. |
| 229 | norm_type = float(norm_type) |
| 230 | |
| 231 | # Calculate norm. |
| 232 | if norm_type == inf: |
| 233 | total_norm = max(g.data.abs().max() for g in gradients) |
| 234 | total_norm_cuda = torch.FloatTensor([float(total_norm)], device=gradients[0].device) |
| 235 | |
| 236 | if last_stage is False: |
| 237 | return total_norm_cuda |
| 238 | |
| 239 | if previous_norm is not None: |
| 240 | total_norm_cuda = max(total_norm_cuda, previous_norm) |
| 241 | |
| 242 | # Take max across all model-parallel GPUs. |
| 243 | if gpc.get_world_size(ParallelMode.MODEL) > 1: |
| 244 | dist.all_reduce( |
| 245 | total_norm_cuda, |
| 246 | op=dist.ReduceOp.MAX, |
| 247 | group=gpc.get_group(ParallelMode.MODEL), |
| 248 | ) |
| 249 | total_norm = total_norm_cuda[0].item() |
| 250 | else: |
| 251 | tensor_parallel_grads = [] |
| 252 | for g, p in zip(gradients, parameters): |
| 253 | # TODO: consider the pipeline shared parameter |
| 254 | if ( |
| 255 | gpc.is_initialized(ParallelMode.PIPELINE) |
| 256 | and hasattr(p, "pipeline_shared_module_pg") |
| 257 | and dist.get_rank(p.pipeline_shared_module_pg) == 0 |
| 258 | ): # if shared between different pipe, only count o |
| 259 | tensor_parallel_grads.append(g.data.float()) |
| 260 | elif ( |
| 261 | gpc.is_initialized(ParallelMode.PIPELINE) |
| 262 | and hasattr(p, "pipeline_shared_module_pg") |
| 263 | and dist.get_rank(p.pipeline_shared_module_pg) != 0 |
| 264 | ): |
| 265 | continue |
| 266 | elif ( |
| 267 | gpc.is_initialized(ParallelMode.TENSOR) |
| 268 | and not is_model_parallel_parameter(p) |
| 269 | and gpc.get_local_rank(ParallelMode.TENSOR) == 0 |
| 270 | ): # if not used in each chunk, such as layernorm |
| 271 | tensor_parallel_grads.append(g.data.float()) |
| 272 | elif is_model_parallel_parameter(p): |
no test coverage detected