MCPcopy
hub / github.com/InternLM/InternLM / compute_norm

Function compute_norm

internlm/solver/optimizer/utils.py:215–317  ·  view source on GitHub ↗

Get the norm. Arguments: gradients (Iterable[Tensor]): The gradient value. parameters (Iterable[Tensor]): The parameter each gradient corresponds to. norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. Returns: Total norm of the parameters; apply total_norm**(1/norm) before using.

(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2)

Source from the content-addressed store, hash-verified

213
214
215def compute_norm(gradients, parameters, last_stage=False, previous_norm=None, norm_type=2):
216 """Get the norm
217 Arguments:
218 gradients (Iterable[Tensor]): The gradient value.
219 parameters (Iterable[Tensor]): The parameter each gradient corresponds to.
220 norm_type (float or int): type of the used p-norm. Can be ``'inf'`` for
221 infinity norm.
222
223 Returns:
224 Total norm of the parameters, need total_norm**(1/norm) before using.
225 """
226
227 enable_cuda_kernels = gradients[0].device.type == "cuda"
228 # Norm parameters.
229 norm_type = float(norm_type)
230
231 # Calculate norm.
232 if norm_type == inf:
233 total_norm = max(g.data.abs().max() for g in gradients)
234 total_norm_cuda = torch.FloatTensor([float(total_norm)], device=gradients[0].device)
235
236 if last_stage is False:
237 return total_norm_cuda
238
239 if previous_norm is not None:
240 total_norm_cuda = max(total_norm_cuda, previous_norm)
241
242 # Take max across all model-parallel GPUs.
243 if gpc.get_world_size(ParallelMode.MODEL) > 1:
244 dist.all_reduce(
245 total_norm_cuda,
246 op=dist.ReduceOp.MAX,
247 group=gpc.get_group(ParallelMode.MODEL),
248 )
249 total_norm = total_norm_cuda[0].item()
250 else:
251 tensor_parallel_grads = []
252 for g, p in zip(gradients, parameters):
253 # TODO: consider the pipeline shared parameter
254 if (
255 gpc.is_initialized(ParallelMode.PIPELINE)
256 and hasattr(p, "pipeline_shared_module_pg")
257 and dist.get_rank(p.pipeline_shared_module_pg) == 0
258 ): # if shared between different pipe, only count o
259 tensor_parallel_grads.append(g.data.float())
260 elif (
261 gpc.is_initialized(ParallelMode.PIPELINE)
262 and hasattr(p, "pipeline_shared_module_pg")
263 and dist.get_rank(p.pipeline_shared_module_pg) != 0
264 ):
265 continue
266 elif (
267 gpc.is_initialized(ParallelMode.TENSOR)
268 and not is_model_parallel_parameter(p)
269 and gpc.get_local_rank(ParallelMode.TENSOR) == 0
270 ): # if not used in each chunk, such as layernorm
271 tensor_parallel_grads.append(g.data.float())
272 elif is_model_parallel_parameter(p):

Callers 1

Calls 9

get_tensor_norm — Function · 0.90
move_norm_to_cuda — Function · 0.90
calc_l2_norm — Function · 0.85
calc_lp — Function · 0.85
get_world_size — Method · 0.80
get_group — Method · 0.80
is_initialized — Method · 0.80
get_local_rank — Method · 0.80

Tested by

no test coverage detected