MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / smooth_quant_layer_norm

Function smooth_quant_layer_norm

tensorrt_llm/quantization/functional.py:352–427  ·  view source on GitHub ↗
(input: Tensor,
                            normalized_shape: Union[int, Tuple[int]],
                            weight: Optional[Tensor] = None,
                            bias: Optional[Tensor] = None,
                            scale: Optional[Tensor] = None,
                            eps: float = 1e-05,
                            use_diff_of_squares: bool = True,
                            dynamic_act_scaling: bool = False)

Source from the content-addressed store, hash-verified

350
351# TODO: Should be renamed to layer_norm_quantize.
352def smooth_quant_layer_norm(input: Tensor,
353 normalized_shape: Union[int, Tuple[int]],
354 weight: Optional[Tensor] = None,
355 bias: Optional[Tensor] = None,
356 scale: Optional[Tensor] = None,
357 eps: float = 1e-05,
358 use_diff_of_squares: bool = True,
359 dynamic_act_scaling: bool = False) -> Tensor:
360 if not default_net().plugin_config.layernorm_quantization_plugin:
361 dtype = trt_dtype_to_np(input.dtype)
362 if weight is None:
363 weight = constant(np.ones(normalized_shape, dtype=dtype))
364 if bias is None:
365 bias = constant(np.zeros(normalized_shape, dtype=dtype))
366 result = layer_norm(input, normalized_shape, weight, bias, eps,
367 use_diff_of_squares)
368 if not dynamic_act_scaling:
369 return quantize_tensor(result, scale)
370 else:
371 return quantize_per_token(result)
372 else:
373 plg_creator = trt.get_plugin_registry().get_plugin_creator(
374 'LayernormQuantization', '1', TRT_LLM_PLUGIN_NAMESPACE)
375 assert plg_creator is not None
376
377 output_type = trt.PluginField("out_type_id",
378 np.array([int(trt.int8)], np.int32),
379 trt.PluginFieldType.INT32)
380 quant_mode = trt.PluginField(
381 "quant_mode",
382 np.array([int(QuantMode.use_smooth_quant(per_token=True))],
383 np.int32), trt.PluginFieldType.INT32)
384 eps = trt.PluginField("eps", np.array(eps, dtype=np.float32),
385 trt.PluginFieldType.FLOAT32)
386 use_diff_of_squares = trt.PluginField(
387 "use_diff_of_squares",
388 np.array([int(use_diff_of_squares)], dtype=np.int32),
389 trt.PluginFieldType.INT32)
390
391 dyn_act_scaling = trt.PluginField(
392 "dyn_act_scaling", np.array([int(dynamic_act_scaling)], np.int32),
393 trt.PluginFieldType.INT32)
394
395 p_dtype = default_net().plugin_config.layernorm_quantization_plugin
396 pf_type = trt.PluginField(
397 "type_id", np.array([int(str_dtype_to_trt(p_dtype))], np.int32),
398 trt.PluginFieldType.INT32)
399 pfc = trt.PluginFieldCollection([
400 eps, use_diff_of_squares, dyn_act_scaling, pf_type, output_type,
401 quant_mode
402 ])
403 layernorm_plug = plg_creator.create_plugin("layernorm_quantized", pfc)
404 normalized_shape = [normalized_shape] if isinstance(
405 normalized_shape, int) else normalized_shape
406 if weight is None:
407 weight = constant(
408 np.ones(normalized_shape, dtype=str_dtype_to_np(p_dtype)))
409 if bias is None:

Callers 2

forward · Method · 0.85

Calls 15

default_net · Function · 0.85
trt_dtype_to_np · Function · 0.85
constant · Function · 0.85
layer_norm · Function · 0.85
quantize_tensor · Function · 0.85
quantize_per_token · Function · 0.85
str_dtype_to_trt · Function · 0.85
cast · Function · 0.85
default_trtnet · Function · 0.85
_add_plugin_info · Function · 0.85
_create_tensor · Function · 0.85
use_smooth_quant · Method · 0.80

Tested by 1