MCPcopy Index your code
hub / github.com/NVIDIA/TensorRT-LLM / __init__

Method __init__

tensorrt_llm/quantization/layers.py:2458–2580  ·  view source on GitHub ↗
(self,
                 *,
                 local_layer_idx,
                 hidden_size,
                 num_attention_heads,
                 num_kv_heads=None,
                 max_position_embeddings=1024,
                 num_layers=1,
                 apply_query_key_layer_scaling=False,
                 attention_head_size=None,
                 attention_mask_type=AttentionMaskType.padding,
                 bias=True,
                 dense_bias=None,
                 dtype=None,
                 position_embedding_type=PositionEmbeddingType.learned_absolute,
                 rotary_embedding_base=10000.0,
                 rotary_embedding_scaling=None,
                 rotary_embedding_percentage=1.0,
                 tp_group=None,
                 tp_size=1,
                 tp_rank=0,
                 scale_alibi_bias=False,
                 paged_kv_cache=False,
                 quant_mode=QuantMode(0))

Source from the content-addressed store, hash-verified

2456class SmoothQuantAttention(Module):
2457
2458 def __init__(self,
2459 *,
2460 local_layer_idx,
2461 hidden_size,
2462 num_attention_heads,
2463 num_kv_heads=None,
2464 max_position_embeddings=1024,
2465 num_layers=1,
2466 apply_query_key_layer_scaling=False,
2467 attention_head_size=None,
2468 attention_mask_type=AttentionMaskType.padding,
2469 bias=True,
2470 dense_bias=None,
2471 dtype=None,
2472 position_embedding_type=PositionEmbeddingType.learned_absolute,
2473 rotary_embedding_base=10000.0,
2474 rotary_embedding_scaling=None,
2475 rotary_embedding_percentage=1.0,
2476 tp_group=None,
2477 tp_size=1,
2478 tp_rank=0,
2479 scale_alibi_bias=False,
2480 paged_kv_cache=False,
2481 quant_mode=QuantMode(0)):
2482 super().__init__()
2483 self.local_layer_idx = local_layer_idx
2484 self.attention_mask_type = attention_mask_type
2485 self.attention_head_size = hidden_size // num_attention_heads if attention_head_size is None else attention_head_size
2486 self.num_attention_heads = num_attention_heads // tp_size
2487 self.num_kv_heads = num_kv_heads
2488 self.num_attention_kv_heads = (
2489 num_kv_heads + tp_size - 1
2490 ) // tp_size if num_kv_heads is not None else self.num_attention_heads
2491 self.hidden_size = hidden_size // tp_size
2492 self.max_position_embeddings = 0 if max_position_embeddings is None else max_position_embeddings
2493 self.tp_size = tp_size
2494 self.tp_rank = tp_rank
2495 self.dense_bias = dense_bias
2496 if dense_bias is None:
2497 self.dense_bias = bias
2498
2499 self.num_layers = num_layers
2500 self.apply_query_key_layer_scaling = apply_query_key_layer_scaling
2501 self.norm_factor = math.sqrt(self.attention_head_size)
2502 self.q_scaling = 1
2503 if self.apply_query_key_layer_scaling:
2504 self.norm_factor *= self.num_layers
2505 self.q_scaling *= self.num_layers
2506 # Whether to scale ALiBi bias. Mathematically, it's equivalent to
2507 # normalizing QK after adding bias.
2508 # - False, inv_sqrt_Dh * Q*K^T + alibi_bias
2509 # - True, inv_sqrt_Dh * Q*K^T + inv_sqrt_Dh * alibi_bias
2510 self.scale_alibi_bias = scale_alibi_bias
2511
2512 self.position_embedding_type = position_embedding_type
2513 self.paged_kv_cache = paged_kv_cache
2514
2515 self.rotary_embedding_base = rotary_embedding_base

Callers

nothing calls this directly

Calls 15

QuantMode (Class) · 0.85
generate_alibi_slopes (Function) · 0.85
Parameter (Class) · 0.85
sqrt (Method) · 0.80
is_rope (Method) · 0.80
is_alibi (Method) · 0.80
from_description (Method) · 0.80
__init__ (Method) · 0.45

Tested by

no test coverage detected