(self,
*,
local_layer_idx,
hidden_size,
num_attention_heads,
num_kv_heads=None,
max_position_embeddings=1024,
num_layers=1,
apply_query_key_layer_scaling=False,
attention_head_size=None,
attention_mask_type=AttentionMaskType.padding,
bias=True,
dense_bias=None,
dtype=None,
position_embedding_type=PositionEmbeddingType.learned_absolute,
rotary_embedding_base=10000.0,
rotary_embedding_scaling=None,
rotary_embedding_percentage=1.0,
tp_group=None,
tp_size=1,
tp_rank=0,
scale_alibi_bias=False,
paged_kv_cache=False,
quant_mode=QuantMode(0))
| 2456 | class SmoothQuantAttention(Module): |
| 2457 | |
| 2458 | def __init__(self, |
| 2459 | *, |
| 2460 | local_layer_idx, |
| 2461 | hidden_size, |
| 2462 | num_attention_heads, |
| 2463 | num_kv_heads=None, |
| 2464 | max_position_embeddings=1024, |
| 2465 | num_layers=1, |
| 2466 | apply_query_key_layer_scaling=False, |
| 2467 | attention_head_size=None, |
| 2468 | attention_mask_type=AttentionMaskType.padding, |
| 2469 | bias=True, |
| 2470 | dense_bias=None, |
| 2471 | dtype=None, |
| 2472 | position_embedding_type=PositionEmbeddingType.learned_absolute, |
| 2473 | rotary_embedding_base=10000.0, |
| 2474 | rotary_embedding_scaling=None, |
| 2475 | rotary_embedding_percentage=1.0, |
| 2476 | tp_group=None, |
| 2477 | tp_size=1, |
| 2478 | tp_rank=0, |
| 2479 | scale_alibi_bias=False, |
| 2480 | paged_kv_cache=False, |
| 2481 | quant_mode=QuantMode(0)): |
| 2482 | super().__init__() |
| 2483 | self.local_layer_idx = local_layer_idx |
| 2484 | self.attention_mask_type = attention_mask_type |
| 2485 | self.attention_head_size = hidden_size // num_attention_heads if attention_head_size is None else attention_head_size |
| 2486 | self.num_attention_heads = num_attention_heads // tp_size |
| 2487 | self.num_kv_heads = num_kv_heads |
| 2488 | self.num_attention_kv_heads = ( |
| 2489 | num_kv_heads + tp_size - 1 |
| 2490 | ) // tp_size if num_kv_heads is not None else self.num_attention_heads |
| 2491 | self.hidden_size = hidden_size // tp_size |
| 2492 | self.max_position_embeddings = 0 if max_position_embeddings is None else max_position_embeddings |
| 2493 | self.tp_size = tp_size |
| 2494 | self.tp_rank = tp_rank |
| 2495 | self.dense_bias = dense_bias |
| 2496 | if dense_bias is None: |
| 2497 | self.dense_bias = bias |
| 2498 | |
| 2499 | self.num_layers = num_layers |
| 2500 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling |
| 2501 | self.norm_factor = math.sqrt(self.attention_head_size) |
| 2502 | self.q_scaling = 1 |
| 2503 | if self.apply_query_key_layer_scaling: |
| 2504 | self.norm_factor *= self.num_layers |
| 2505 | self.q_scaling *= self.num_layers |
| 2506 | # Whether to scale ALiBi bias. Mathematically, it's equivalent to |
| 2507 | # normalizing QK after adding bias. |
| 2508 | # - False, inv_sqrt_Dh * Q*K^T + alibi_bias |
| 2509 | # - True, inv_sqrt_Dh * Q*K^T + inv_sqrt_Dh * alibi_bias |
| 2510 | self.scale_alibi_bias = scale_alibi_bias |
| 2511 | |
| 2512 | self.position_embedding_type = position_embedding_type |
| 2513 | self.paged_kv_cache = paged_kv_cache |
| 2514 | |
| 2515 | self.rotary_embedding_base = rotary_embedding_base |
Nothing calls this directly.
No test coverage was detected.