(self,
hidden_states: Tensor,
encoder_output: Optional[Tensor] = None,
attention_mask_params=None,
use_cache=False,
kv_cache_params=None,
attention_params=None,
lora_layer_params=None,
cross_kv_cache_gen: Optional[Tensor] = None,
cross_kv_reuse: Optional[Tensor] = None,
language_adapter_routings: Optional[Tensor] = None)
| 485 | quant_mode=quant_mode) |
| 486 | |
| 487 | def forward(self, |
| 488 | hidden_states: Tensor, |
| 489 | encoder_output: Optional[Tensor] = None, |
| 490 | attention_mask_params=None, |
| 491 | use_cache=False, |
| 492 | kv_cache_params=None, |
| 493 | attention_params=None, |
| 494 | lora_layer_params=None, |
| 495 | cross_kv_cache_gen: Optional[Tensor] = None, |
| 496 | cross_kv_reuse: Optional[Tensor] = None, |
| 497 | language_adapter_routings: Optional[Tensor] = None): |
| 498 | assert isinstance(hidden_states, Tensor) |
| 499 | |
| 500 | if encoder_output: |
| 501 | assert isinstance(encoder_output, Tensor) |
| 502 | |
| 503 | # self-attention |
| 504 | residual = hidden_states * self.residual_scaling |
| 505 | |
| 506 | if self.layernorm_position == LayerNormPositionType.pre_layernorm: |
| 507 | hidden_states = self.self_attention_layernorm(hidden_states) |
| 508 | |
| 509 | attention_output = self.self_attention( |
| 510 | hidden_states=hidden_states, |
| 511 | attention_mask=attention_mask_params.self_attention_mask, |
| 512 | use_cache=use_cache, |
| 513 | kv_cache_params=kv_cache_params, |
| 514 | attention_params=attention_params, |
| 515 | lora_layer_params=lora_layer_params) |
| 516 | |
| 517 | if use_cache: |
| 518 | attention_output, presents_self = attention_output |
| 519 | |
| 520 | self.register_network_output('self_attention_output', attention_output) |
| 521 | |
| 522 | hidden_states = residual + attention_output |
| 523 | |
| 524 | if self.fp16_clamping: |
| 525 | hidden_states = maximum(-64000.0, hidden_states) |
| 526 | hidden_states = minimum(64000.0, hidden_states) |
| 527 | |
| 528 | if self.layernorm_position == LayerNormPositionType.post_layernorm: |
| 529 | hidden_states = self.self_attention_layernorm(hidden_states) |
| 530 | |
| 531 | # cross attention |
| 532 | residual = hidden_states * self.residual_scaling |
| 533 | |
| 534 | if self.layernorm_position == LayerNormPositionType.pre_layernorm: |
| 535 | hidden_states = self.cross_attention_layernorm(hidden_states) |
| 536 | |
| 537 | attention_output = self.cross_attention( |
| 538 | hidden_states=hidden_states, |
| 539 | attention_mask=attention_mask_params.cross_attention_mask, |
| 540 | attention_packed_mask=attention_mask_params. |
| 541 | cross_attention_packed_mask, |
| 542 | encoder_output=encoder_output, |
| 543 | use_cache=use_cache, |
| 544 | kv_cache_params=kv_cache_params, |
nothing calls this directly
no test coverage detected