Frontend + Encoder. Note that this method is used by asr_inference.py. Args: speech: (Batch, Length, ...); speech_lengths: (Batch, ); ind: int.
(
self,
speech: torch.Tensor,
speech_lengths: torch.Tensor,
**kwargs,
)
| 274 | return loss, stats, weight |
| 275 | |
def encode(
    self,
    speech: torch.Tensor,
    speech_lengths: torch.Tensor,
    **kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Apply optional augmentation and normalization, then run the encoder.

    Note that this method is used by asr_inference.py.

    Args:
        speech: Input features, (Batch, Length, ...).
        speech_lengths: Length of each item in the batch, (Batch, ).
        **kwargs: Accepted for call compatibility; not read by this method.

    Returns:
        A pair ``(encoder_out, encoder_out_lens)``. If the first value
        produced by ``self.encoder`` is a tuple whose second item is not
        None, the first element of the pair is instead
        ``(encoder_out, intermediate_outs)``.
    """
    # SpecAug and feature normalization are executed with autocast disabled.
    # NOTE(review): the encoder call below is kept outside this context,
    # following the statement grouping of the listing - confirm upstream.
    with autocast(False):
        # Augmentation is only applied while the module is in training mode.
        if self.specaug is not None and self.training:
            speech, speech_lengths = self.specaug(speech, speech_lengths)

        # Feature normalization, e.g. Global-CMVN or Utterance-CMVN.
        if self.normalize is not None:
            speech, speech_lengths = self.normalize(speech, speech_lengths)

    # feats: (Batch, Length, Dim) -> encoder output: (Batch, Length2, Dim2)
    # The third value returned by the encoder is discarded.
    enc_result, enc_lens, _ = self.encoder(speech, speech_lengths)

    # Common case: the encoder produced a single output, not a tuple.
    if not isinstance(enc_result, tuple):
        return enc_result, enc_lens

    # Tuple case: item 0 is the main output, item 1 the intermediate outputs.
    aux_outs = enc_result[1]
    main_out = enc_result[0]
    if aux_outs is None:
        return main_out, enc_lens

    return (main_out, aux_outs), enc_lens
| 311 | |
| 312 | def _calc_transducer_loss( |
| 313 | self, |