Frontend + Encoder. Note that this method is used by asr_inference.py. Args: speech: (Batch, Length, ...); speech_lengths: (Batch, ); ind: int (not a named parameter in the signature below; it could only arrive via **kwargs).
(
self,
speech: torch.Tensor,
speech_lengths: torch.Tensor,
**kwargs,
)
| 255 | return loss, stats, weight |
| 256 | |
| 257 | def encode( |
| 258 | self, |
| 259 | speech: torch.Tensor, |
| 260 | speech_lengths: torch.Tensor, |
| 261 | **kwargs, |
| 262 | ) -> Tuple[torch.Tensor, torch.Tensor]: |
| 263 | """Frontend + Encoder: optional SpecAug (training only), optional normalization, then the encoder. Note that this method is used by asr_inference.py |
| 264 | Args: |
| 265 | speech: (Batch, Length, ...) |
| 266 | speech_lengths: (Batch, ) |
| 267 | ind: int (not a named parameter; it could only arrive via **kwargs, which this method does not read) |
| 268 | Returns: (encoder_out, encoder_out_lens); if the encoder output is a tuple, the first element is (encoder_out, intermediate_outs) instead. """ |
| 269 | with autocast(False): |
| 270 | |
| 271 | # Data augmentation (skipped when no specaug module is set or when not training) |
| 272 | if self.specaug is not None and self.training: |
| 273 | speech, speech_lengths = self.specaug(speech, speech_lengths) |
| 274 | |
| 275 | # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN |
| 276 | if self.normalize is not None: |
| 277 | speech, speech_lengths = self.normalize(speech, speech_lengths) |
| 278 | |
| 279 | # Forward encoder; self.ctc is passed only when encoder.interctc_use_conditioning is set |
| 280 | # speech (features): (Batch, Length, Dim) |
| 281 | # -> encoder_out: (Batch, Length2, Dim2); the encoder's third return value is discarded |
| 282 | if self.encoder.interctc_use_conditioning: |
| 283 | encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths, ctc=self.ctc) |
| 284 | else: |
| 285 | encoder_out, encoder_out_lens, _ = self.encoder(speech, speech_lengths) |
| 286 | intermediate_outs = None |
| 287 | if isinstance(encoder_out, tuple): |
| 288 | intermediate_outs = encoder_out[1] |
| 289 | encoder_out = encoder_out[0] |
| 290 | # When the encoder returned a tuple, hand back its second element alongside the final output |
| 291 | if intermediate_outs is not None: |
| 292 | return (encoder_out, intermediate_outs), encoder_out_lens |
| 293 | |
| 294 | return encoder_out, encoder_out_lens |
| 295 | |
| 296 | def _calc_att_loss( |
| 297 | self, |