Frontend + Encoder. Note that this method is used by asr_inference.py. Args: speech: (Batch, Length, ...); speech_lengths: (Batch, ).
(
self, speech: torch.Tensor, speech_lengths: torch.Tensor
)
| 237 | return {"feats": feats, "feats_lengths": feats_lengths} |
| 238 | |
def encode(
    self, speech: torch.Tensor, speech_lengths: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Run the frontend and the encoder on a batch of speech.

    Note that this method is also used by asr_inference.py.

    Args:
        speech: (Batch, Length, ...)
        speech_lengths: (Batch, )

    Returns:
        A pair ``(encoder_out, encoder_out_lens)`` as produced by
        ``self.encoder`` and, when one is configured, ``self.postencoder``.
    """
    # NOTE(review): the listing this block was recovered from had lost its
    # indentation; the autocast(False) region is assumed to end after the
    # normalization step (steps 1-3) -- confirm against the real file.
    with autocast(False):
        # 1. Extract feats
        x, x_lens = self._extract_feats(speech, speech_lengths)

        # 2. Data augmentation, applied only in training mode
        if self.specaug is not None and self.training:
            x, x_lens = self.specaug(x, x_lens)

        # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
        if self.normalize is not None:
            x, x_lens = self.normalize(x, x_lens)

    # Pre-encoder, e.g. used for raw input data
    if self.preencoder is not None:
        x, x_lens = self.preencoder(x, x_lens)

    # 4. Forward encoder
    # feats: (Batch, Length, Dim) -> (Batch, Channel, Length2, Dim2)
    hidden, hidden_lens = self.encoder(x, x_lens)

    # Without a post-encoder the encoder output is the final result.
    if self.postencoder is None:
        return hidden, hidden_lens

    # Post-encoder, e.g. NLU
    hidden, hidden_lens = self.postencoder(hidden, hidden_lens)
    return hidden, hidden_lens
| 272 | |
| 273 | def _extract_feats( |
| 274 | self, speech: torch.Tensor, speech_lengths: torch.Tensor |
no test coverage detected