Returns: (seq_len, 33), (seq_len, 33)
Signature: _tokenize_segment(self, segment: Segment)
| 96 | return torch.cat(frame_tokens, dim=0), torch.cat(frame_masks, dim=0) |
| 97 | |
| 98 | def _tokenize_segment(self, segment: Segment) -> Tuple[torch.Tensor, torch.Tensor]: |
| 99 | """ |
| 100 | Returns: |
| 101 | (seq_len, 33), (seq_len, 33) |
| 102 | """ |
| 103 | text_tokens, text_masks = self._tokenize_text_segment(segment.text, segment.speaker) |
| 104 | audio_tokens, audio_masks = self._tokenize_audio(segment.audio) |
| 105 | |
| 106 | return torch.cat([text_tokens, audio_tokens], dim=0), torch.cat([text_masks, audio_masks], dim=0) |
| 107 | |
| 108 | @torch.inference_mode() |
| 109 | def generate( |
no test coverage detected