(self, text: str, speaker: int)
| 58 | self.device = device |
| 59 | |
def _tokenize_text_segment(self, text: str, speaker: int) -> Tuple[torch.Tensor, torch.Tensor]:
    """Encode a speaker-tagged text segment into a token frame and its mask.

    The text is prefixed with ``[{speaker}]`` and encoded with
    ``self._text_tokenizer``. Each resulting token id becomes one row of
    the returned frame.

    Args:
        text: The text of the segment.
        speaker: Speaker id, rendered into the ``[{speaker}]`` prefix.

    Returns:
        A ``(tokens, mask)`` pair, both of shape ``(num_text_tokens, 33)``
        and both placed on ``self.device``:

        * ``tokens`` is an int64 tensor holding the token ids in its last
          column and zeros everywhere else.
        * ``mask`` is a bool tensor that is ``True`` in the last column
          and ``False`` everywhere else.
    """
    # Only the last column is written here; the other 32 columns stay
    # zero / False. NOTE(review): presumably those columns are reserved for
    # audio codebook tokens -- confirm against the model definition.
    frame_width = 33

    token_ids = self._text_tokenizer.encode(f"[{speaker}]{text}")
    num_tokens = len(token_ids)

    tokens = torch.zeros(num_tokens, frame_width, dtype=torch.long)
    mask = torch.zeros(num_tokens, frame_width, dtype=torch.bool)

    # Pin the dtype explicitly: torch.tensor([]) would otherwise infer
    # float32 when the tokenizer returns no tokens.
    tokens[:, -1] = torch.tensor(token_ids, dtype=torch.long)
    mask[:, -1] = True

    # A single frame is produced, so it is returned directly instead of
    # being appended to a list and concatenated.
    return tokens.to(self.device), mask.to(self.device)
| 74 | |
| 75 | def _tokenize_audio(self, audio: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: |
| 76 | assert audio.ndim == 1, "Audio must be single channel" |
no outgoing calls
no test coverage detected