ⓍTTS model implementation. ❗ Currently it only supports inference. Examples: >>> from TTS.tts.configs.xtts_config import XttsConfig >>> from TTS.tts.models.xtts import Xtts >>> config = XttsConfig() >>> model = Xtts.init_from_config(config) >>> model
| 189 | |
| 190 | |
| 191 | class Xtts(BaseTTS): |
| 192 | """ⓍTTS model implementation. |
| 193 | |
| 194 | ❗ Currently it only supports inference. |
| 195 | |
| 196 | Examples: |
| 197 | >>> from TTS.tts.configs.xtts_config import XttsConfig |
| 198 | >>> from TTS.tts.models.xtts import Xtts |
| 199 | >>> config = XttsConfig() |
| 200 | >>> model = Xtts.init_from_config(config) |
| 201 | >>> model.load_checkpoint(config, checkpoint_dir="paths/to/models_dir/", eval=True) |
| 202 | """ |
| 203 | |
| 204 | def __init__(self, config: Coqpit): |
| 205 | super().__init__(config, ap=None, tokenizer=None) |
| 206 | self.mel_stats_path = None |
| 207 | self.config = config |
| 208 | self.gpt_checkpoint = self.args.gpt_checkpoint |
| 209 | self.decoder_checkpoint = self.args.decoder_checkpoint # TODO: check if this is even needed |
| 210 | self.models_dir = config.model_dir |
| 211 | self.gpt_batch_size = self.args.gpt_batch_size |
| 212 | |
| 213 | self.tokenizer = VoiceBpeTokenizer() |
| 214 | self.gpt = None |
| 215 | self.init_models() |
| 216 | self.register_buffer("mel_stats", torch.ones(80)) |
| 217 | |
| 218 | def init_models(self): |
| 219 | """Initialize the models. We do it here since we need to load the tokenizer first.""" |
| 220 | if self.tokenizer.tokenizer is not None: |
| 221 | self.args.gpt_number_text_tokens = self.tokenizer.get_number_tokens() |
| 222 | self.args.gpt_start_text_token = self.tokenizer.tokenizer.token_to_id("[START]") |
| 223 | self.args.gpt_stop_text_token = self.tokenizer.tokenizer.token_to_id("[STOP]") |
| 224 | |
| 225 | if self.args.gpt_number_text_tokens: |
| 226 | self.gpt = GPT( |
| 227 | layers=self.args.gpt_layers, |
| 228 | model_dim=self.args.gpt_n_model_channels, |
| 229 | start_text_token=self.args.gpt_start_text_token, |
| 230 | stop_text_token=self.args.gpt_stop_text_token, |
| 231 | heads=self.args.gpt_n_heads, |
| 232 | max_text_tokens=self.args.gpt_max_text_tokens, |
| 233 | max_mel_tokens=self.args.gpt_max_audio_tokens, |
| 234 | max_prompt_tokens=self.args.gpt_max_prompt_tokens, |
| 235 | number_text_tokens=self.args.gpt_number_text_tokens, |
| 236 | num_audio_tokens=self.args.gpt_num_audio_tokens, |
| 237 | start_audio_token=self.args.gpt_start_audio_token, |
| 238 | stop_audio_token=self.args.gpt_stop_audio_token, |
| 239 | use_perceiver_resampler=self.args.gpt_use_perceiver_resampler, |
| 240 | code_stride_len=self.args.gpt_code_stride_len, |
| 241 | ) |
| 242 | |
| 243 | self.hifigan_decoder = HifiDecoder( |
| 244 | input_sample_rate=self.args.input_sample_rate, |
| 245 | output_sample_rate=self.args.output_sample_rate, |
| 246 | output_hop_length=self.args.output_hop_length, |
| 247 | ar_mel_length_compression=self.args.gpt_code_stride_len, |
| 248 | decoder_input_dim=self.args.decoder_input_dim, |
no outgoing calls
no test coverage detected
searching dependent graphs…