| 679 | return videos[0] if self.rank == 0 else None |
| 680 | |
def tts(self, tts_prompt_audio, tts_prompt_text, tts_text, output_path='tts.wav'):
    """Synthesize ``tts_text`` with CosyVoice and save the result as a WAV file.

    The CosyVoice model is loaded on first use through ``self.load_tts()``.

    Args:
        tts_prompt_audio: Reference audio handed to CosyVoice's ``load_wav``
            with a requested rate of 16000.
        tts_prompt_text: Text accompanying the reference audio (presumably
            its transcript -- confirm against callers). When it is not
            ``None`` zero-shot inference is used; when it is ``None``
            cross-lingual inference is used instead.
        tts_text: The text to synthesize.
        output_path: Where the WAV file is written. Defaults to
            ``'tts.wav'``, the location that used to be hard-coded, so
            existing callers are unaffected. Pass a distinct path to avoid
            overwriting the result of a previous call.

    Returns:
        ``output_path``, the path of the file that was written.

    Raises:
        RuntimeError: If the model produced no audio chunks.
    """
    if not hasattr(self, 'cosyvoice'):
        self.load_tts()

    # Deliberately imported after load_tts(): presumably load_tts() is what
    # makes the ``cosyvoice`` package importable -- keep this ordering.
    from cosyvoice.utils.file_utils import load_wav
    import torchaudio

    prompt_speech_16k = load_wav(tts_prompt_audio, 16000)
    if tts_prompt_text is not None:
        chunks = self.cosyvoice.inference_zero_shot(tts_text, tts_prompt_text, prompt_speech_16k)
    else:
        chunks = self.cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)

    # Both inference modes yield dicts carrying the audio under 'tts_speech'.
    speech_list = [chunk['tts_speech'] for chunk in chunks]
    if not speech_list:
        # torch.concat([]) would fail with an unhelpful message; be explicit.
        raise RuntimeError('CosyVoice produced no audio for the given text')

    # Chunks are joined along dim 1 (assumed to be the sample axis of a
    # (channels, samples) tensor -- TODO confirm).
    torchaudio.save(output_path, torch.concat(speech_list, dim=1), self.cosyvoice.sample_rate)
    return output_path
| 696 | |
| 697 | def load_tts(self): |
| 698 | if not os.path.exists('CosyVoice'): |