| 542 | return backend_pb2.Result(success=True) |
| 543 | |
| 544 | def TTS(self, request, context): |
| 545 | try: |
| 546 | text = request.text |
| 547 | print(f"[transformers] TTS generating for text: {text[:100]}...", file=sys.stderr) |
| 548 | |
| 549 | # Build inputs based on processor capabilities |
| 550 | if request.voice and os.path.exists(request.voice): |
| 551 | # Voice cloning: use chat template with reference audio |
| 552 | chat_template = [{ |
| 553 | "role": "0", |
| 554 | "content": [ |
| 555 | {"type": "text", "text": text}, |
| 556 | {"type": "audio", "path": request.voice}, |
| 557 | ], |
| 558 | }] |
| 559 | inputs = self.processor.apply_chat_template( |
| 560 | chat_template, tokenize=True, return_dict=True, |
| 561 | ).to(self.model.device, self.model.dtype) |
| 562 | elif hasattr(self.processor, 'apply_chat_template'): |
| 563 | # Models that use chat template format (VibeVoice, CSM, etc.) |
| 564 | chat_template = [{"role": "0", "content": [{"type": "text", "text": text}]}] |
| 565 | try: |
| 566 | inputs = self.processor.apply_chat_template( |
| 567 | chat_template, tokenize=True, return_dict=True, |
| 568 | ).to(self.model.device, self.model.dtype) |
| 569 | except Exception: |
| 570 | # Fallback if chat template fails (not all processors support it) |
| 571 | inputs = self.processor(text=[text], padding=True, return_tensors="pt") |
| 572 | if self.CUDA: |
| 573 | inputs = inputs.to("cuda") |
| 574 | else: |
| 575 | # Direct processor call (Musicgen, etc.) |
| 576 | inputs = self.processor(text=[text], padding=True, return_tensors="pt") |
| 577 | if self.CUDA: |
| 578 | inputs = inputs.to("cuda") |
| 579 | |
| 580 | # Build generation kwargs from self.options |
| 581 | gen_kwargs = {**inputs, "max_new_tokens": self.max_tokens} |
| 582 | for key in ["guidance_scale", "temperature", "top_p", "top_k", "do_sample"]: |
| 583 | if key in self.options: |
| 584 | gen_kwargs[key] = self.options[key] |
| 585 | |
| 586 | # Add noise scheduler if configured (e.g., for VibeVoice) |
| 587 | noise_scheduler_type = self.options.get("noise_scheduler", None) |
| 588 | if noise_scheduler_type: |
| 589 | import diffusers |
| 590 | SchedulerClass = getattr(diffusers, noise_scheduler_type) |
| 591 | scheduler_kwargs = {} |
| 592 | for key in ["beta_schedule", "prediction_type"]: |
| 593 | if key in self.options: |
| 594 | scheduler_kwargs[key] = self.options[key] |
| 595 | gen_kwargs["noise_scheduler"] = SchedulerClass(**scheduler_kwargs) |
| 596 | |
| 597 | # Generate audio |
| 598 | audio = self.model.generate(**gen_kwargs) |
| 599 | print("[transformers] TTS generated!", file=sys.stderr) |
| 600 | |
| 601 | # Save audio output |