()
| 90 | |
| 91 | |
def main():
    """Run one fine-tuned inference pass and save the result to disk.

    Reads its options from ``parse_args()``, loads the model from
    ``args.ckpt_dir`` with the denoiser disabled, calls ``model.generate``
    with the requested text (and the reference audio/text prompt when they
    are provided), then writes the returned array to ``args.output`` via
    ``sf.write``. All progress messages are printed to stderr.
    """
    args = parse_args()

    # Load model from checkpoint directory (no denoiser).
    print(f"[FT Inference] Loading model: {args.ckpt_dir}", file=sys.stderr)
    load_options = {
        "hf_model_id": args.ckpt_dir,
        "load_denoiser": False,
        "optimize": True,
    }
    model = VoxCPM.from_pretrained(**load_options)

    # Falsy prompt values (e.g. empty strings) are collapsed to None before
    # being handed to the model.
    prompt_wav_path = args.prompt_audio or None
    prompt_text = args.prompt_text or None

    print(f"[FT Inference] Synthesizing: text='{args.text}'", file=sys.stderr)
    if prompt_wav_path:
        print(f"[FT Inference] Using reference audio: {prompt_wav_path}", file=sys.stderr)
        print(f"[FT Inference] Reference text: {prompt_text}", file=sys.stderr)

    # Run inference; denoising stays off to match the denoiser-free load.
    generation_options = {
        "text": args.text,
        "prompt_wav_path": prompt_wav_path,
        "prompt_text": prompt_text,
        "cfg_value": args.cfg_value,
        "inference_timesteps": args.inference_timesteps,
        "max_len": args.max_len,
        "normalize": args.normalize,
        "denoise": False,
    }
    audio_np = model.generate(**generation_options)

    # Save audio, creating the destination directory when it is missing.
    out_path = Path(args.output)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    sf.write(str(out_path), audio_np, model.tts_model.sample_rate)

    print(
        f"[FT Inference] Saved to: {out_path}, duration: {len(audio_np) / model.tts_model.sample_rate:.2f}s",
        file=sys.stderr,
    )
| 132 | |
| 133 | |
| 134 | if __name__ == "__main__": |
no test coverage detected
searching dependent graphs…