Run speech recognition on input audio.

This is the primary user-facing method. It automatically routes to:

- inference() if no vad_model is configured (single utterance)
- inference_with_vad() if vad_model is configured (long audio with segmentation)

Args:
(self, input, input_len=None, progress_callback=None, **cfg)
| 440 | return res |
| 441 | |
def generate(self, input, input_len=None, progress_callback=None, **cfg):
    """Recognize speech in ``input`` and return one result dict per sample.

    Main entry point for callers. After resetting the runtime configs, the
    call is dispatched on whether a VAD model is configured:

    - ``vad_model`` set: delegate entirely to ``inference_with_vad()``
      (long audio with segmentation).
    - ``vad_model`` is None: run ``inference()`` once (single utterance),
      then, if a ``punc_model`` is configured, run each recognized text
      through it and replace ``"text"`` with the punctuated output.

    Args:
        input: Audio input. Accepts:
            - File path (str): "audio.wav", "audio.mp3"
            - URL (str): "https://..."
            - numpy array: raw audio samples (float32, 16kHz)
            - list: batch of file paths or arrays
            - bytes: raw audio bytes
        input_len (tensor, optional): Length of each input sample.
        progress_callback (callable, optional): fn(current, total) called
            during processing.
        **cfg: Runtime parameters, forwarded unchanged to the underlying
            inference call (and merged into ``punc_kwargs`` when the
            punctuation model runs):
            - cache (dict): State cache for streaming mode. Pass {} for
              first call.
            - hotword (str/list): Keywords to boost recognition accuracy.
            - language (str): Language hint ("auto", "zh", "en",
              "Chinese", etc.)
            - batch_size_s (int): Dynamic batch total duration in seconds.
            - is_final (bool): Last chunk flag for streaming mode.
            - return_spk_res (bool): Return speaker diarization results.
            - sentence_timestamp (bool): Return sentence-level timestamps.
            - use_itn (bool): Apply inverse text normalization (SenseVoice).
            - return_raw_text (bool): On the non-VAD punctuation path, also
              keep the unpunctuated text under "raw_text". Falls back to
              ``self.kwargs["return_raw_text"]``, then to False.

    Returns:
        list[dict]: Results for each input sample. Common fields:
            - "key" (str): Sample identifier
            - "text" (str): Recognized text
            - "timestamp" (list): [[start_ms, end_ms], ...] per
              character/word
            - "sentence_info" (list): [{text, start, end, spk, timestamp},
              ...] when spk enabled
            - "raw_text" (str): Text before punctuation, only when
              return_raw_text is enabled and a punc_model ran
    """
    self._reset_runtime_configs()

    # With a VAD model the segmentation pipeline owns the whole request.
    if self.vad_model is not None:
        return self.inference_with_vad(
            input, input_len=input_len, progress_callback=progress_callback, **cfg
        )

    outputs = self.inference(
        input, input_len=input_len, progress_callback=progress_callback, **cfg
    )
    if self.punc_model is None:
        return outputs

    # Per-call options override the stored punctuation settings.
    deep_update(self.punc_kwargs, cfg)
    for item in outputs:
        punctuated = self.inference(
            item["text"], model=self.punc_model, kwargs=self.punc_kwargs, **cfg
        )
        keep_raw = cfg.get("return_raw_text", self.kwargs.get("return_raw_text", False))
        if keep_raw:
            # Save the unpunctuated text before it is overwritten below.
            item["raw_text"] = copy.copy(item["text"])
        item["text"] = punctuated[0]["text"]
    return outputs
| 495 | |
| 496 | def inference( |
| 497 | self, |