A wrapper class for Vosk speech-to-text with microphone input. Provides real-time speech recognition with support for partial results, grammar constraints, word-level timing, and echo filtering of TTS output.
| 64 | |
| 65 | |
| 66 | class Speech2Text: |
| 67 | """ |
| 68 | A wrapper class for Vosk speech-to-text with microphone input. |
| 69 | |
| 70 | Provides real-time speech recognition with support for partial results, |
| 71 | grammar constraints, word-level timing, and echo filtering of TTS output. |
| 72 | """ |
| 73 | |
| 74 | def __init__( |
| 75 | self, |
| 76 | model_path: Optional[str] = None, |
| 77 | lang: str = "en-us", |
| 78 | sample_rate: Optional[int] = None, |
| 79 | device: Optional[int] = None, |
| 80 | block_size: int = 8000, |
| 81 | grammar: Optional[List[str]] = None, |
| 82 | max_alternatives: int = 1, |
| 83 | enable_words: bool = False, |
| 84 | enable_partial_words: bool = False, |
| 85 | log_level: int = -1, |
| 86 | ): |
| 87 | """ |
| 88 | Initialize the Speech2Text recognizer. |
| 89 | |
| 90 | Args: |
| 91 | model_path: Path to local Vosk model folder. If None, downloads model for `lang`. |
| 92 | lang: Language code (e.g., "en-us", "fr"). Ignored if model_path is set. |
| 93 | sample_rate: Audio sample rate. Auto-detected from device if None. |
| 94 | device: Audio input device ID. Uses default if None. |
| 95 | block_size: Audio buffer size in samples. |
| 96 | grammar: Optional list of phrases for constrained recognition. |
| 97 | max_alternatives: Maximum number of alternative transcriptions to return. |
| 98 | enable_words: Enable word-level timestamps in results. |
| 99 | enable_partial_words: Enable word-level timestamps for partial results. |
| 100 | log_level: Vosk log level (0=normal, -1=silent). |
| 101 | """ |
| 102 | if not IMPORT_FOUND: |
| 103 | return |
| 104 | |
| 105 | SetLogLevel(log_level) |
| 106 | |
| 107 | self.device = device |
| 108 | self.block_size = block_size |
| 109 | self._running = False |
| 110 | self._audio_queue: queue.Queue = queue.Queue() |
| 111 | |
| 112 | if sample_rate is None: |
| 113 | device_info = sd.query_devices(device, "input") |
| 114 | self.sample_rate = int(device_info["default_samplerate"]) |
| 115 | else: |
| 116 | self.sample_rate = sample_rate |
| 117 | |
| 118 | self.model = Model(model_path) if model_path else Model(lang=lang) |
| 119 | |
| 120 | if grammar: |
| 121 | grammar_str = json.dumps(grammar + ["[unk]"]) |
| 122 | self.recognizer = KaldiRecognizer(self.model, self.sample_rate, grammar_str) |
| 123 | else: |
no outgoing calls
no test coverage detected