MCPcopy
hub / github.com/Fosowl/agenticSeek / Speech2Text

Class Speech2Text

sources/speech_to_text.py:66–289  ·  view source on GitHub ↗

A wrapper class for Vosk speech-to-text with microphone input. Provides real-time speech recognition with support for partial results, grammar constraints, word-level timing, and echo filtering of TTS output.

Source from the content-addressed store, hash-verified

64
65
66class Speech2Text:
67 """
68 A wrapper class for Vosk speech-to-text with microphone input.
69
70 Provides real-time speech recognition with support for partial results,
71 grammar constraints, word-level timing, and echo filtering of TTS output.
72 """
73
74 def __init__(
75 self,
76 model_path: Optional[str] = None,
77 lang: str = "en-us",
78 sample_rate: Optional[int] = None,
79 device: Optional[int] = None,
80 block_size: int = 8000,
81 grammar: Optional[List[str]] = None,
82 max_alternatives: int = 1,
83 enable_words: bool = False,
84 enable_partial_words: bool = False,
85 log_level: int = -1,
86 ):
87 """
88 Initialize the Speech2Text recognizer.
89
90 Args:
91 model_path: Path to local Vosk model folder. If None, downloads model for `lang`.
92 lang: Language code (e.g., "en-us", "fr"). Ignored if model_path is set.
93 sample_rate: Audio sample rate. Auto-detected from device if None.
94 device: Audio input device ID. Uses default if None.
95 block_size: Audio buffer size in samples.
96 grammar: Optional list of phrases for constrained recognition.
97 max_alternatives: Maximum number of alternative transcriptions to return.
98 enable_words: Enable word-level timestamps in results.
99 enable_partial_words: Enable word-level timestamps for partial results.
100 log_level: Vosk log level (0=normal, -1=silent).
101 """
102 if not IMPORT_FOUND:
103 return
104
105 SetLogLevel(log_level)
106
107 self.device = device
108 self.block_size = block_size
109 self._running = False
110 self._audio_queue: queue.Queue = queue.Queue()
111
112 if sample_rate is None:
113 device_info = sd.query_devices(device, "input")
114 self.sample_rate = int(device_info["default_samplerate"])
115 else:
116 self.sample_rate = sample_rate
117
118 self.model = Model(model_path) if model_path else Model(lang=lang)
119
120 if grammar:
121 grammar_str = json.dumps(grammar + ["[unk]"])
122 self.recognizer = KaldiRecognizer(self.model, self.sample_rate, grammar_str)
123 else:

Callers 2

initialize_stt · Method · 0.90
speech_to_text.py · File · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected