(self, audio_path: str, actions)
| 394 | return _cosine_distance(self.embed(audio1), self.embed(audio2)) |
| 395 | |
def analyze(self, audio_path: str, actions):
    """Run the analysis head over one audio file and return a single segment.

    The waveform is loaded via ``self._load_waveform`` and, when the engine's
    configured sample rate differs from 16 kHz, linearly resampled to 16 kHz
    before being handed to ``self._analysis.analyze``.

    Args:
        audio_path: Path of the audio file; passed to both the loader and
            the analysis head.
        actions: Forwarded unchanged to ``self._analysis.analyze``.

    Returns:
        A one-element list holding a dict with ``start`` (always ``0.0``),
        ``end`` (length of the 16 kHz waveform in seconds) and every
        key/value pair returned by the analysis head.

    Raises:
        NotImplementedError: If the analysis head returns an empty/falsy
            result.
    """
    # The analysis head works at a fixed rate; keep it named so the resample
    # ratio and the duration computation cannot drift apart.
    target_sr = 16000

    # AnalysisHead expects 16kHz mono; _load_waveform already resamples to
    # self._expected_sr. If the user configured a non-16k expected rate,
    # resample one more time for analyze.
    audio = self._load_waveform(audio_path)

    # Skip resampling for an empty waveform: np.interp raises ValueError when
    # its sample-point array is empty, whereas the 16 kHz path passes empty
    # audio through untouched. Both paths now behave the same.
    if self._expected_sr != target_sr and len(audio):
        import numpy as np

        ratio = target_sr / float(self._expected_sr)
        n = int(round(len(audio) * ratio))
        # Plain linear interpolation onto n evenly spaced positions.
        # NOTE(review): no anti-aliasing filter is applied when downsampling
        # -- confirm that is acceptable for the analysis models.
        audio = np.interp(
            np.linspace(0, len(audio), n, endpoint=False),
            np.arange(len(audio)),
            audio,
        ).astype("float32")

    attrs = self._analysis.analyze(audio_path, audio, actions)
    if not attrs:
        raise NotImplementedError(
            "analyze head failed to load — install transformers + torch or pass age_gender_model/emotion_model options"
        )

    # An empty waveform naturally yields 0.0 here.
    duration = len(audio) / float(target_sr)
    return [dict(start=0.0, end=duration, **attrs)]
| 418 | |
| 419 | |
| 420 | def build_engine(model_name: str, options: dict[str, str]) -> tuple[SpeakerEngine, str]: |
nothing calls this directly
no test coverage detected