MCPcopy
hub / github.com/collabora/WhisperLive / identify_speaker

Method identify_speaker

whisper_live/diarization.py:96–137  ·  view source on GitHub ↗

Identify or create a speaker from an audio segment. Args: audio_np (np.ndarray): 1-D float32 audio for the segment. sample_rate (int): Sample rate. Default 16000. Returns: str or None: Speaker label (e.g. "SPEAKER_00"), or None if the audio is too short to embed.

(self, audio_np, sample_rate=16000)

Source from the content-addressed store, hash-verified

94 return float(np.dot(a, b))
95
def identify_speaker(self, audio_np, sample_rate=16000):
    """Identify or create a speaker from an audio segment.

    The segment is embedded with ``self._compute_embedding`` and compared
    against every stored speaker embedding in ``self.speakers`` using
    ``self._cosine_similarity``. The outcome is one of:

    * the closest stored speaker, when its similarity reaches
      ``self.similarity_threshold`` (its stored embedding is then blended
      with the new one and re-normalized);
    * the closest stored speaker, when no speaker reaches the threshold
      but ``self.max_speakers`` speakers are already registered (stored
      embeddings are left untouched in this case);
    * a newly registered speaker otherwise.

    Args:
        audio_np (np.ndarray): 1-D float32 audio for the segment.
        sample_rate (int): Sample rate. Default 16000.

    Returns:
        str or None: Speaker label (e.g. "SPEAKER_00"), or None if
            ``_compute_embedding`` returned None (documented upstream as
            the audio being too short to embed).
    """
    embedding = self._compute_embedding(audio_np, sample_rate)
    if embedding is None:
        return None

    best_speaker = None
    # Start below every possible similarity so that the closest stored
    # speaker is always selected, even when its similarity is exactly
    # -1.0 (or marginally lower because of floating-point rounding).
    best_sim = float("-inf")

    for speaker_id, stored_emb in self.speakers.items():
        sim = self._cosine_similarity(embedding, stored_emb)
        if sim > best_sim:
            best_sim = sim
            best_speaker = speaker_id

    if best_speaker is not None and best_sim >= self.similarity_threshold:
        # Update the running average for the matched speaker: 90 % of the
        # stored embedding plus 10 % of the new one.
        blended = self.speakers[best_speaker] * 0.9 + embedding * 0.1
        # Re-normalize so the stored vector has unit length again. A
        # zero-length blend cannot be normalized; keep the previous
        # embedding instead of storing NaN/inf values.
        norm = np.linalg.norm(blended)
        if norm > 0:
            self.speakers[best_speaker] = blended / norm
        return best_speaker

    if len(self.speakers) >= self.max_speakers:
        # Speaker cap reached: assign to the closest known speaker. The
        # formatted fallback label is only used when there is no stored
        # speaker to fall back on; it is not registered in self.speakers.
        return best_speaker or f"SPEAKER_{self._speaker_count:02d}"

    # Create a new speaker, labelled by the running speaker counter.
    speaker_id = f"SPEAKER_{self._speaker_count:02d}"
    self._speaker_count += 1
    self.speakers[speaker_id] = embedding
    return speaker_id
138
139 def reset(self):
140 """Reset all speaker state."""

Calls 2

_compute_embeddingMethod · 0.95
_cosine_similarityMethod · 0.95