Identify or create a speaker from an audio segment. Args: audio_np (np.ndarray): 1-D float32 audio for the segment. sample_rate (int): Sample rate. Default 16000. Returns: str or None: Speaker label (e.g. "SPEAKER_00"), or None if the audio is too short to embed.
(self, audio_np, sample_rate=16000)
| 94 | return float(np.dot(a, b)) |
| 95 | |
def identify_speaker(self, audio_np, sample_rate=16000):
    """Identify or create a speaker from an audio segment.

    The segment is embedded and compared with every stored speaker
    embedding via ``self._cosine_similarity``. The outcome is one of:

    * the closest stored speaker, when its similarity reaches
      ``self.similarity_threshold`` (its stored embedding is then blended
      90/10 with the new embedding and re-normalized);
    * the closest stored speaker anyway, when ``self.max_speakers`` has
      already been reached;
    * a newly registered speaker otherwise.

    Args:
        audio_np (np.ndarray): 1-D float32 audio for the segment.
        sample_rate (int): Sample rate. Default 16000.

    Returns:
        str or None: Speaker label (e.g. "SPEAKER_00"), or None if
            ``self._compute_embedding`` produced no embedding (e.g. the
            audio is too short to embed).
    """
    embedding = self._compute_embedding(audio_np, sample_rate)
    if embedding is None:
        return None

    # Start from -inf (not -1.0): a stored speaker whose similarity is
    # exactly -1.0 must still be selectable as "closest", otherwise the
    # capacity branch below would hand out an unregistered label.
    best_speaker = None
    best_sim = float("-inf")
    for speaker_id, stored_emb in self.speakers.items():
        sim = self._cosine_similarity(embedding, stored_emb)
        if sim > best_sim:
            best_sim = sim
            best_speaker = speaker_id

    # Guard on best_speaker so an empty registry combined with a very low
    # threshold cannot index self.speakers[None].
    if best_speaker is not None and best_sim >= self.similarity_threshold:
        # Update running average for the matched speaker.
        blended = self.speakers[best_speaker] * 0.9 + embedding * 0.1
        norm = np.linalg.norm(blended)
        # Re-normalize; if the blend collapsed to a zero vector keep the
        # previous embedding instead of storing NaN/inf values.
        if norm > 0:
            blended /= norm
            self.speakers[best_speaker] = blended
        return best_speaker

    if len(self.speakers) >= self.max_speakers:
        # At capacity: assign to the closest known speaker.
        if best_speaker is not None:
            return best_speaker
        # No stored speaker could be ranked (e.g. max_speakers <= 0);
        # keep the original fallback label without registering it.
        return f"SPEAKER_{self._speaker_count:02d}"

    # Create a new speaker.
    speaker_id = f"SPEAKER_{self._speaker_count:02d}"
    self._speaker_count += 1
    self.speakers[speaker_id] = embedding
    return speaker_id
| 138 | |
| 139 | def reset(self): |
| 140 | """Reset all speaker state.""" |