Computes an embedding for a single utterance. # TODO: handle multiple wavs to benefit from batching on GPU :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32 :param using_partials: if True, then the utterance is split in partial utterances
(wav, using_partials=True, return_partials=False, **kwargs)
| 109 | |
| 110 | |
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes the speaker embedding of one utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, the utterance is cut into partial utterances of
    <partial_utterance_n_frames> frames, each partial is embedded, and the utterance embedding
    is the L2-normalized mean of the partial embeddings. If False, the mel spectrogram of the
    whole utterance is fed to the network in a single pass.
    :param return_partials: if True, the partial embeddings and the wav slices they were
    computed from are returned in addition to the utterance embedding.
    :param kwargs: additional arguments forwarded to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, a 3-tuple is returned instead: the embedding, the partial
    embeddings as a numpy array of float32 of shape (n_partials, model_embedding_size), and the
    wav partials as a list of slices. If <using_partials> is simultaneously False, the last two
    elements of that tuple are None.
    """
    # Whole-utterance mode: a single forward pass on a batch of size one
    if not using_partials:
        mel = audio.wav_to_mel_spectrogram(wav)
        whole_embed = embed_frames_batch(mel[None, ...])[0]
        return (whole_embed, None, None) if return_partials else whole_embed

    # Find the partial windows, then zero-pad the waveform so the last window is fully covered
    # (the pad is also applied, with zero width, when the lengths already match)
    wav_windows, mel_windows = compute_partial_slices(len(wav), **kwargs)
    n_missing_samples = wav_windows[-1].stop - len(wav)
    if n_missing_samples >= 0:
        wav = np.pad(wav, (0, n_missing_samples), "constant")

    # One mel spectrogram for the padded utterance, sliced into a batch of partials
    mel = audio.wav_to_mel_spectrogram(wav)
    partials_batch = np.array([mel[window] for window in mel_windows])
    partial_embeds = embed_frames_batch(partials_batch)

    # Average the partial embeddings and rescale the mean to unit L2 norm
    mean_embed = np.mean(partial_embeds, axis=0)
    embed = mean_embed / np.linalg.norm(mean_embed, 2)

    return (embed, partial_embeds, wav_windows) if return_partials else embed
| 156 | |
| 157 | |
| 158 | def embed_speaker(wavs, **kwargs): |
nothing calls this directly
no test coverage detected