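Computes the embeddings of a batch of utterance spectrograms. :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape (batch_size, n_frames, n_channels) :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers, batch_size, hidden_size). Will default to a tensor of zeros if None. :return: the embeddings as a tensor of shape (batch_size, embedding_size)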
(self, utterances, hidden_init=None)
| 39 | clip_grad_norm_(self.parameters(), 3, norm_type=2) |
| 40 | |
def forward(self, utterances, hidden_init=None):
    """
    Embed a batch of utterance spectrograms as (approximately) unit-length vectors.

    The batch is run through ``self.lstm``; the final hidden state of the last
    layer is projected by ``self.linear``, rectified by ``self.relu`` and then
    divided by its own L2 norm.

    :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
    (batch_size, n_frames, n_channels)
    :param hidden_init: initial hidden state handed unchanged to ``self.lstm``, documented as
    a tensor of shape (num_layers, batch_size, hidden_size). Will default to a tensor of zeros
    if None. NOTE(review): torch.nn.LSTM takes its initial state as an (h_0, c_0) tuple --
    confirm what callers actually pass here.
    :return: the embeddings as a tensor of shape (batch_size, embedding_size)
    """
    # Only the final hidden state is used; the per-frame outputs and the final
    # cell state returned by the LSTM are discarded.
    _, (final_hidden, _) = self.lstm(utterances, hidden_init)

    # Keep the last layer's hidden state, then project and rectify it.
    last_layer_state = final_hidden[-1]
    projected = self.relu(self.linear(last_layer_state))

    # L2-normalize each row. The small constant keeps the division finite when
    # a row is entirely zero (possible after the ReLU).
    row_norms = torch.norm(projected, dim=1, keepdim=True)
    return projected / (row_norms + 1e-5)
| 62 | |
| 63 | def similarity_matrix(self, embeds): |
| 64 | """ |
no outgoing calls
no test coverage detected