MCPcopy
hub / github.com/babysor/MockingBird / SpeakerEncoder

Class SpeakerEncoder

encoder/model.py:12–135  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

10
11
12class SpeakerEncoder(nn.Module):
13 def __init__(self, device, loss_device):
14 super().__init__()
15 self.loss_device = loss_device
16
17 # Network defition
18 self.lstm = nn.LSTM(input_size=mel_n_channels,
19 hidden_size=model_hidden_size,
20 num_layers=model_num_layers,
21 batch_first=True).to(device)
22 self.linear = nn.Linear(in_features=model_hidden_size,
23 out_features=model_embedding_size).to(device)
24 self.relu = torch.nn.ReLU().to(device)
25
26 # Cosine similarity scaling (with fixed initial parameter values)
27 self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
28 self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
29
30 # Loss
31 self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
32
33 def do_gradient_ops(self):
34 # Gradient scale
35 self.similarity_weight.grad *= 0.01
36 self.similarity_bias.grad *= 0.01
37
38 # Gradient clipping
39 clip_grad_norm_(self.parameters(), 3, norm_type=2)
40
41 def forward(self, utterances, hidden_init=None):
42 """
43 Computes the embeddings of a batch of utterance spectrograms.
44
45 :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
46 (batch_size, n_frames, n_channels)
47 :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
48 batch_size, hidden_size). Will default to a tensor of zeros if None.
49 :return: the embeddings as a tensor of shape (batch_size, embedding_size)
50 """
51 # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
52 # and the final cell state.
53 out, (hidden, cell) = self.lstm(utterances, hidden_init)
54
55 # We take only the hidden state of the last layer
56 embeds_raw = self.relu(self.linear(hidden[-1]))
57
58 # L2-normalize it
59 embeds = embeds_raw / (torch.norm(embeds_raw, dim=1, keepdim=True) + 1e-5)
60
61 return embeds
62
63 def similarity_matrix(self, embeds):
64 """
65 Computes the similarity matrix according the section 2.1 of GE2E.
66
67 :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
68 utterances_per_speaker, embedding_size)
69 :return: the similarity matrix as a tensor of shape (speakers_per_batch,

Callers 2

trainFunction · 0.90
load_modelFunction · 0.90

Calls

no outgoing calls

Tested by

no test coverage detected