Compute the log-Mel spectrogram of an audio waveform. Parameters ---------- audio: Union[str, np.ndarray, torch.Tensor], shape = (*) The path to an audio file, or a NumPy array or Tensor containing the audio waveform at 16 kHz. n_mels: int The number of Mel-frequency filters, only 80 is supported
(
audio: Union[str, np.ndarray, torch.Tensor],
n_mels: int = N_MELS,
padding: int = 0,
device: Optional[Union[str, torch.device]] = None,
)
| 155 | |
| 156 | |
def log_mel_spectrogram(
    audio: Union[str, np.ndarray, torch.Tensor],
    n_mels: int = N_MELS,
    padding: int = 0,
    device: Optional[Union[str, torch.device]] = None,
):
    """
    Compute the normalised log-Mel spectrogram of an audio waveform.

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        A path that is read with ``load_audio``, a NumPy array that is
        converted with ``torch.from_numpy``, or a Tensor used as-is.
        The waveform is expected to be sampled at 16 kHz (per the
        original documentation; not checked here).

    n_mels: int
        Number of Mel-frequency filters, forwarded to ``mel_filters``.
        The original documentation states that only 80 is supported.

    padding: int
        Number of zero samples appended on the right of the waveform;
        applied only when greater than zero.

    device: Optional[Union[str, torch.device]]
        If not None, the waveform is moved to this device before the STFT.

    Returns
    -------
    torch.Tensor
        ``log10`` of the Mel-filtered power spectrum, floored at 8.0 below
        its global maximum and then rescaled as ``(x + 4.0) / 4.0``.
        The original documentation gives the shape as (80, n_frames);
        presumably (n_mels, n_frames) in general -- TODO confirm against
        ``mel_filters``.
    """
    # Normalise the input to a torch.Tensor: paths are decoded first,
    # and anything that is not already a Tensor goes through from_numpy.
    if not torch.is_tensor(audio):
        samples = load_audio(audio) if isinstance(audio, str) else audio
        audio = torch.from_numpy(samples)

    if device is not None:
        audio = audio.to(device)

    if padding > 0:
        # Pad only on the right-hand side of the last dimension.
        audio = F.pad(audio, (0, padding))

    # The window is placed on the same device as the (possibly moved) audio.
    hann = torch.hann_window(N_FFT).to(audio.device)
    spectrum = torch.stft(
        audio,
        N_FFT,
        HOP_LENGTH,
        window=hann,
        return_complex=True,
    )
    # Discard the final STFT frame, then take the squared magnitude (power).
    power = spectrum[..., :-1].abs() ** 2

    mel_basis = mel_filters(audio.device, n_mels)
    mel_power = torch.matmul(mel_basis, power)

    # Clamp before the logarithm so that zero-energy bins stay finite.
    log_mel = mel_power.clamp(min=1e-10).log10()
    # Limit the dynamic range to 8.0 (log10 units) below the global peak.
    floor = log_mel.max() - 8.0
    log_mel = torch.maximum(log_mel, floor)
    return (log_mel + 4.0) / 4.0
| 205 | |
| 206 | |
| 207 | @dataclass |
no test coverage detected