(audio_data, audio_cfg)
| 411 | |
| 412 | |
def get_mel(audio_data, audio_cfg):
    """Compute a log-mel spectrogram of ``audio_data`` using torchaudio.

    Args:
        audio_data: Waveform tensor. The mel transform is moved to this
            tensor's device before being applied.
            NOTE(review): presumably a 1-D waveform, so that the mel output
            is 2-D ``(n_mels, T)`` -- confirm against callers.
        audio_cfg: Mapping that provides the keys ``"sample_rate"``,
            ``"window_size"``, ``"hop_size"``, ``"fmin"`` and ``"fmax"``.

    Returns:
        The dB-scaled mel spectrogram transposed with ``.T``; for a
        ``(n_mels, T)`` spectrogram this is ``(T, n_mels)``.
    """
    sample_rate = audio_cfg["sample_rate"]
    window_size = audio_cfg["window_size"]  # used for both n_fft and win_length
    hop_size = audio_cfg["hop_size"]
    freq_min = audio_cfg["fmin"]
    freq_max = audio_cfg["fmax"]

    # These settings are intended to align with the equivalent librosa call:
    #   librosa.feature.melspectrogram(waveform, sr=sample_rate,
    #       n_fft=window_size, hop_length=hop_size, win_length=window_size,
    #       center=True, pad_mode="reflect", power=2.0, n_mels=64,
    #       norm=None, htk=True, f_min=fmin, f_max=fmax)
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate,
        n_fft=window_size,
        win_length=window_size,
        hop_length=hop_size,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm=None,
        onesided=True,
        n_mels=64,
        f_min=freq_min,
        f_max=freq_max,
    )
    mel_transform = mel_transform.to(audio_data.device)
    power_mel = mel_transform(audio_data)  # (n_mels, T)

    # The log-mel spectrogram is what gets used as input downstream.
    to_db = torchaudio.transforms.AmplitudeToDB(top_db=None)
    log_mel = to_db(power_mel)
    return log_mel.T  # (T, n_mels)
| 449 | |
| 450 | |
| 451 | def get_audio_features( |
no outgoing calls
no test coverage detected