Calculate and add audio features to sample. sample: a dict containing all the data of the current sample. audio_data: a tensor of shape (T) containing audio data. max_len: the maximum length of audio data. data_truncating: the method of truncating data. data_filling: the method of filling data. audio_cfg: a dict containing audio configuration; comes from model_cfg['audio_cfg'].
(
sample, audio_data, max_len, data_truncating, data_filling, audio_cfg
)
| 449 | |
| 450 | |
| 451 | def get_audio_features( |
| 452 | sample, audio_data, max_len, data_truncating, data_filling, audio_cfg |
| 453 | ): |
| 454 | """ |
| 455 | Calculate and add audio features to sample. |
| 456 | Sample: a dict containing all the data of current sample. |
| 457 | audio_data: a tensor of shape (T) containing audio data. |
| 458 | max_len: the maximum length of audio data. |
| 459 | data_truncating: the method of truncating data. |
| 460 | data_filling: the method of filling data. |
| 461 | audio_cfg: a dict containing audio configuration. Comes from model_cfg['audio_cfg']. |
| 462 | """ |
| 463 | with torch.no_grad(): |
| 464 | if len(audio_data) > max_len: |
| 465 | if data_truncating == "rand_trunc": |
| 466 | longer = torch.tensor([True]) |
| 467 | elif data_truncating == "fusion": |
| 468 | # fusion |
| 469 | mel = get_mel(audio_data, audio_cfg) |
| 470 | # split to three parts |
| 471 | chunk_frames = ( |
| 472 | max_len // audio_cfg["hop_size"] + 1 |
| 473 | ) # the +1 related to how the spectrogram is computed |
| 474 | total_frames = mel.shape[0] |
| 475 | if chunk_frames == total_frames: |
| 476 | # there is a corner case where the audio length is |
| 477 | # larger than max_len but smaller than max_len+hop_size. |
| 478 | # In this case, we just use the whole audio. |
| 479 | mel_fusion = torch.stack([mel, mel, mel, mel], dim=0) |
| 480 | sample["mel_fusion"] = mel_fusion |
| 481 | longer = torch.tensor([False]) |
| 482 | else: |
| 483 | ranges = np.array_split( |
| 484 | list(range(0, total_frames - chunk_frames + 1)), 3 |
| 485 | ) |
| 486 | # print('total_frames-chunk_frames:', total_frames-chunk_frames, |
| 487 | # 'len(audio_data):', len(audio_data), |
| 488 | # 'chunk_frames:', chunk_frames, |
| 489 | # 'total_frames:', total_frames) |
| 490 | if len(ranges[1]) == 0: |
| 491 | # if the audio is too short, we just use the first chunk |
| 492 | ranges[1] = [0] |
| 493 | if len(ranges[2]) == 0: |
| 494 | # if the audio is too short, we just use the first chunk |
| 495 | ranges[2] = [0] |
| 496 | # randomly choose index for each part |
| 497 | idx_front = np.random.choice(ranges[0]) |
| 498 | idx_middle = np.random.choice(ranges[1]) |
| 499 | idx_back = np.random.choice(ranges[2]) |
| 500 | # select mel |
| 501 | mel_chunk_front = mel[idx_front : idx_front + chunk_frames, :] |
| 502 | mel_chunk_middle = mel[idx_middle : idx_middle + chunk_frames, :] |
| 503 | mel_chunk_back = mel[idx_back : idx_back + chunk_frames, :] |
| 504 | |
| 505 | # shrink the mel |
| 506 | mel_shrink = torchvision.transforms.Resize(size=[chunk_frames, 64])( |
| 507 | mel[None] |
| 508 | )[0] |
no test coverage detected