(wav: np.ndarray, text: str, out_dir: Path, basename: str,
skip_existing: bool, hparams)
| 17 | |
| 18 | |
def _process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                       skip_existing: bool, hparams):
    """Save one utterance as an (audio, mel spectrogram) pair of .npy files.

    Reference notes, carried over from the original author for anyone who
    changes this function or implements their own synthesizer:
      - Both the audio and the mel spectrogram are saved as numpy arrays.
      - No processing is done to the audio saved to disk beyond volume
        normalization (in split_on_silences).
      - Pre-emphasis is, however, applied to the audio before the mel
        spectrogram is computed, which is why it is re-applied on the audio
        on the vocoder side.
      - Librosa pads the waveform before computing the mel spectrogram, while
        the waveform is saved here without extra padding. There is therefore
        no exact relation between the length of the wav and the length of the
        mel spectrogram. See the vocoder data loader.

    Args:
        wav: Waveform of the utterance.
        text: Transcript of the utterance; returned unchanged in the result.
        out_dir: Output root. Files go to its "mels" and "audio" subfolders,
            which this function does not create.
        basename: Identifier inserted into the output file names.
        skip_existing: When true, do nothing if both output files already
            exist.
        hparams: Settings object. The attributes read here are trim_silence,
            utterance_min_duration, sample_rate, max_mel_frames and
            clip_mels_length.

    Returns:
        None when the utterance is skipped (outputs already present, too
        short, or too long). Otherwise the tuple
        (audio file name, mel file name, embed file name, number of wav
        samples, number of mel frames, text). The embed file name is only
        reported; no embedding is written by this function.
    """
    # Resolve both destinations up front so the skip check can look at them.
    mel_fpath = out_dir / "mels" / ("mel-%s.npy" % basename)
    wav_fpath = out_dir / "audio" / ("audio-%s.npy" % basename)

    # Nothing to do if this utterance was already written out.
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Optionally strip silence, leaving the volume untouched.
    if hparams.trim_silence:
        wav = encoder.preprocess_wav(wav, normalize=False, trim_silence=True)

    # Drop utterances shorter than the minimum duration (expressed in samples).
    n_samples = len(wav)
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if n_samples < min_samples:
        return None

    # Compute the mel spectrogram; axis 1 of the result indexes frames.
    mel = audio.melspectrogram(wav, hparams).astype(np.float32)
    n_frames = mel.shape[1]

    # Drop utterances that are too long, but only when clipping is enabled.
    if n_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Persist the spectrogram (transposed) and the audio; pickling is disabled.
    np.save(mel_fpath, mel.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    # Describe this training example for the caller.
    embed_name = "embed-%s.npy" % basename
    return wav_fpath.name, mel_fpath.name, embed_name, n_samples, n_frames, text
| 62 | |
| 63 | |
| 64 | def _split_on_silences(wav_fpath, words, hparams): |
no test coverage detected