Ensures that segments without voice in the waveform remain no longer than a threshold determined by the VAD parameters in params.py. :param wav: the raw waveform as a numpy array of floats :return: the same waveform with silences trimmed away (length <= original wav length)
(wav)
| 66 | |
| 67 | |
def trim_long_silences(wav):
    """
    Ensures that segments without voice in the waveform remain no longer than a
    threshold determined by the VAD parameters in params.py.

    The waveform is cut into fixed-size windows, each window is classified as
    speech / non-speech by WebRTC's VAD, the per-window decisions are smoothed
    with a moving average and the voiced regions are then dilated so that short
    pauses are kept. Samples in windows that remain unvoiced are dropped.

    :param wav: the raw waveform as a numpy array of floats
    :return: the same waveform with silences trimmed away (length <= original wav length).
        Trailing samples that do not fill a complete VAD window are always discarded.
    """
    # Compute the voice detection window size
    # (the division by 1000 suggests vad_window_length is in milliseconds -- confirm in params.py)
    samples_per_window = (vad_window_length * sampling_rate) // 1000

    # Trim the end of the audio to have a multiple of the window size
    wav = wav[:len(wav) - (len(wav) % samples_per_window)]

    # Less than one complete window is left: there is nothing to analyse, and the
    # steps below would otherwise operate on empty arrays.
    if len(wav) == 0:
        return wav

    # Convert the float waveform to 16-bit mono PCM
    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))

    # Perform voice activation detection, one window at a time.
    # Offsets into pcm_wave are doubled because every 16-bit sample takes two bytes.
    voice_flags = []
    vad = webrtcvad.Vad(mode=3)
    for window_start in range(0, len(wav), samples_per_window):
        window_end = window_start + samples_per_window
        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
                                         sample_rate=sampling_rate))
    voice_flags = np.array(voice_flags)

    # Smooth the voice detection with a moving average
    def moving_average(array, width):
        # Zero-pad both sides so that the output has the same length as the input
        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
        # Windowed sums obtained as differences of the cumulative sum
        ret = np.cumsum(array_padded, dtype=float)
        ret[width:] = ret[width:] - ret[:-width]
        return ret[width - 1:] / width

    audio_mask = moving_average(voice_flags, vad_moving_average_width)
    # The builtin bool is used because the np.bool alias was removed in NumPy 1.24
    audio_mask = np.round(audio_mask).astype(bool)

    # Dilate the voiced regions
    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
    # Expand the per-window mask to a per-sample mask
    audio_mask = np.repeat(audio_mask, samples_per_window)

    return wav[audio_mask]
| 109 | |
| 110 | |
| 111 | def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False): |
no test coverage detected