(
self,
input: srt,
audio: np.ndarray,
speech_array_indices: List[SPEECH_ARRAY_INDEX],
lang: LANG,
prompt: str,
)
| 177 | self.whisper_model = partial(openai.Audio.transcribe, model=model_name) |
| 178 | |
| 179 | def transcribe( |
| 180 | self, |
| 181 | input: srt, |
| 182 | audio: np.ndarray, |
| 183 | speech_array_indices: List[SPEECH_ARRAY_INDEX], |
| 184 | lang: LANG, |
| 185 | prompt: str, |
| 186 | ) -> List[srt.Subtitle]: |
| 187 | res = [] |
| 188 | name, _ = os.path.splitext(input) |
| 189 | raw_audio = AudioSegment.from_file(input) |
| 190 | ms_bytes = len(raw_audio[:1].raw_data) |
| 191 | audios: List[ |
| 192 | TypedDict( |
| 193 | "AudioInfo", {"input": str, "audio": AudioSegment, "start_ms": float} |
| 194 | ) |
| 195 | ] = [] |
| 196 | |
| 197 | i = 0 |
| 198 | for index in speech_array_indices: |
| 199 | start = int(index["start"]) / self.sample_rate * 1000 |
| 200 | end = int(index["end"]) / self.sample_rate * 1000 |
| 201 | audio_seg = raw_audio[start:end] |
| 202 | if len(audio_seg.raw_data) < self.split_audio_bytes: |
| 203 | temp_file = f"{name}_temp_{i}.wav" |
| 204 | audios.append( |
| 205 | {"input": temp_file, "audio": audio_seg, "start_ms": start} |
| 206 | ) |
| 207 | else: |
| 208 | logging.info( |
| 209 | f"Long audio with a size({len(audio_seg.raw_data)} bytes) greater than 25M({25 * 2 ** 20} bytes) " |
| 210 | "will be segmented" |
| 211 | "due to Openai's API restrictions on files smaller than 25M" |
| 212 | ) |
| 213 | split_num = len(audio_seg.raw_data) // self.split_audio_bytes + 1 |
| 214 | for j in range(split_num): |
| 215 | temp_file = f"{name}_{i}_temp_{j}.wav" |
| 216 | split_audio = audio_seg[ |
| 217 | j |
| 218 | * self.split_audio_bytes |
| 219 | // ms_bytes : (j + 1) |
| 220 | * self.split_audio_bytes |
| 221 | // ms_bytes |
| 222 | ] |
| 223 | audios.append( |
| 224 | { |
| 225 | "input": temp_file, |
| 226 | "audio": split_audio, |
| 227 | "start_ms": start + j * self.split_audio_bytes // ms_bytes, |
| 228 | } |
| 229 | ) |
| 230 | i += 1 |
| 231 | |
| 232 | if len(audios) > 1: |
| 233 | from multiprocessing import Pool |
| 234 | |
| 235 | pbar = tqdm(total=len(audios)) |
| 236 |
no test coverage detected