(self, text)
| 547 | return re.findall(pattern, text) |
| 548 | |
def process_audio(self, text):
    """Load every audio clip referenced in *text* and build model inputs.

    The audio references are obtained from ``self.extract_audio_urls(text)``.
    References starting with ``http://`` or ``https://`` are downloaded;
    anything else is passed to ``load_audio`` as-is.

    Args:
        text: Input string that may contain audio references.

    Returns:
        ``None`` when no audio reference is found. Otherwise a dict with:

        * ``"input_audios"``: the per-clip ``log_mel_spectrogram`` outputs
          stacked along a new leading dimension.
        * ``"input_audio_lengths"``: ``torch.IntTensor`` holding one
          ``[audio_len_after_cnn, audio_token_num]`` pair per clip.
        * ``"audio_span_tokens"``: list with ``audio_token_num + 2`` per clip
          (the two extra positions are the audio bos/eos markers).
        * ``"audio_urls"``: the references in the order they were processed.

    Raises:
        requests.HTTPError: if a remote clip answers with a 4xx/5xx status.
        requests.Timeout: if a remote host does not respond in time.
    """
    audio_urls = self.extract_audio_urls(text)
    if not audio_urls:
        return None

    # Sample cap applied before framing; the original code marks this
    # as "max_length < 30s".
    max_samples = 480000
    # Samples per mel frame -- presumably the spectrogram hop length;
    # TODO confirm against log_mel_spectrogram.
    samples_per_frame = 160
    # Seconds to wait on the remote host. Without a timeout, requests can
    # block forever on an unresponsive server.
    request_timeout = 30

    audios, audio_lens, audio_span_tokens = [], [], []
    for audio_path in audio_urls:
        if audio_path.startswith(("http://", "https://")):
            response = requests.get(audio_path, timeout=request_timeout)
            # Fail loudly on HTTP errors instead of feeding an error page
            # to the audio decoder.
            response.raise_for_status()
            audio = load_bytesio_audio(response.content)
        else:
            audio = load_audio(audio_path)

        # The length is measured before pad_or_trim so that the recorded
        # lengths reflect the (capped) real clip, not the padded buffer.
        num_samples = min(audio.shape[0], max_samples)
        mel_len = num_samples // samples_per_frame

        audio = pad_or_trim(audio.flatten())
        mel = log_mel_spectrogram(audio)

        audio_len_after_cnn = get_T_after_cnn(mel_len)
        # NOTE(review): matches the output length of a size-2 / stride-2
        # pooling step -- confirm against the audio encoder.
        audio_token_num = (audio_len_after_cnn - 2) // 2 + 1

        audios.append(mel)
        audio_lens.append([audio_len_after_cnn, audio_token_num])
        audio_span_tokens.append(audio_token_num + 2)  # add audio bos eos

    return {
        "input_audios": torch.stack(audios, dim=0),
        "input_audio_lengths": torch.IntTensor(audio_lens),
        "audio_span_tokens": audio_span_tokens,
        "audio_urls": audio_urls,
    }
| 577 | |
| 578 | |
| 579 |
no test coverage detected