Load audio/text/image/video data from various input formats. Args: data_or_path_or_list: File path, URL, numpy array, torch Tensor, bytes, or list. fs (int): Target sample rate (default 16000). audio_fs (int): Source audio sample rate. data_type (str): Input type ("sound", "text", "fbank").
(
data_or_path_or_list,
fs: int = 16000,
audio_fs: int = 16000,
data_type="sound",
tokenizer=None,
**kwargs,
)
| 46 | |
| 47 | |
| 48 | def load_audio_text_image_video( |
| 49 | data_or_path_or_list, |
| 50 | fs: int = 16000, |
| 51 | audio_fs: int = 16000, |
| 52 | data_type="sound", |
| 53 | tokenizer=None, |
| 54 | **kwargs, |
| 55 | ): |
| 56 | """Load audio/text/image/video data from various input formats. |
| 57 | |
| 58 | Args: |
| 59 | data_or_path_or_list: File path, URL, numpy array, torch Tensor, bytes, or list. |
| 60 | fs (int): Target sample rate (default 16000). |
| 61 | audio_fs (int): Source audio sample rate. |
| 62 | data_type (str): Input type ("sound", "text", "fbank"). |
| 63 | |
| 64 | Returns: |
| 65 | torch.Tensor or list: Loaded and resampled audio tensor(s). |
| 66 | """ |
| 67 | if isinstance(data_or_path_or_list, (list, tuple)): |
| 68 | if data_type is not None and isinstance(data_type, (list, tuple)): |
| 69 | data_types = [data_type] * len(data_or_path_or_list) |
| 70 | data_or_path_or_list_ret = [[] for d in data_type] |
| 71 | for i, (data_type_i, data_or_path_or_list_i) in enumerate( |
| 72 | zip(data_types, data_or_path_or_list) |
| 73 | ): |
| 74 | for j, (data_type_j, data_or_path_or_list_j) in enumerate( |
| 75 | zip(data_type_i, data_or_path_or_list_i) |
| 76 | ): |
| 77 | data_or_path_or_list_j = load_audio_text_image_video( |
| 78 | data_or_path_or_list_j, |
| 79 | fs=fs, |
| 80 | audio_fs=audio_fs, |
| 81 | data_type=data_type_j, |
| 82 | tokenizer=tokenizer, |
| 83 | **kwargs, |
| 84 | ) |
| 85 | data_or_path_or_list_ret[j].append(data_or_path_or_list_j) |
| 86 | |
| 87 | return data_or_path_or_list_ret |
| 88 | else: |
| 89 | return [ |
| 90 | load_audio_text_image_video( |
| 91 | audio, fs=fs, audio_fs=audio_fs, data_type=data_type, **kwargs |
| 92 | ) |
| 93 | for audio in data_or_path_or_list |
| 94 | ] |
| 95 | if isinstance(data_or_path_or_list, str) and data_or_path_or_list.startswith( |
| 96 | ("http://", "https://") |
| 97 | ): # download url to local file |
| 98 | data_or_path_or_list = download_from_url(data_or_path_or_list) |
| 99 | |
| 100 | # Fail fast with a clear error if an audio file path does not exist, instead of |
| 101 | # silently passing the string downstream (which later crashes with a cryptic |
| 102 | # "expected Tensor ... but got str" deep inside the model). |
| 103 | if ( |
| 104 | isinstance(data_or_path_or_list, str) |
| 105 | and data_type in (None, "sound") |
no test coverage detected
searching dependent graphs…