(data_in, input_len=None, data_type=None, key=None)
| 109 | |
| 110 | |
| 111 | def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None): |
| 112 | """ """ |
| 113 | data_list = [] |
| 114 | key_list = [] |
| 115 | filelist = [".scp", ".txt", ".json", ".jsonl", ".text"] |
| 116 | |
| 117 | chars = string.ascii_letters + string.digits |
| 118 | if isinstance(data_in, str): |
| 119 | if data_in.startswith("http://") or data_in.startswith("https://"): # url |
| 120 | data_in = download_from_url(data_in) |
| 121 | |
| 122 | if isinstance(data_in, str) and os.path.exists( |
| 123 | data_in |
| 124 | ): # wav_path; filelist: wav.scp, file.jsonl;text.txt; |
| 125 | _, file_extension = os.path.splitext(data_in) |
| 126 | file_extension = file_extension.lower() |
| 127 | if file_extension in filelist: # filelist: wav.scp, file.jsonl;text.txt; |
| 128 | with open(data_in, encoding="utf-8") as fin: |
| 129 | for line in fin: |
| 130 | key = "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
| 131 | if data_in.endswith(".jsonl"): # file.jsonl: json.dumps({"source": data}) |
| 132 | lines = json.loads(line.strip()) |
| 133 | data = lines["source"] |
| 134 | key = lines.get("key", key) |
| 135 | else: # filelist, wav.scp, text.txt: id \t data or data |
| 136 | lines = line.strip().split(maxsplit=1) |
| 137 | data = lines[1] if len(lines) > 1 else lines[0] |
| 138 | key = lines[0] if len(lines) > 1 else key |
| 139 | |
| 140 | data_list.append(data) |
| 141 | key_list.append(key) |
| 142 | else: |
| 143 | if key is None: |
| 144 | # key = "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
| 145 | key = misc.extract_filename_without_extension(data_in) |
| 146 | data_list = [data_in] |
| 147 | key_list = [key] |
| 148 | elif isinstance(data_in, (list, tuple)): |
| 149 | if data_type is not None and isinstance(data_type, (list, tuple)): # multiple inputs |
| 150 | data_list_tmp = [] |
| 151 | for data_in_i, data_type_i in zip(data_in, data_type): |
| 152 | key_list, data_list_i = prepare_data_iterator( |
| 153 | data_in=data_in_i, data_type=data_type_i |
| 154 | ) |
| 155 | data_list_tmp.append(data_list_i) |
| 156 | data_list = [] |
| 157 | for item in zip(*data_list_tmp): |
| 158 | data_list.append(item) |
| 159 | else: |
| 160 | # [audio sample point, fbank, text] |
| 161 | data_list = data_in |
| 162 | key_list = [] |
| 163 | for data_i in data_in: |
| 164 | if isinstance(data_i, str) and os.path.exists(data_i): |
| 165 | key = misc.extract_filename_without_extension(data_i) |
| 166 | else: |
| 167 | if key is None: |
| 168 | key = "rand_key_" + "".join(random.choice(chars) for _ in range(13)) |
no test coverage detected
searching dependent graphs…