MCPcopy Index your code
hub / github.com/modelscope/FunASR / prepare_data_iterator

Function prepare_data_iterator

funasr/auto/auto_model.py:111–179  ·  view source on GitHub ↗

(data_in, input_len=None, data_type=None, key=None)

Source from the content-addressed store, hash-verified

109
110
111def prepare_data_iterator(data_in, input_len=None, data_type=None, key=None):
112 """ """
113 data_list = []
114 key_list = []
115 filelist = [".scp", ".txt", ".json", ".jsonl", ".text"]
116
117 chars = string.ascii_letters + string.digits
118 if isinstance(data_in, str):
119 if data_in.startswith("http://") or data_in.startswith("https://"): # url
120 data_in = download_from_url(data_in)
121
122 if isinstance(data_in, str) and os.path.exists(
123 data_in
124 ): # wav_path; filelist: wav.scp, file.jsonl;text.txt;
125 _, file_extension = os.path.splitext(data_in)
126 file_extension = file_extension.lower()
127 if file_extension in filelist: # filelist: wav.scp, file.jsonl;text.txt;
128 with open(data_in, encoding="utf-8") as fin:
129 for line in fin:
130 key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
131 if data_in.endswith(".jsonl"): # file.jsonl: json.dumps({"source": data})
132 lines = json.loads(line.strip())
133 data = lines["source"]
134 key = lines.get("key", key)
135 else: # filelist, wav.scp, text.txt: id \t data or data
136 lines = line.strip().split(maxsplit=1)
137 data = lines[1] if len(lines) > 1 else lines[0]
138 key = lines[0] if len(lines) > 1 else key
139
140 data_list.append(data)
141 key_list.append(key)
142 else:
143 if key is None:
144 # key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))
145 key = misc.extract_filename_without_extension(data_in)
146 data_list = [data_in]
147 key_list = [key]
148 elif isinstance(data_in, (list, tuple)):
149 if data_type is not None and isinstance(data_type, (list, tuple)): # mutiple inputs
150 data_list_tmp = []
151 for data_in_i, data_type_i in zip(data_in, data_type):
152 key_list, data_list_i = prepare_data_iterator(
153 data_in=data_in_i, data_type=data_type_i
154 )
155 data_list_tmp.append(data_list_i)
156 data_list = []
157 for item in zip(*data_list_tmp):
158 data_list.append(item)
159 else:
160 # [audio sample point, fbank, text]
161 data_list = data_in
162 key_list = []
163 for data_i in data_in:
164 if isinstance(data_i, str) and os.path.exists(data_i):
165 key = misc.extract_filename_without_extension(data_i)
166 else:
167 if key is None:
168 key = "rand_key_" + "".join(random.choice(chars) for _ in range(13))

Callers 4

__call__ · Method · 0.90
inference · Method · 0.85
inference_with_vad · Method · 0.85
export · Method · 0.85

Calls 2

download_from_url · Function · 0.90
load_bytes · Function · 0.90

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…