MCPcopy
hub / github.com/k2-fsa/OmniVoice / main

Function main

omnivoice/scripts/extract_audio_tokens.py:322–621  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

320
321
322def main() -> None:
323 formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
324 logging.basicConfig(format=formatter, level=logging.INFO, force=True)
325 parser = build_parser()
326 args = parser.parse_args()
327 mp.set_start_method("spawn", force=True)
328
329 # Validate input arguments
330 assert bool(args.input_manifest) != bool(
331 args.input_jsonl
332 ), "Exactly one of --input_manifest or --input_jsonl must be provided."
333
334 if args.num_machines > 1:
335 assert (
336 0 <= args.machine_index < args.num_machines
337 ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"
338
339 # Build base dataset and count total samples based on input mode
340 if args.input_jsonl:
341 logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
342 total_samples = count_lines(args.input_jsonl)
343 base_dataset = JsonlDatasetReader(
344 args.input_jsonl,
345 sample_rate=HIGGS_INPUT_SAMPLE_RATE,
346 shuffle=args.shuffle,
347 shuffle_seed=args.shuffle_seed,
348 )
349 loader_workers = args.loader_workers
350 else:
351 logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
352 manifest_num_lines = count_lines(args.input_manifest)
353 loader_workers = min(args.loader_workers, manifest_num_lines)
354 total_samples = 0
355 manifests = []
356 with open(args.input_manifest, "r", encoding="utf-8") as f:
357 for line_id, line in tqdm(
358 enumerate(f),
359 total=manifest_num_lines,
360 desc="Calculating dataset length",
361 ):
362 items = line.strip().split(" ")
363 tar_path, jsonl_path, num_items, duration = (
364 items[0],
365 items[1],
366 int(items[2]),
367 float(items[3]),
368 )
369 assert os.path.exists(tar_path), f"File {tar_path} does not exist."
370 assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
371 assert jsonl_path.endswith(
372 ".jsonl"
373 ), f"File {jsonl_path} is not a .jsonl file."
374 if (
375 args.num_machines > 1
376 and line_id % args.num_machines != args.machine_index
377 ):
378 continue
379 total_samples += num_items

Callers 1

Calls 8

JsonlDatasetReader (Class) · 0.90
WebDatasetReader (Class) · 0.90
close (Method) · 0.80
build_parser (Function) · 0.70
count_lines (Function) · 0.70
drain_completed (Function) · 0.70
submit (Method) · 0.45

Tested by

no test coverage detected