MCPcopy
hub / github.com/k2-fsa/OmniVoice / main

Function main

omnivoice/scripts/extract_audio_tokens_add_noise.py:506–815  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

504
505
506def main() -> None:
507 formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s"
508 logging.basicConfig(format=formatter, level=logging.INFO, force=True)
509 parser = build_parser()
510 args = parser.parse_args()
511 mp.set_start_method("spawn", force=True)
512
513 # Validate input arguments
514 assert bool(args.input_manifest) != bool(
515 args.input_jsonl
516 ), "Exactly one of --input_manifest or --input_jsonl must be provided."
517
518 if args.num_machines > 1:
519 assert (
520 0 <= args.machine_index < args.num_machines
521 ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})"
522
523 # Build base dataset and count total samples based on input mode
524 if args.input_jsonl:
525 logging.info(f"Input mode: raw JSONL ({args.input_jsonl})")
526 total_samples = count_lines(args.input_jsonl)
527 base_dataset = JsonlDatasetReader(
528 args.input_jsonl,
529 sample_rate=HIGGS_INPUT_SAMPLE_RATE,
530 shuffle=args.shuffle,
531 shuffle_seed=args.shuffle_seed,
532 )
533 loader_workers = args.loader_workers
534 else:
535 logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})")
536 manifest_num_lines = count_lines(args.input_manifest)
537 loader_workers = min(args.loader_workers, manifest_num_lines)
538 total_samples = 0
539 manifests = []
540 with open(args.input_manifest, "r", encoding="utf-8") as f:
541 for line_id, line in tqdm(
542 enumerate(f),
543 total=manifest_num_lines,
544 desc="Calculating dataset length",
545 ):
546 items = line.strip().split(" ")
547 tar_path, jsonl_path, num_items, duration = (
548 items[0],
549 items[1],
550 int(items[2]),
551 float(items[3]),
552 )
553 assert os.path.exists(tar_path), f"File {tar_path} does not exist."
554 assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist."
555 assert jsonl_path.endswith(
556 ".jsonl"
557 ), f"File {jsonl_path} is not a .jsonl file."
558 if (
559 args.num_machines > 1
560 and line_id % args.num_machines != args.machine_index
561 ):
562 continue
563 total_samples += num_items

Calls 8

JsonlDatasetReader (Class) · 0.90
WebDatasetReader (Class) · 0.90
close (Method) · 0.80
build_parser (Function) · 0.70
count_lines (Function) · 0.70
drain_completed (Function) · 0.70
submit (Method) · 0.45

Tested by

no test coverage detected