()
| 504 | |
| 505 | |
| 506 | def main() -> None: |
| 507 | formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" |
| 508 | logging.basicConfig(format=formatter, level=logging.INFO, force=True) |
| 509 | parser = build_parser() |
| 510 | args = parser.parse_args() |
| 511 | mp.set_start_method("spawn", force=True) |
| 512 | |
| 513 | # Validate input arguments |
| 514 | assert bool(args.input_manifest) != bool( |
| 515 | args.input_jsonl |
| 516 | ), "Exactly one of --input_manifest or --input_jsonl must be provided." |
| 517 | |
| 518 | if args.num_machines > 1: |
| 519 | assert ( |
| 520 | 0 <= args.machine_index < args.num_machines |
| 521 | ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})" |
| 522 | |
| 523 | # Build base dataset and count total samples based on input mode |
| 524 | if args.input_jsonl: |
| 525 | logging.info(f"Input mode: raw JSONL ({args.input_jsonl})") |
| 526 | total_samples = count_lines(args.input_jsonl) |
| 527 | base_dataset = JsonlDatasetReader( |
| 528 | args.input_jsonl, |
| 529 | sample_rate=HIGGS_INPUT_SAMPLE_RATE, |
| 530 | shuffle=args.shuffle, |
| 531 | shuffle_seed=args.shuffle_seed, |
| 532 | ) |
| 533 | loader_workers = args.loader_workers |
| 534 | else: |
| 535 | logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})") |
| 536 | manifest_num_lines = count_lines(args.input_manifest) |
| 537 | loader_workers = min(args.loader_workers, manifest_num_lines) |
| 538 | total_samples = 0 |
| 539 | manifests = [] |
| 540 | with open(args.input_manifest, "r", encoding="utf-8") as f: |
| 541 | for line_id, line in tqdm( |
| 542 | enumerate(f), |
| 543 | total=manifest_num_lines, |
| 544 | desc="Calculating dataset length", |
| 545 | ): |
| 546 | items = line.strip().split(" ") |
| 547 | tar_path, jsonl_path, num_items, duration = ( |
| 548 | items[0], |
| 549 | items[1], |
| 550 | int(items[2]), |
| 551 | float(items[3]), |
| 552 | ) |
| 553 | assert os.path.exists(tar_path), f"File {tar_path} does not exist." |
| 554 | assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist." |
| 555 | assert jsonl_path.endswith( |
| 556 | ".jsonl" |
| 557 | ), f"File {jsonl_path} is not a .jsonl file." |
| 558 | if ( |
| 559 | args.num_machines > 1 |
| 560 | and line_id % args.num_machines != args.machine_index |
| 561 | ): |
| 562 | continue |
| 563 | total_samples += num_items |
no test coverage detected