()
| 320 | |
| 321 | |
| 322 | def main() -> None: |
| 323 | formatter = "%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s" |
| 324 | logging.basicConfig(format=formatter, level=logging.INFO, force=True) |
| 325 | parser = build_parser() |
| 326 | args = parser.parse_args() |
| 327 | mp.set_start_method("spawn", force=True) |
| 328 | |
| 329 | # Validate input arguments |
| 330 | assert bool(args.input_manifest) != bool( |
| 331 | args.input_jsonl |
| 332 | ), "Exactly one of --input_manifest or --input_jsonl must be provided." |
| 333 | |
| 334 | if args.num_machines > 1: |
| 335 | assert ( |
| 336 | 0 <= args.machine_index < args.num_machines |
| 337 | ), f"machine_index {args.machine_index} must be in [0, {args.num_machines})" |
| 338 | |
| 339 | # Build base dataset and count total samples based on input mode |
| 340 | if args.input_jsonl: |
| 341 | logging.info(f"Input mode: raw JSONL ({args.input_jsonl})") |
| 342 | total_samples = count_lines(args.input_jsonl) |
| 343 | base_dataset = JsonlDatasetReader( |
| 344 | args.input_jsonl, |
| 345 | sample_rate=HIGGS_INPUT_SAMPLE_RATE, |
| 346 | shuffle=args.shuffle, |
| 347 | shuffle_seed=args.shuffle_seed, |
| 348 | ) |
| 349 | loader_workers = args.loader_workers |
| 350 | else: |
| 351 | logging.info(f"Input mode: WebDataset manifest ({args.input_manifest})") |
| 352 | manifest_num_lines = count_lines(args.input_manifest) |
| 353 | loader_workers = min(args.loader_workers, manifest_num_lines) |
| 354 | total_samples = 0 |
| 355 | manifests = [] |
| 356 | with open(args.input_manifest, "r", encoding="utf-8") as f: |
| 357 | for line_id, line in tqdm( |
| 358 | enumerate(f), |
| 359 | total=manifest_num_lines, |
| 360 | desc="Calculating dataset length", |
| 361 | ): |
| 362 | items = line.strip().split(" ") |
| 363 | tar_path, jsonl_path, num_items, duration = ( |
| 364 | items[0], |
| 365 | items[1], |
| 366 | int(items[2]), |
| 367 | float(items[3]), |
| 368 | ) |
| 369 | assert os.path.exists(tar_path), f"File {tar_path} does not exist." |
| 370 | assert os.path.exists(jsonl_path), f"File {jsonl_path} does not exist." |
| 371 | assert jsonl_path.endswith( |
| 372 | ".jsonl" |
| 373 | ), f"File {jsonl_path} is not a .jsonl file." |
| 374 | if ( |
| 375 | args.num_machines > 1 |
| 376 | and line_id % args.num_machines != args.machine_index |
| 377 | ): |
| 378 | continue |
| 379 | total_samples += num_items |
no test coverage detected