()
| 370 | |
| 371 | |
| 372 | def main(): |
| 373 | parser = get_parser() |
| 374 | args = parser.parse_args() |
| 375 | |
| 376 | logging.basicConfig( |
| 377 | format="%(asctime)s %(levelname)s [%(filename)s:%(lineno)d] %(message)s", |
| 378 | level=logging.INFO, |
| 379 | force=True, |
| 380 | ) |
| 381 | |
| 382 | # 1. Prepare Data |
| 383 | logging.info("Reading test list...") |
| 384 | data_by_lang = defaultdict(list) |
| 385 | total_files = 0 |
| 386 | wav_root = Path(args.wav_path) |
| 387 | |
| 388 | samples = read_test_list(args.test_list) |
| 389 | for s in samples: |
| 390 | wav_path = str(wav_root / f"{s['id']}.{args.extension}") |
| 391 | if not os.path.exists(wav_path): |
| 392 | logging.warning(f"File missing: {wav_path}") |
| 393 | continue |
| 394 | |
| 395 | lang_id = s.get("language_id") or "unknown" |
| 396 | lang_name = s.get("language_name") or "unknown" |
| 397 | |
| 398 | item = { |
| 399 | "wav_path": wav_path, |
| 400 | "truth_text": s["text"], |
| 401 | "lang_id": lang_id, |
| 402 | "lang_name": lang_name, |
| 403 | } |
| 404 | if args.lang and s.get("language_id") != args.lang: |
| 405 | continue |
| 406 | |
| 407 | data_by_lang[lang_name].append(item) |
| 408 | total_files += 1 |
| 409 | |
| 410 | logging.info(f"Total files: {total_files} in {len(data_by_lang)} languages.") |
| 411 | |
| 412 | # 2. Worker config |
| 413 | num_gpus = torch.cuda.device_count() |
| 414 | assert num_gpus > 0, "No GPU found. GPU is required." |
| 415 | total_workers = num_gpus * args.nj_per_gpu |
| 416 | |
| 417 | mp.set_start_method("spawn", force=True) |
| 418 | manager = mp.Manager() |
| 419 | |
| 420 | # 3. Scheduling: Split data into Chinese (Paraformer) and non-Chinese (Whisper) |
| 421 | zh_items = [] |
| 422 | non_zh_items = [] |
| 423 | for lang_name, items in data_by_lang.items(): |
| 424 | lang_id = items[0].get("lang_id", "") if items else "" |
| 425 | if lang_name == "Chinese" or (lang_id and lang_id.startswith("zh")): |
| 426 | zh_items.extend(items) |
| 427 | else: |
| 428 | non_zh_items.extend(items) |
| 429 |
no test coverage detected