(cfg, main_port: int)
| 42 | |
| 43 | |
def single_node_runner(cfg, main_port: int):
    """Run a single-node job, in-process for one worker or via spawned workers.

    Args:
        cfg: Run configuration. ``cfg.launcher.num_nodes`` must be 1 and
            ``cfg.launcher.gpus_per_node`` gives the number of worker
            processes (also passed on as the world size).
        main_port: Port forwarded unchanged to ``single_proc_run``.

    Raises:
        AssertionError: If ``cfg.launcher.num_nodes`` is not 1.
    """
    assert cfg.launcher.num_nodes == 1
    num_proc = cfg.launcher.gpus_per_node
    # CUDA runtime does not support `fork`, so make "spawn" the default
    # start method for this process.
    torch.multiprocessing.set_start_method("spawn")
    if num_proc == 1:
        # Call single_proc_run directly so breakpoints can be set easily;
        # processes launched through torch.multiprocessing do not allow that.
        single_proc_run(local_rank=0, main_port=main_port, cfg=cfg, world_size=num_proc)
    else:
        mp_runner = torch.multiprocessing.start_processes
        # start_processes calls fn(i, *args) with the process index first;
        # presumably that index is single_proc_run's local_rank -- confirm
        # against its signature.
        args = (main_port, cfg, num_proc)
        # Workers are started with "spawn", consistent with the start method
        # set above.
        # NOTE(review): this comment previously claimed "fork" was used here
        # because "spawn" caused time and error regressions with the
        # dataloaders (likely due to the use of OpenCV). The code passes
        # "spawn"; confirm that concern no longer applies.
        mp_runner(single_proc_run, args=args, nprocs=num_proc, start_method="spawn")
| 61 | |
| 62 | |
| 63 | def format_exception(e: Exception, limit=20): |
no test coverage detected