(args)
| 207 | |
| 208 | |
def init_distributed_mode(args):
    """Initialise ``torch.distributed`` from launcher environment variables.

    The launcher is detected from the environment, checked in this order:

    1. ``OMPI_COMM_WORLD_RANK`` -- rank and world size come from the
       ``OMPI_COMM_WORLD_*`` variables; the GPU index is
       ``rank % torch.cuda.device_count()``.
    2. ``RANK`` and ``WORLD_SIZE`` -- rank and world size come from those
       variables and the GPU index from ``LOCAL_RANK``.
    3. ``SLURM_PROCID`` -- rank comes from ``SLURM_PROCID``; the GPU index is
       ``rank % torch.cuda.device_count()``.  A ``world_size`` already present
       on ``args`` is kept; when it is missing or ``None`` it is taken from
       ``SLURM_NTASKS`` if that variable is set.

    If none of these match, a message is printed, ``args.distributed`` is set
    to ``False`` and the function returns without touching anything else.

    Args:
        args: Mutable namespace-like object.  ``args.dist_url`` must already
            be set when a distributed launcher is detected.  On that path the
            function writes ``rank``, ``world_size`` (see above), ``gpu``,
            ``distributed`` and ``dist_backend`` onto it.

    Returns:
        None.  Results are communicated by mutating ``args``.
    """
    env = os.environ

    if 'OMPI_COMM_WORLD_RANK' in env:
        args.rank = int(env.get('OMPI_COMM_WORLD_RANK'))
        args.world_size = int(env.get('OMPI_COMM_WORLD_SIZE'))
        args.gpu = args.rank % torch.cuda.device_count()
    elif 'RANK' in env and 'WORLD_SIZE' in env:
        args.rank = int(env['RANK'])
        args.world_size = int(env['WORLD_SIZE'])
        args.gpu = int(env['LOCAL_RANK'])
    elif 'SLURM_PROCID' in env:
        args.rank = int(env['SLURM_PROCID'])
        # This branch previously never assigned world_size, so
        # init_process_group below failed unless the caller had populated it.
        # Only fill the gap; an explicit caller-supplied value still wins.
        if getattr(args, 'world_size', None) is None and 'SLURM_NTASKS' in env:
            args.world_size = int(env['SLURM_NTASKS'])
        args.gpu = args.rank % torch.cuda.device_count()
    else:
        print('Not using distributed mode')
        args.distributed = False
        return

    args.distributed = True

    # Bind this process to its GPU before the process group is created.
    torch.cuda.set_device(args.gpu)
    args.dist_backend = 'nccl'
    print('| distributed init (rank {}): {}'.format(
        args.rank, args.dist_url), flush=True)
    torch.distributed.init_process_group(
        backend=args.dist_backend,
        init_method=args.dist_url,
        world_size=args.world_size,
        rank=args.rank,
    )
    # Wait until every rank has joined the group.
    torch.distributed.barrier()
    # The helper is defined elsewhere in the file; it receives True only on
    # rank 0 (presumably to restrict console output -- confirm in the helper).
    setup_for_distributed(args.rank == 0)
nothing calls this directly
no test coverage detected