Sets up the connection for training CogView via MPI. Omit this function if you use the basic DeepSpeed pdsh runner.
(args)
| 376 | |
| 377 | |
def mpi_define_env(args):
    ''' Set up the distributed connection when training CogView via MPI.

    Omit this function if you use the basic deepspeed pdsh runner.

    Rank and world size are read from ``MPI.COMM_WORLD``. Rank 0 runs
    ``hostname -I``, takes the first address it reports and broadcasts it
    to every rank as the master address. The results are exported through
    the ``RANK``, ``WORLD_SIZE``, ``MASTER_ADDR`` and ``MASTER_PORT``
    environment variables and a summary line is printed on every rank.

    Args:
        args: object whose ``local_rank``, ``world_size`` and ``rank``
            attributes are overwritten with the discovered values.

    Raises:
        subprocess.CalledProcessError: if ``hostname -I`` exits non-zero
            on rank 0.
        IndexError: if ``hostname -I`` prints no address on rank 0.
    '''
    # Local imports keep the MPI dependency optional for non-MPI launches.
    import os
    import subprocess

    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    world_size = comm.Get_size()

    master_addr = None
    if rank == 0:
        # Pass an argv list and skip the shell: a list combined with
        # shell=True only works by accident on POSIX and is not needed here.
        result = subprocess.check_output(["hostname", "-I"])
        # `hostname -I` may list several addresses; the first one is used.
        master_addr = result.decode('utf-8').split()[0]
    # Every rank must take part in the broadcast, not just rank 0.
    master_addr = comm.bcast(master_addr, root=0)

    # Determine local rank by assuming hostnames are unique: the local rank
    # is the number of lower-ranked processes reporting the same name.
    proc_name = MPI.Get_processor_name()
    all_procs = comm.allgather(proc_name)
    local_rank = all_procs[:rank].count(proc_name)

    os.environ['RANK'] = str(rank)
    os.environ['WORLD_SIZE'] = str(world_size)
    args.local_rank = local_rank
    args.world_size = world_size
    args.rank = rank
    os.environ['MASTER_ADDR'] = master_addr
    os.environ['MASTER_PORT'] = "29500"  # TORCH_DISTRIBUTED_DEFAULT_PORT = 29500

    print(
        "Discovered MPI settings of world_rank={}, local_rank={}, world_size={}, master_addr={}, master_port={}"
        .format(os.environ['RANK'],
                args.local_rank,
                os.environ['WORLD_SIZE'],
                os.environ['MASTER_ADDR'],
                os.environ['MASTER_PORT']))