MCPcopy
hub / github.com/facebookresearch/dlrm / init_distributed

Function init_distributed

extend_distributed.py:65–191  ·  view source on GitHub ↗
(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend="")

Source from the content-addressed store, hash-verified

63
64
65def init_distributed(rank=-1, local_rank=-1, size=-1, use_gpu=False, backend=""):
66 global myreq
67 global my_rank
68 global my_size
69 global my_local_rank
70 global my_local_size
71 global a2a_impl
72 global alltoall_supported
73
74 # guess MPI ranks from env (works for IMPI, OMPI and MVAPICH2)
75 num_mpi_ranks = env2int(
76 ["PMI_SIZE", "OMPI_COMM_WORLD_SIZE", "MV2_COMM_WORLD_SIZE", "WORLD_SIZE"]
77 )
78 if backend == "" and num_mpi_ranks > 1:
79 if torch_ccl and env2int(["CCL_WORKER_COUNT"]) > 0:
80 backend = "ccl"
81 elif use_gpu and dist.is_nccl_available():
82 backend = "nccl"
83 elif dist.is_mpi_available():
84 backend = "mpi"
85 else:
86 print(
87 "WARNING: MPI multi-process launch detected but PyTorch MPI backend not available."
88 )
89 backend = "gloo"
90
91 if backend != "":
92 # guess Rank and size
93 if rank == -1:
94 rank = env2int(
95 ["PMI_RANK", "OMPI_COMM_WORLD_RANK", "MV2_COMM_WORLD_RANK", "RANK"], 0
96 )
97 if size == -1:
98 size = env2int(
99 [
100 "PMI_SIZE",
101 "OMPI_COMM_WORLD_SIZE",
102 "MV2_COMM_WORLD_SIZE",
103 "WORLD_SIZE",
104 ],
105 1,
106 )
107 if not os.environ.get("RANK", None) and rank != -1:
108 os.environ["RANK"] = str(rank)
109 if not os.environ.get("WORLD_SIZE", None) and size != -1:
110 os.environ["WORLD_SIZE"] = str(size)
111 if not os.environ.get("MASTER_PORT", None):
112 os.environ["MASTER_PORT"] = "29500"
113 if not os.environ.get("MASTER_ADDR", None):
114 local_size = env2int(
115 [
116 "MPI_LOCALNRANKS",
117 "OMPI_COMM_WORLD_LOCAL_SIZE",
118 "MV2_COMM_WORLD_LOCAL_SIZE",
119 ],
120 1,
121 )
122 if local_size != size and backend != "mpi":

Callers

nothing calls this directly

Calls 3

env2int · Function · 0.85
print_all · Function · 0.85
Request · Class · 0.85

Tested by

no test coverage detected