ProcessGroup: a process group describes how processes are organized into groups for parallel execution using Tensor Parallelism and Data Parallelism. NOTE: ProcessGroup must be used after torch.distributed has been initialized (e.g. via `torch.distributed.init_process_group()`). Args: rank: the global rank of the current process.
| 35 | |
| 36 | |
| 37 | class ProcessGroup: |
| 38 | """ProcessGroup |
| 39 | Process Group indicates how processes are organized in groups for parallel execution using Tensor Parallelism and Data Parallelism. |
| 40 | |
| 41 | NOTE: the ProcessGroup must be used after torch.distributed has been initialized (e.g. via `torch.distributed.init_process_group()`). |
| 42 | |
| 43 | |
| 44 | Args: |
| 45 | rank: the global rank of the current process. |
| 46 | ranks: List[int], a list of rank ids belonging to this process group. |
| 47 | backend: str, the backend of the process group. |
| 48 | tp_degree: Optional[int], tensor parallelism degree. How many processes are inside a tp process group. When None, it is derived as len(ranks) // dp_degree if dp_degree is given, otherwise 1. |
| 49 | dp_degree: Optional[int], data parallelism degree. How many processes are inside a dp process group. When None, it is derived as len(ranks) // tp_degree if tp_degree is given, otherwise len(ranks). |
| 50 | """ |
| 51 | |
| 52 | def __init__( |
| 53 | self, |
| 54 | rank: Optional[int] = None, |
| 55 | ranks: Optional[List[int]] = None, |
| 56 | tp_degree: Optional[int] = None, |
| 57 | dp_degree: Optional[int] = None, |
| 58 | ) -> None: |
| 59 | if not torch.distributed.is_initialized(): |
| 60 | self.is_init = False |
| 61 | return |
| 62 | global PYTORCHPGDICT_ |
| 63 | if PYTORCHPGDICT_ is None: |
| 64 | PYTORCHPGDICT_ = PyTorchProcessGroupDict() |
| 65 | |
| 66 | assert torch.distributed.is_initialized(), f"ProcessGroup must be used after distributed initialized" |
| 67 | |
| 68 | self._rank = torch.distributed.get_rank() |
| 69 | if rank is not None: |
| 70 | assert self._rank == rank # make sure that the global rank is correct |
| 71 | |
| 72 | if ranks is None: |
| 73 | self._rank_list = list(range(torch.distributed.get_world_size())) |
| 74 | else: |
| 75 | self._rank_list = ranks |
| 76 | self._rank_list.sort() # ensure that the list is in order |
| 77 | |
| 78 | self._world_size = len(self._rank_list) |
| 79 | |
| 80 | if dp_degree is None and tp_degree is None: |
| 81 | self._dp_degree = self._world_size |
| 82 | self._tp_degree = 1 |
| 83 | elif dp_degree and not tp_degree: |
| 84 | self._dp_degree = dp_degree |
| 85 | assert ( |
| 86 | self._world_size % self._dp_degree == 0 |
| 87 | ), f"DP degree {dp_degree} should be divisible by {self._world_size} hen DP degree is None" |
| 88 | self._tp_degree = self._world_size // dp_degree |
| 89 | elif not dp_degree and tp_degree: |
| 90 | self._tp_degree = tp_degree |
| 91 | assert ( |
| 92 | self._world_size % self._tp_degree == 0 |
| 93 | ), f"TP degree {tp_degree} should be divisible by {self._world_size} when DP degree is None" |
| 94 | self._dp_degree = self._world_size // tp_degree |
no outgoing calls
searching dependent graphs…