(pg)
| 31 | |
| 32 | |
def process_group_is_intranode(pg):
    """Return True when every rank in ``pg`` maps to the same node.

    Global ranks are mapped to a node index with ``rank // local_world_size``,
    so the result is only meaningful when ranks are laid out contiguously per
    node with the same number of processes on each node.

    Args:
        pg: Process group to inspect. ``None`` selects the default group.

    Returns:
        bool: ``True`` if all ranks of the group share one node index.

    Note:
        The per-node process count is taken from the first of
        ``LOCAL_WORLD_SIZE``, ``OMPI_COMM_WORLD_LOCAL_SIZE`` or
        ``SLURM_TASKS_PER_NODE`` that is set; otherwise it falls back to
        ``torch.cuda.device_count()``. If that fallback yields 0 the integer
        division below raises ``ZeroDivisionError``.
    """
    if pg is None:
        from torch.distributed.distributed_c10d import _get_default_group

        pg = _get_default_group()

    local_world_size = None
    for var in ("LOCAL_WORLD_SIZE", "OMPI_COMM_WORLD_LOCAL_SIZE", "SLURM_TASKS_PER_NODE"):
        value = os.environ.get(var)
        if value is None:
            continue
        # Read the variable that was actually found (not always
        # LOCAL_WORLD_SIZE). Slurm may report compressed forms such as
        # "8(x2)" or "2,1"; keep only the leading task count.
        local_world_size = int(value.split("(")[0].split(",")[0])
        # First match wins, so the tuple order defines the precedence.
        break
    if local_world_size is None:
        local_world_size = torch.cuda.device_count()

    group_ranks = dist.get_process_group_ranks(pg)
    group_ranks_node_ids = [rank // local_world_size for rank in group_ranks]
    return min(group_ranks_node_ids) == max(group_ranks_node_ids)
| 49 | |
| 50 | |
| 51 | def cast_to_fp8( |
no test coverage detected
searching dependent graphs…