Args: main_func: a function that will be called by `main_func(*args)`; num_machines (int): the total number of machines; machine_rank (int): the rank of this machine (one per machine); dist_url (str): url to connect to for distributed training, including protocol
(
main_func,
num_gpus_per_machine,
num_machines=1,
machine_rank=0,
backend="nccl",
dist_url=None,
args=(),
)
| 38 | |
| 39 | |
def launch(
    main_func,
    num_gpus_per_machine,
    num_machines=1,
    machine_rank=0,
    backend="nccl",
    dist_url=None,
    args=(),
):
    """
    Run ``main_func``, either directly or through the distributed entry points.

    The total world size is ``num_machines * num_gpus_per_machine``. When it is
    at most 1, ``main_func(*args)`` is called in the current process. Otherwise:

    * if the ``WORLD_SIZE`` environment variable is greater than 1, the worker
      entry point is called in this process using ``LOCAL_RANK``,
      ``WORLD_SIZE``, ``MASTER_ADDR`` and ``MASTER_PORT`` from the environment,
      and the process then exits;
    * else the work is handed to ``launch_by_subprocess`` together with
      ``sys.argv``.

    Args:
        main_func: a function that will be called by `main_func(*args)`
        num_gpus_per_machine (int): the number of GPUs per machine
        num_machines (int): the total number of machines
        machine_rank (int): the rank of this machine (one per machine)
        backend (str): forwarded to the worker entry point on the
            environment-variable path; it is not passed to
            ``launch_by_subprocess``
        dist_url (str): url to connect to for distributed training, including protocol
            e.g. "tcp://127.0.0.1:8686".
            Can be set to auto to automatically select a free port on localhost.
            Overridden by ``MASTER_ADDR:MASTER_PORT`` when ``WORLD_SIZE`` > 1
            in the environment.
        args (tuple): arguments passed to main_func

    Raises:
        SystemExit: after the worker returns on the environment-variable path.
    """
    world_size = num_machines * num_gpus_per_machine
    if world_size <= 1:
        # Nothing to distribute: run in-process.
        main_func(*args)
        return

    env_world_size = int(os.environ.get("WORLD_SIZE", "1"))
    if env_world_size > 1:
        # Rendezvous settings come from the environment and replace both the
        # caller-supplied dist_url and the computed world size.
        # NOTE(review): a missing MASTER_ADDR / MASTER_PORT yields the literal
        # "None:None" and no protocol prefix is added here -- confirm that
        # _distributed_worker copes with both.
        dist_url = "{}:{}".format(
            os.environ.get("MASTER_ADDR", None),
            os.environ.get("MASTER_PORT", "None"),
        )
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        _distributed_worker(
            local_rank,
            main_func,
            env_world_size,
            num_gpus_per_machine,
            num_machines,
            machine_rank,
            backend,
            dist_url,
            args,
        )
        # sys.exit() rather than the bare exit(): the latter is injected by the
        # `site` module for interactive use and may be absent (python -S,
        # frozen or embedded interpreters).
        sys.exit()

    launch_by_subprocess(
        sys.argv,
        world_size,
        num_machines,
        machine_rank,
        num_gpus_per_machine,
        dist_url,
        args,
    )
| 91 | |
| 92 | |
| 93 | def launch_by_subprocess( |
no test coverage detected