| 58 | |
| 59 | |
def start_local_trainers_cpu(trainer_endpoints, training_script, training_script_args, log_dir=None):
    """Spawn one local trainer subprocess per endpoint with the gloo backend.

    Every child receives a copy of the current process environment with
    ``http_proxy`` / ``https_proxy`` removed and the ``PADDLE_*`` variables
    for its rank added, and is launched as ``python -u <training_script>``.

    Args:
        trainer_endpoints: Sized sequence of endpoint strings. One process is
            started per entry; the entry's position is used as the rank.
        training_script: Path of the script to run. It is passed to the child
            as a single argument, so paths containing spaces work.
        training_script_args: Accepted for interface compatibility; not used
            by this function.
        log_dir: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A list of ``TrainerProc`` objects, one per endpoint, in rank order,
        with ``proc``, ``rank``, ``log_fn`` (always ``None``) and ``cmd`` set.

    Raises:
        AssertionError: If at least one endpoint is given and the
            ``WITH_COVERAGE`` environment variable is set to anything other
            than ``"OFF"``.
    """
    # os.environ.copy() already returns an independent dict; no second copy
    # is needed.
    base_env = os.environ.copy()
    base_env.pop("http_proxy", None)
    base_env.pop("https_proxy", None)

    procs = []
    n_rank = len(trainer_endpoints)
    print(trainer_endpoints)

    # Identical for every rank, so compute it once.
    all_endpoints = ",".join(trainer_endpoints)

    for rank_id, endpoint in enumerate(trainer_endpoints):
        proc_env = {
            "PADDLE_DISTRI_BACKEND": "gloo",
            "PADDLE_TRAINER_ID": "%d" % rank_id,
            "PADDLE_CURRENT_ENDPOINT": "%s" % endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % n_rank,
            "PADDLE_TRAINER_ENDPOINTS": all_endpoints,
        }

        # Give each child its own environment dict instead of mutating one
        # shared dict between Popen calls.
        child_env = {**base_env, **proc_env}

        print("trainer proc env:{}".format(child_env))

        # Raised explicitly (rather than via `assert`) so the guard is not
        # stripped when Python runs with -O; the exception type is unchanged.
        if os.getenv("WITH_COVERAGE", "OFF") != "OFF":
            raise AssertionError("Gloo don't support WITH_COVERAGE.")

        # Human-readable command line, kept for logging and TrainerProc.cmd.
        cmd = "python -u " + training_script
        # Build argv as a list: splitting `cmd` on spaces would break any
        # script path that contains a space.
        argv = ["python", "-u", training_script]

        print("start trainer proc:{} env:{}".format(cmd, proc_env))

        proc = subprocess.Popen(argv, env=child_env)

        tp = TrainerProc()
        tp.proc = proc
        tp.rank = rank_id
        tp.log_fn = None  # no log file is opened by this launcher
        tp.cmd = cmd

        procs.append(tp)

    return procs
| 99 | |
| 100 | |
| 101 | def start_local_trainers( |