This process tries to clean up the remote training tasks.
(get_all_remote_pids, conn)
| 18 | |
| 19 | |
def cleanup_proc(get_all_remote_pids, conn):
    """Wait for the launcher's message and clean up remote training tasks if needed.

    Blocks on ``conn.recv()``. If the received message is ``"exit"``, the launch
    process finished normally and this process exits with status 0. Any other
    message, or the connection being closed without a message (``EOFError``),
    is treated as an abnormal termination: every entry returned by
    ``get_all_remote_pids()`` is passed to ``kill_process``.

    Args:
        get_all_remote_pids: Zero-argument callable returning a mapping whose
            keys are ``(ip, port)`` pairs and whose values are the pids to
            kill for that pair.
        conn: Connection-like object whose ``recv()`` returns the launcher's
            message.
    """
    print("cleanup process runs")
    # This process should not handle SIGINT.
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    try:
        data = conn.recv()
    except EOFError:
        # The other end was closed without sending anything -- presumably the
        # launch process died before it could report a normal exit. Fall
        # through to the cleanup path instead of crashing with jobs still alive.
        data = None

    # If the launch process exits normally, this process doesn't need to do anything.
    if data == "exit":
        sys.exit(0)

    # Otherwise, we need to ssh to each machine and kill the training jobs.
    for (ip, port), pids in get_all_remote_pids().items():
        kill_process(ip, port, pids)
    print("cleanup process exits")
| 36 | |
| 37 | |
| 38 | def kill_process(ip, port, pids): |
nothing calls this directly
no test coverage detected