| 30 | |
| 31 | |
class MultiRunning(object):
    """Launch multi-GPU training with one worker process per device.

    ``args`` is the run configuration; this class reads ``args.world_size``
    and ``args.gpu_ranks`` from it. ``device_id`` is the index of the device
    a given instance trains on.
    """

    def __init__(self, args, device_id):
        self.args = args
        self.device_id = device_id

    def multi_card_run(self):
        """Spawn one training process per GPU and wait for all of them.

        ``args.world_size`` processes are started from a 'spawn'
        multiprocessing context. Each child runs ``multi_card_train`` and
        is registered by pid with the ``ErrorHandler`` that owns the shared
        error queue.
        """
        init_logger()

        nb_gpu = self.args.world_size
        mp = torch.multiprocessing.get_context('spawn')

        # Queue the workers report failures on; ErrorHandler is given the
        # queue and the pid of every child it should watch.
        error_queue = mp.SimpleQueue()
        error_handler = ErrorHandler(error_queue)

        # Train with multiprocessing.
        workers = []
        for device_id in range(nb_gpu):
            # The target is a bound method, so `self` is shipped to the child
            # together with it when the process is started (spawn context);
            # setting device_id right before start() is what gives each
            # worker its own device index.
            self.device_id = device_id
            # multi_card_train takes only the error queue besides `self`;
            # passing extra positional arguments here would make the child
            # die with a TypeError before any training starts.
            worker = mp.Process(target=self.multi_card_train,
                                args=(error_queue,),
                                daemon=True)
            worker.start()
            logger.info(" Starting process pid: %d ", worker.pid)
            error_handler.add_child(worker.pid)
            workers.append(worker)

        for worker in workers:
            worker.join()

    def multi_card_train(self, error_queue):
        """Run training for ``self.device_id`` inside a worker process.

        Args:
            error_queue: object with a ``put`` method; on failure it receives
                a ``(gpu_rank, formatted_traceback)`` tuple.

        Any ``Exception`` raised during initialization or training is
        reported through ``error_queue`` instead of being re-raised.
        ``KeyboardInterrupt`` is swallowed silently.
        """
        # Normalize the configured ranks to ints before they are compared
        # with the rank returned by the distributed initialization.
        self.args.gpu_ranks = [int(rank) for rank in self.args.gpu_ranks]

        try:
            gpu_rank = distributed.multi_init(
                self.device_id, self.args.world_size, self.args.gpu_ranks)
            print('gpu_rank %d' % gpu_rank)
            if gpu_rank != self.args.gpu_ranks[self.device_id]:
                raise AssertionError("An error occurred in Distributed initialization")
            runner = Running(self.args, self.device_id)
            runner.train()
        except KeyboardInterrupt:
            pass  # killed by parent, do nothing
        except Exception:
            # propagate exception to parent process, keeping original traceback
            import traceback
            error_queue.put(
                (self.args.gpu_ranks[self.device_id], traceback.format_exc()))
| 77 | |
| 78 | |
| 79 | class ErrorHandler(object): |