run process
(self, error_queue)
| 58 | p.join() |
| 59 | |
def multi_card_train(self, error_queue):
    """Run training for this worker's device and report failures.

    Normalizes ``self.args.gpu_ranks`` to a list of ints, initializes the
    distributed backend through ``distributed.multi_init``, checks that the
    returned rank matches the rank configured for ``self.device_id``, and
    then builds a ``Running`` instance and calls its ``train()`` method.

    Args:
        error_queue: Queue-like object exposing ``put``. On any exception
            other than ``KeyboardInterrupt`` a tuple
            ``(configured_rank, formatted_traceback)`` is put on it.

    Note:
        Nothing is re-raised from inside the ``try`` block: a
        ``KeyboardInterrupt`` is ignored, and every other ``Exception`` is
        reported through ``error_queue`` only.
    """
    # Rank entries may arrive as strings; store them back as ints.
    self.args.gpu_ranks = list(map(int, self.args.gpu_ranks))

    try:
        gpu_rank = distributed.multi_init(
            self.device_id, self.args.world_size, self.args.gpu_ranks
        )
        print('gpu_rank %d' % gpu_rank)

        # The backend must hand back the rank configured for this device.
        expected_rank = self.args.gpu_ranks[self.device_id]
        if gpu_rank != expected_rank:
            raise AssertionError("An error occurred in Distributed initialization")

        Running(self.args, self.device_id).train()
    except KeyboardInterrupt:
        # Killed by the parent process: exit quietly.
        pass
    except Exception:
        # Forward the failure to the parent as text so the original
        # traceback is kept.
        import traceback

        failed_rank = self.args.gpu_ranks[self.device_id]
        trace_text = traceback.format_exc()
        error_queue.put((failed_rank, trace_text))
| 77 | |
| 78 | |
| 79 | class ErrorHandler(object): |