()
| 715 | |
| 716 | |
| 717 | def main(): |
| 718 | parser = get_argument_parser() |
| 719 | |
| 720 | # Include DeepSpeed configuration arguments |
| 721 | parser = deepspeed.add_config_arguments(parser) |
| 722 | |
| 723 | args = parser.parse_args() |
| 724 | |
| 725 | if args.local_rank == -1 or args.no_cuda: |
| 726 | device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") |
| 727 | n_gpu = torch.cuda.device_count() |
| 728 | else: |
| 729 | torch.cuda.set_device(args.local_rank) |
| 730 | device = torch.device("cuda", args.local_rank) |
| 731 | n_gpu = 1 |
| 732 | # Initializes the distributed backend which will take care of synchronizing nodes/GPUs |
| 733 | torch.distributed.init_process_group(backend='nccl') |
| 734 | logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( |
| 735 | device, n_gpu, bool(args.local_rank != -1), args.fp16)) |
| 736 | |
| 737 | if args.gradient_accumulation_steps < 1: |
| 738 | raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( |
| 739 | args.gradient_accumulation_steps)) |
| 740 | |
| 741 | args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) |
| 742 | |
| 743 | random.seed(args.seed) |
| 744 | np.random.seed(args.seed) |
| 745 | torch.manual_seed(args.seed) |
| 746 | if n_gpu > 0: |
| 747 | torch.cuda.manual_seed_all(args.seed) |
| 748 | |
| 749 | if not args.do_train and not args.do_predict: |
| 750 | raise ValueError("At least one of `do_train` or `do_predict` must be True.") |
| 751 | |
| 752 | if args.do_train: |
| 753 | if not args.train_file: |
| 754 | raise ValueError( |
| 755 | "If `do_train` is True, then `train_file` must be specified.") |
| 756 | if args.do_predict: |
| 757 | if not args.predict_file: |
| 758 | raise ValueError( |
| 759 | "If `do_predict` is True, then `predict_file` must be specified.") |
| 760 | |
| 761 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: |
| 762 | raise ValueError("Output directory () already exists and is not empty.") |
| 763 | os.makedirs(args.output_dir, exist_ok=True) |
| 764 | |
| 765 | |
| 766 | # Prepare Summary writer |
| 767 | if torch.distributed.get_rank() == 0 and args.job_name is not None: |
| 768 | args.summary_writer = get_summary_writer(name=args.job_name, |
| 769 | base=args.output_dir) |
| 770 | else: |
| 771 | args.summary_writer = None |
| 772 | |
| 773 | tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) |
| 774 |
no test coverage detected