MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / main

Function main

BingBertSquad/nvidia_run_squad_deepspeed.py:717–1016  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

715
716
717def main():
718 parser = get_argument_parser()
719
720 # Include DeepSpeed configuration arguments
721 parser = deepspeed.add_config_arguments(parser)
722
723 args = parser.parse_args()
724
725 if args.local_rank == -1 or args.no_cuda:
726 device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
727 n_gpu = torch.cuda.device_count()
728 else:
729 torch.cuda.set_device(args.local_rank)
730 device = torch.device("cuda", args.local_rank)
731 n_gpu = 1
732 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
733 torch.distributed.init_process_group(backend='nccl')
734 logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
735 device, n_gpu, bool(args.local_rank != -1), args.fp16))
736
737 if args.gradient_accumulation_steps < 1:
738 raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
739 args.gradient_accumulation_steps))
740
741 args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
742
743 random.seed(args.seed)
744 np.random.seed(args.seed)
745 torch.manual_seed(args.seed)
746 if n_gpu > 0:
747 torch.cuda.manual_seed_all(args.seed)
748
749 if not args.do_train and not args.do_predict:
750 raise ValueError("At least one of `do_train` or `do_predict` must be True.")
751
752 if args.do_train:
753 if not args.train_file:
754 raise ValueError(
755 "If `do_train` is True, then `train_file` must be specified.")
756 if args.do_predict:
757 if not args.predict_file:
758 raise ValueError(
759 "If `do_predict` is True, then `predict_file` must be specified.")
760
761 if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
762 raise ValueError("Output directory () already exists and is not empty.")
763 os.makedirs(args.output_dir, exist_ok=True)
764
765
766 # Prepare Summary writer
767 if torch.distributed.get_rank() == 0 and args.job_name is not None:
768 args.summary_writer = get_summary_writer(name=args.job_name,
769 base=args.output_dir)
770 else:
771 args.summary_writer = None
772
773 tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
774

Callers 1

Calls 15

get_argument_parserFunction · 0.90
get_summary_writerFunction · 0.90
BertConfigClass · 0.90
RandomSamplerClass · 0.90
write_summary_eventsFunction · 0.90
is_time_to_exitFunction · 0.90
set_deviceMethod · 0.80
infoMethod · 0.80
trainMethod · 0.80
appendMethod · 0.80
evalMethod · 0.80

Tested by

no test coverage detected