Build the model.
(args)
| 52 | |
| 53 | |
def get_model(args):
    """Build the GPT2 model and wrap it for distributed training.

    The model is constructed from the hyperparameters carried by ``args``,
    moved to the current CUDA device, wrapped in ``FP16_Module`` when
    ``args.fp16`` is truthy, and finally wrapped in ``DDP``.

    Args:
        args: Object exposing ``num_layers``, ``vocab_size``,
            ``hidden_size``, ``num_attention_heads``, ``hidden_dropout``,
            ``attention_dropout``, ``max_position_embeddings``,
            ``checkpoint_activations``, ``checkpoint_num_layers`` and
            ``fp16``.

    Returns:
        The ``DDP``-wrapped model.
    """
    print_rank_0('building GPT2 model ...')

    # ``hidden_dropout`` feeds both the embedding and the output dropout.
    model_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=True,
    )
    model = GPT2Model(**model_kwargs)

    # Only data parallel rank 0 reports the parameter count.
    if mpu.get_data_parallel_rank() == 0:
        model_parallel_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        message = ' > number of parameters on model parallel rank {}: {}'
        print(message.format(model_parallel_rank, param_count), flush=True)

    # Place the model on the current GPU before any wrapping happens.
    model.cuda(torch.cuda.current_device())

    # Optional fp16 wrapper.
    if args.fp16:
        model = FP16_Module(model)

    # Distributed wrapper: the non-torch DDP takes only the model.
    if not USE_TORCH_DDP:
        return DDP(model)

    # Torch DDP is pinned to the current device and to the data parallel
    # process group.
    device = torch.cuda.current_device()
    return DDP(model, device_ids=[device], output_device=device,
               process_group=mpu.get_data_parallel_group())
| 91 | |
| 92 | |
| 93 | def get_optimizer(model, args): |
no test coverage detected