Build the model.
(args)
| 44 | from detokenizer import * |
| 45 | |
def get_model(args):
    """Build the GPT2 model, move it to the current GPU and wrap it in DDP.

    Args:
        args: Object exposing the run configuration as attributes. This
            function reads ``num_layers``, ``vocab_size``, ``hidden_size``,
            ``num_attention_heads``, ``hidden_dropout``,
            ``attention_dropout``, ``max_position_embeddings``,
            ``checkpoint_activations``, ``checkpoint_num_layers``,
            ``cloze_eval`` and ``fp16``.

    Returns:
        The ``GPT2Model`` instance wrapped in ``DDP``; when ``args.fp16`` is
        truthy it is wrapped in ``FP16_Module`` first.
    """
    print_rank_0('building GPT2 model ...')

    # ``hidden_dropout`` feeds both the embedding and the output dropout;
    # ``parallel_output`` is switched off when running cloze evaluation.
    gpt2_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=not args.cloze_eval,
    )
    net = GPT2Model(**gpt2_kwargs)

    # Report the total element count over every parameter tensor.
    param_count = 0
    for param in net.parameters():
        param_count += param.nelement()
    print_rank_0(' > number of parameters: {}'.format(param_count))

    # GPU allocation.
    device = torch.cuda.current_device()
    net.cuda(device)

    # Fp16 conversion (only when requested), then wrap for distributed
    # training.
    wrapped = FP16_Module(net) if args.fp16 else net
    return DDP(wrapped)
| 76 | |
| 77 | |
| 78 | def setup_model(args): |
no test coverage detected