MCPcopy
hub / github.com/deepspeedai/DeepSpeedExamples / get_model

Function get_model

Megatron-LM/pretrain_gpt2.py:54–90  ·  view source on GitHub ↗

Build the model.

(args)

Source from the content-addressed store, hash-verified

52
53
def get_model(args):
    """Build the GPT2 model, move it to the GPU and wrap it for training.

    Args:
        args: Configuration object. This function reads ``num_layers``,
            ``vocab_size``, ``hidden_size``, ``num_attention_heads``,
            ``hidden_dropout``, ``attention_dropout``,
            ``max_position_embeddings``, ``checkpoint_activations``,
            ``checkpoint_num_layers`` and ``fp16`` from it.

    Returns:
        The model wrapped in ``DDP``; when ``args.fp16`` is truthy the model
        is wrapped in ``FP16_Module`` before the ``DDP`` wrapper is applied.
    """
    print_rank_0('building GPT2 model ...')

    # ``hidden_dropout`` feeds both the embedding and the output dropout.
    gpt2_kwargs = dict(
        num_layers=args.num_layers,
        vocab_size=args.vocab_size,
        hidden_size=args.hidden_size,
        num_attention_heads=args.num_attention_heads,
        embedding_dropout_prob=args.hidden_dropout,
        attention_dropout_prob=args.attention_dropout,
        output_dropout_prob=args.hidden_dropout,
        max_sequence_length=args.max_position_embeddings,
        checkpoint_activations=args.checkpoint_activations,
        checkpoint_num_layers=args.checkpoint_num_layers,
        parallel_output=True,
    )
    model = GPT2Model(**gpt2_kwargs)

    # Only data-parallel rank 0 reports the parameter count.
    if mpu.get_data_parallel_rank() == 0:
        report = ' > number of parameters on model parallel rank {}: {}'
        mp_rank = mpu.get_model_parallel_rank()
        param_count = sum(p.nelement() for p in model.parameters())
        print(report.format(mp_rank, param_count), flush=True)

    # Place the parameters on the currently selected CUDA device.
    model.cuda(torch.cuda.current_device())

    # Optional fp16 wrapper, applied before the distributed wrapper.
    if args.fp16:
        model = FP16_Module(model)

    # NOTE(review): ``DDP`` is presumably bound to a different implementation
    # depending on ``USE_TORCH_DDP`` -- confirm at the import site.
    if not USE_TORCH_DDP:
        return DDP(model)

    device = torch.cuda.current_device()
    return DDP(model,
               device_ids=[device],
               output_device=device,
               process_group=mpu.get_data_parallel_group())
91
92
93def get_optimizer(model, args):

Callers 1

Calls 3

print_rank_0Function · 0.90
GPT2ModelClass · 0.90
FP16_ModuleClass · 0.90

Tested by

no test coverage detected