MCPcopy
hub / github.com/haykgrigo3/TimeCapsuleLLM / from_pretrained

Method from_pretrained

london_1800_1875_v0.5/v0.5/model.py:207–261  ·  view source on GitHub ↗
(cls, model_type, override_args=None)

Source from the content-addressed store, hash-verified

205
@classmethod
def from_pretrained(cls, model_type, override_args=None):
    """Build a GPT model and fill it with pretrained Hugging Face GPT-2 weights.

    Args:
        model_type: One of 'gpt2', 'gpt2-medium', 'gpt2-large' or 'gpt2-xl'.
        override_args: Optional mapping of config overrides. 'dropout' is
            the only key that is accepted.

    Returns:
        The GPT instance whose parameters were copied from the checkpoint.

    Raises:
        AssertionError: If ``model_type`` or an override key is not
            supported, or if the two models disagree on the number of
            parameters or on a parameter's shape.

    Note:
        The model is created with ``GPT(config)`` directly; ``cls`` is not
        used by this method.
    """
    # n_layer, n_head and n_embd are fixed by the checkpoint name
    presets = {
        'gpt2': {'n_layer': 12, 'n_head': 12, 'n_embd': 768},  # 124M params
        'gpt2-medium': {'n_layer': 24, 'n_head': 16, 'n_embd': 1024},  # 350M params
        'gpt2-large': {'n_layer': 36, 'n_head': 20, 'n_embd': 1280},  # 774M params
        'gpt2-xl': {'n_layer': 48, 'n_head': 25, 'n_embd': 1600},  # 1558M params
    }
    assert model_type in presets
    if not override_args:
        override_args = {}  # nothing to override
    # dropout is the only setting a caller may override
    assert all(name == 'dropout' for name in override_args)
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    config_args = presets[model_type]
    print("forcing vocab_size=50257, block_size=1024, bias=True")
    # these three values are the same for every GPT model checkpoint
    config_args.update(vocab_size=50257, block_size=1024, bias=True)
    if 'dropout' in override_args:
        print(f"overriding dropout rate to {override_args['dropout']}")
        config_args['dropout'] = override_args['dropout']

    # from-scratch initialised model that will receive the pretrained weights
    model = GPT(GPTConfig(**config_args))
    target_state = model.state_dict()
    # the attention mask is a buffer, not a parameter, so leave it out
    target_keys = [name for name in target_state if not name.endswith('.attn.bias')]

    # Hugging Face reference model that supplies the weights
    source_state = GPT2LMHeadModel.from_pretrained(model_type).state_dict()
    # ignore the mask buffers on the Hugging Face side as well
    skipped_suffixes = ('.attn.masked_bias', '.attn.bias')
    source_keys = [name for name in source_state if not name.endswith(skipped_suffixes)]

    # the OpenAI checkpoints use a "Conv1D" module for these weights, while a
    # vanilla Linear is wanted here, so they are transposed on import
    conv1d_suffixes = (
        'attn.c_attn.weight',
        'attn.c_proj.weight',
        'mlp.c_fc.weight',
        'mlp.c_proj.weight',
    )
    assert len(source_keys) == len(target_keys), f"mismatched keys: {len(source_keys)} != {len(target_keys)}"
    for name in source_keys:
        pretrained = source_state[name]
        destination = target_state[name]
        needs_transpose = name.endswith(conv1d_suffixes)
        # names and shapes must line up before anything is copied
        expected_shape = pretrained.shape[::-1] if needs_transpose else pretrained.shape
        assert expected_shape == destination.shape
        with torch.no_grad():
            destination.copy_(pretrained.t() if needs_transpose else pretrained)

    return model
262
263 def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
264 # start with all of the candidate parameters

Callers 3

run_v2.pyFile · 0.45

Calls 2

GPTConfigClass · 0.70
GPTClass · 0.70

Tested by

no test coverage detected