hub / github.com/haykgrigo3/TimeCapsuleLLM / from_pretrained

Method from_pretrained

london_1800_1875_v0.5/v0.5/model.py:207–261 · view source on GitHub ↗

(cls, model_type, override_args=None)

Source from the content-addressed store, hash-verified

205
@classmethod
def from_pretrained(cls, model_type, override_args=None):
    """Build a GPT and load pretrained GPT-2 weights from Hugging Face into it.

    Args:
        model_type: One of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        override_args: Optional dict of config overrides. Only the key
            'dropout' is accepted; None or an empty dict means no override.

    Returns:
        A GPT instance whose parameters hold the pretrained weights.

    Raises:
        ValueError: If model_type is not a known checkpoint name, if
            override_args contains a key other than 'dropout', or if the
            checkpoint's parameters do not line up with this model's in
            count or shape.
    """
    # n_layer, n_head and n_embd are determined from model_type
    architectures = {
        'gpt2': dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
        'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
        'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
        'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
    }

    # Validate with explicit raises instead of `assert`: asserts are removed
    # under `python -O`, which would let bad arguments through silently.
    if model_type not in architectures:
        raise ValueError(
            f"unknown model_type {model_type!r}; expected one of {sorted(architectures)}"
        )
    override_args = override_args or {}  # default to empty dict
    unsupported = [k for k in override_args if k != 'dropout']
    if unsupported:
        # only dropout can be overridden
        raise ValueError(f"only 'dropout' can be overridden, got {unsupported!r}")

    # Imported lazily so transformers is only required when this method is used.
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    print("forcing vocab_size=50257, block_size=1024, bias=True")
    config_args = dict(
        architectures[model_type],
        vocab_size=50257,  # always 50257 for GPT model checkpoints
        block_size=1024,  # always 1024 for GPT model checkpoints
        bias=True,  # always True for GPT model checkpoints
    )
    # we can override the dropout rate, if desired
    if 'dropout' in override_args:
        print(f"overriding dropout rate to {override_args['dropout']}")
        config_args['dropout'] = override_args['dropout']

    # create a from-scratch initialized model to receive the weights
    config = GPTConfig(**config_args)
    model = GPT(config)
    sd = model.state_dict()
    # '.attn.bias' is the attention mask buffer, not a parameter
    sd_keys = [k for k in sd if not k.endswith('.attn.bias')]

    # init a huggingface/transformers model
    model_hf = GPT2LMHeadModel.from_pretrained(model_type)
    sd_hf = model_hf.state_dict()
    # '.attn.masked_bias' and '.attn.bias' are buffers on the HF side; skip them
    sd_keys_hf = [
        k for k in sd_hf
        if not k.endswith(('.attn.masked_bias', '.attn.bias'))
    ]

    # The openai checkpoints use a "Conv1D" module, but we only want to use a
    # vanilla Linear, so these weights have to be transposed on import.
    transposed = (
        'attn.c_attn.weight',
        'attn.c_proj.weight',
        'mlp.c_fc.weight',
        'mlp.c_proj.weight',
    )

    if len(sd_keys_hf) != len(sd_keys):
        raise ValueError(f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}")

    # copy while ensuring all of the parameters are aligned in names and shapes
    with torch.no_grad():
        for k in sd_keys_hf:
            source = sd_hf[k].t() if k.endswith(transposed) else sd_hf[k]
            # Check shapes explicitly: copy_ would otherwise broadcast a
            # compatible-but-wrong shape instead of failing.
            if source.shape != sd[k].shape:
                raise ValueError(
                    f"shape mismatch for {k}: "
                    f"{tuple(source.shape)} != {tuple(sd[k].shape)}"
                )
            sd[k].copy_(source)

    return model
262
263	def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
264	# start with all of the candidate parameters

Callers 3

test_v2mini_eval2.py — File · 0.45

test_v2mini_eval1.py — File · 0.45

run_v2.py — File · 0.45

Calls 2

GPTConfig — Class · 0.70

GPT — Class · 0.70

Tested by

no test coverage detected