(cls, model_type, override_args=None)
| 205 | |
@classmethod
def from_pretrained(cls, model_type, override_args=None):
    """Build a GPT model and fill it with weights from a pretrained GPT-2 checkpoint.

    Args:
        model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
        override_args: optional dict of config overrides; the only key
            accepted is 'dropout'.

    Returns:
        The freshly constructed GPT model whose state-dict tensors have been
        overwritten in place with the checkpoint's tensors.

    Raises:
        AssertionError: if model_type is not supported, if override_args
            holds a key other than 'dropout', or if the two state dicts
            disagree in number of entries or in tensor shapes.
    """
    assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
    override_args = override_args or {}  # treat None (or any falsy value) as "no overrides"
    # dropout is the only setting a caller may change
    assert all(k == 'dropout' for k in override_args)
    from transformers import GPT2LMHeadModel
    print("loading weights from pretrained gpt: %s" % model_type)

    # architecture sizes (n_layer, n_head, n_embd) follow from the model name
    size_presets = {
        'gpt2':        dict(n_layer=12, n_head=12, n_embd=768),   # 124M params
        'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
        'gpt2-large':  dict(n_layer=36, n_head=20, n_embd=1280),  # 774M params
        'gpt2-xl':     dict(n_layer=48, n_head=25, n_embd=1600),  # 1558M params
    }
    config_args = size_presets[model_type]
    print("forcing vocab_size=50257, block_size=1024, bias=True")
    # these three values are fixed for the GPT model checkpoints
    config_args.update(vocab_size=50257, block_size=1024, bias=True)
    # optional dropout override
    if 'dropout' in override_args:
        print(f"overriding dropout rate to {override_args['dropout']}")
        config_args['dropout'] = override_args['dropout']

    # from-scratch initialized model that will receive the weights
    config = GPTConfig(**config_args)
    model = GPT(config)
    own_state = model.state_dict()
    # '.attn.bias' is the mask buffer, not a parameter, so it is not counted
    own_names = [name for name in own_state.keys() if not name.endswith('.attn.bias')]

    # huggingface/transformers model that supplies the pretrained tensors
    model_hf = GPT2LMHeadModel.from_pretrained(model_type)
    hf_state = model_hf.state_dict()

    # drop the buffer entries ('.attn.masked_bias' and the '.attn.bias' mask)
    hf_buffer_suffixes = ('.attn.masked_bias', '.attn.bias')
    hf_names = [name for name in hf_state.keys() if not name.endswith(hf_buffer_suffixes)]

    # the openai checkpoints use a "Conv1D" module while this model uses a
    # vanilla Linear, so these weights have to be transposed on import
    conv1d_suffixes = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
    assert len(hf_names) == len(own_names), f"mismatched keys: {len(hf_names)} != {len(own_names)}"

    # copy tensor by tensor, checking that names and shapes line up
    for name in hf_names:
        source = hf_state[name]
        target = own_state[name]
        if name.endswith(conv1d_suffixes):
            # Conv1D weight: shapes must match once reversed, then copy the transpose
            assert source.shape[::-1] == target.shape
            source = source.t()
        else:
            # every other parameter is copied as-is
            assert source.shape == target.shape
        with torch.no_grad():
            target.copy_(source)

    return model
| 262 | |
| 263 | def configure_optimizers(self, weight_decay, learning_rate, betas, device_type): |
| 264 | # start with all of the candidate parameters |
no test coverage detected