| 324 | # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} |
@staticmethod
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
    """Build ``Params`` from an original-format ``params.json`` and the model tensors.

    The JSON file supplies ``dim``, ``n_layers``, ``n_heads``, ``norm_eps`` and
    optionally ``n_kv_heads``, ``rope_theta`` and a ``moe`` section.  Values the
    file does not carry are derived here:

    * ``n_vocab`` is row count of ``tok_embeddings.weight``.
    * ``n_ff`` is row count of the first layer's feed-forward ``w1`` weight
      (the first expert's ``w1`` when a ``moe`` section is present).
    * ``n_ctx`` is guessed from the config contents (see the heuristic below).

    Args:
        model: Mapping from tensor name to an object exposing ``.shape``.
        config_path: Path of the ``params.json`` file to read.

    Returns:
        The populated ``Params`` instance.

    Raises:
        KeyError: A required config key or model tensor is missing.
        ValueError: ``n_ff`` cannot be determined because the model has no
            ``layers.0.feed_forward.w1.weight`` tensor and the config has no
            ``moe`` section.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    moe = config.get("moe")

    n_ff = None
    n_experts = None
    n_experts_used = None
    f_rope_freq_base = None

    # hack to determine LLaMA v1 vs v2 vs CodeLlama
    if moe:
        # Mixtral
        n_ctx = 32768
    elif config.get("rope_theta") == 1000000:
        # CodeLlama
        n_ctx = 16384
    elif config["norm_eps"] == 1e-05:
        # LLaMA v2
        n_ctx = 4096
    else:
        # LLaMA v1
        n_ctx = 2048

    if "layers.0.feed_forward.w1.weight" in model:
        n_ff = model["layers.0.feed_forward.w1.weight"].shape[0]

    if moe:
        # With a "moe" section the feed-forward size is read from the first
        # expert's w1 tensor, overriding any value found above.
        n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
        n_experts = moe["num_experts"]
        n_experts_used = moe["num_experts_per_tok"]
        # Default RoPE base for MoE configs; an explicit "rope_theta" in the
        # config still takes precedence below.
        f_rope_freq_base = 1e6

    if n_ff is None:
        # Previously this fell through to an UnboundLocalError at the
        # Params(...) call; fail with an actionable message instead.
        raise ValueError(
            "cannot determine n_ff: 'layers.0.feed_forward.w1.weight' is not in "
            f"the model and {config_path} has no 'moe' section"
        )

    n_head = config["n_heads"]

    return Params(
        n_vocab=model["tok_embeddings.weight"].shape[0],
        n_embd=config["dim"],
        n_layer=config["n_layers"],
        n_ctx=n_ctx,
        n_ff=n_ff,
        n_head=n_head,
        # Without "n_kv_heads" the KV head count equals the head count.
        n_head_kv=config.get("n_kv_heads", n_head),
        n_experts=n_experts,
        n_experts_used=n_experts_used,
        f_norm_eps=config["norm_eps"],
        f_rope_freq_base=config.get("rope_theta", f_rope_freq_base),
    )
| 370 | |
| 371 | @staticmethod |
| 372 | def load(model_plus: ModelPlus) -> Params: |