MCPcopy Index your code
hub / github.com/microsoft/BitNet / loadOriginalParamsJson

Method loadOriginalParamsJson

utils/convert.py:326–369  ·  view source on GitHub ↗
(model: LazyModel, config_path: Path)

Source from the content-addressed store, hash-verified

324 # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}
@staticmethod
def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params:
    """Build a ``Params`` from an original-format JSON config plus model tensors.

    Example config:
    {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64,
     "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1}

    Args:
        model: Tensor-name -> tensor lookup. Only ``.shape[0]`` of the token
            embedding and of the first feed-forward ``w1`` weight is read.
        config_path: Path of the JSON config file to parse.

    Returns:
        A ``Params`` populated from the config, with ``n_vocab`` and ``n_ff``
        taken from tensor shapes and ``n_ctx`` guessed heuristically.

    Raises:
        KeyError: If a required config key ("dim", "n_layers", "n_heads",
            "norm_eps", or the "moe" sub-keys) or a required tensor is missing.
    """
    # JSON is UTF-8; do not depend on the platform's default encoding.
    with open(config_path, encoding="utf-8") as f:
        config = json.load(f)

    moe = config.get("moe")

    # n_ctx is not read from the config; it is guessed from which model
    # family the config looks like (hack to tell LLaMA v1 / v2 / CodeLlama).
    if moe:
        # Mixtral
        n_ctx = 32768
    elif config.get("rope_theta") == 1000000:
        # CodeLlama
        n_ctx = 16384
    elif config["norm_eps"] == 1e-05:
        # LLaMA v2
        n_ctx = 4096
    else:
        # LLaMA v1
        n_ctx = 2048

    n_experts = None
    n_experts_used = None
    f_rope_freq_base = None

    dense_w1 = "layers.0.feed_forward.w1.weight"
    if moe:
        # Mixture-of-experts: size comes from the first expert, and the RoPE
        # base defaults to 1e6 unless the config overrides it via "rope_theta".
        n_ff = model["layers.0.feed_forward.experts.0.w1.weight"].shape[0]
        n_experts = moe["num_experts"]
        n_experts_used = moe["num_experts_per_tok"]
        f_rope_freq_base = 1e6
    elif dense_w1 in model:
        n_ff = model[dense_w1].shape[0]
    else:
        # Previously this fell through and crashed later with an opaque
        # UnboundLocalError on n_ff; fail here with an actionable message.
        raise KeyError(
            f"cannot determine n_ff: tensor {dense_w1!r} is not in the model "
            f"and the config has no 'moe' section"
        )

    n_head = config["n_heads"]

    return Params(
        n_vocab          = model["tok_embeddings.weight"].shape[0],
        n_embd           = config["dim"],
        n_layer          = config["n_layers"],
        n_ctx            = n_ctx,
        n_ff             = n_ff,
        n_head           = n_head,
        # Without "n_kv_heads" the KV head count equals the query head count.
        n_head_kv        = config.get("n_kv_heads", n_head),
        n_experts        = n_experts,
        n_experts_used   = n_experts_used,
        f_norm_eps       = config["norm_eps"],
        f_rope_freq_base = config.get("rope_theta", f_rope_freq_base),
    )
370
371 @staticmethod
372 def load(model_plus: ModelPlus) -> Params:

Callers 1

loadMethod · 0.45

Calls 2

ParamsClass · 0.70
loadMethod · 0.45

Tested by

no test coverage detected