| 638 | |
| 639 | |
| 640 | class LoraManager(object): |
| 641 | LORA_MODULE_IDS = { |
| 642 | "attn_qkv": 0, |
| 643 | "attn_q": 1, |
| 644 | "attn_k": 2, |
| 645 | "attn_v": 3, |
| 646 | "attn_dense": 4, |
| 647 | "mlp_h_to_4h": 5, |
| 648 | "mlp_4h_to_h": 6, |
| 649 | "mlp_gate": 7, |
| 650 | "cross_attn_qkv": 8, |
| 651 | "cross_attn_q": 9, |
| 652 | "cross_attn_k": 10, |
| 653 | "cross_attn_v": 11, |
| 654 | "cross_attn_dense": 12, |
| 655 | "moe_h_to_4h": 13, |
| 656 | "moe_4h_to_h": 14, |
| 657 | "moe_gate": 15, |
| 658 | "moe_router": 16, |
| 659 | "mlp_router": 17, |
| 660 | "mlp_gate_up": 18, |
| 661 | } |
| 662 | |
| 663 | def __init__( |
| 664 | self, |
| 665 | *, |
| 666 | mapping: Mapping, |
| 667 | model_config: "ModelConfig", |
| 668 | cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager | None = None, |
| 669 | ): |
| 670 | """Constructor. |
| 671 | |
| 672 | Args: |
| 673 | mapping (Mapping): Parallelism related information. |
| 674 | model_config (ModelConfig): model configuration python class instance. |
| 675 | cpp_peft_cache_manager (PeftCacheManager, optional): used by is_adapter_in_cpu_cache method, that's used for |
| 676 | a performance optimization with LoRA of not sending the LoRA adapter weights with every LLM request when |
| 677 | the adapter is already loaded in the LoRA CPU cache. |
| 678 | """ |
| 679 | # _lora_uid_to_low_ranks: dict[str -> dict[int -> dict[str -> int]]] |
| 680 | # { |
| 681 | # uid: { |
| 682 | # 0: { |
| 683 | # lora_module: int |
| 684 | # }, # layer_0_rank, |
| 685 | # 1: { |
| 686 | # lora_module: int |
| 687 | # }, # layer_1_rank, |
| 688 | # ... |
| 689 | # } |
| 690 | # } |
| 691 | |
| 692 | # _lora_weights_pointers_list: dict[str -> dict[int -> dict[str -> [Tensor, Tensor]]]] |
| 693 | # { |
| 694 | # uid: { |
| 695 | # 0: { |
| 696 | # lora_module: [t_in, t_out] |
| 697 | # }, # layer_0, |
no outgoing calls
no test coverage detected