MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / LoraManager

Class LoraManager

tensorrt_llm/lora_manager.py:640–1269  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

638
639
640class LoraManager(object):
641 LORA_MODULE_IDS = {
642 "attn_qkv": 0,
643 "attn_q": 1,
644 "attn_k": 2,
645 "attn_v": 3,
646 "attn_dense": 4,
647 "mlp_h_to_4h": 5,
648 "mlp_4h_to_h": 6,
649 "mlp_gate": 7,
650 "cross_attn_qkv": 8,
651 "cross_attn_q": 9,
652 "cross_attn_k": 10,
653 "cross_attn_v": 11,
654 "cross_attn_dense": 12,
655 "moe_h_to_4h": 13,
656 "moe_4h_to_h": 14,
657 "moe_gate": 15,
658 "moe_router": 16,
659 "mlp_router": 17,
660 "mlp_gate_up": 18,
661 }
662
663 def __init__(
664 self,
665 *,
666 mapping: Mapping,
667 model_config: "ModelConfig",
668 cpp_peft_cache_manager: tb_internal.batch_manager.PeftCacheManager | None = None,
669 ):
670 """Constructor.
671
672 Args:
673 mapping (Mapping): Parallelism related information.
674 model_config (ModelConfig): model configuration python class instance.
675 cpp_peft_cache_manager (PeftCacheManager, optional): used by the is_adapter_in_cpu_cache method, which
676 enables a performance optimization: the LoRA adapter weights are not sent with every LLM request when
677 the adapter is already loaded in the LoRA CPU cache.
678 """
679 # _lora_uid_to_low_ranks: dict[str -> dict[int -> dict[str -> int]]]
680 # {
681 # uid: {
682 # 0: {
683 # lora_module: int
684 # }, # layer_0_rank,
685 # 1: {
686 # lora_module: int
687 # }, # layer_1_rank,
688 # ...
689 # }
690 # }
691
692 # _lora_weights_pointers_list: dict[str -> dict[int -> dict[str -> [Tensor, Tensor]]]]
693 # {
694 # uid: {
695 # 0: {
696 # lora_module: [t_in, t_out]
697 # }, # layer_0,

Callers 6

__init__Method · 0.90
setup_engineMethod · 0.85
__init__Method · 0.85
from_engineMethod · 0.85
from_dirMethod · 0.85
from_dirMethod · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected