MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / from_dir

Method from_dir

tensorrt_llm/runtime/model_runner_cpp.py:90–453  ·  view source on GitHub ↗

Create a ModelRunnerCpp instance from an engine directory.

Args:
- engine_dir (str): The directory that contains the serialized engine files and config files.
- lora_dir (str): The directory that contains LoRA weights.

(
        cls,
        engine_dir: str,
        *,
        lora_dir: Optional[str] = None,
        rank: int = 0,
        max_batch_size: Optional[int] = None,
        max_input_len: Optional[int] = None,
        max_output_len: Optional[int] = None,
        max_beam_width: Optional[int] = None,
        max_attention_window_size: Optional[list[int]] = None,
        sink_token_length: Optional[int] = None,
        kv_cache_free_gpu_memory_fraction: Optional[float] = None,
        cross_kv_cache_fraction: Optional[float] = None,
        medusa_choices: list[list[int]] | None = None,
        eagle_choices: list[list[int]] | None = None,
        eagle_posterior_threshold: float | None = None,
        eagle_use_dynamic_tree: bool = False,
        eagle_dynamic_tree_max_top_k: Optional[int] = None,
        lookahead_config: list[int] | None = None,
        debug_mode: bool = False,
        lora_ckpt_source: str = "hf",
        use_gpu_direct_storage: bool = False,
        gpu_weights_percent: float = 1,
        max_tokens_in_paged_kv_cache: int | None = None,
        kv_cache_enable_block_reuse: bool = False,
        enable_chunked_context: bool = False,
        is_enc_dec: bool = False,
        multi_block_mode: bool = True,
        enable_context_fmha_fp32_acc: Optional[bool] = None,
        cuda_graph_mode: Optional[bool] = None,
        logits_processor_map: Optional[Dict[str, LogitsProcessor]] = None,
        device_ids: List[int] | None = None,
        is_orchestrator_mode: bool = False,
        use_runtime_defaults: bool = True,
        gather_generation_logits: bool = False,
        use_variable_beam_width_search: bool = False,
        mm_embedding_offloading: bool = False,
        fail_fast_on_attention_window_too_large: bool = False,
    )

Source from the content-addressed store, hash-verified

88
89 @classmethod
90 def from_dir(
91 cls,
92 engine_dir: str,
93 *,
94 lora_dir: Optional[str] = None,
95 rank: int = 0,
96 max_batch_size: Optional[int] = None,
97 max_input_len: Optional[int] = None,
98 max_output_len: Optional[int] = None,
99 max_beam_width: Optional[int] = None,
100 max_attention_window_size: Optional[list[int]] = None,
101 sink_token_length: Optional[int] = None,
102 kv_cache_free_gpu_memory_fraction: Optional[float] = None,
103 cross_kv_cache_fraction: Optional[float] = None,
104 medusa_choices: list[list[int]] | None = None,
105 eagle_choices: list[list[int]] | None = None,
106 eagle_posterior_threshold: float | None = None,
107 eagle_use_dynamic_tree: bool = False,
108 eagle_dynamic_tree_max_top_k: Optional[int] = None,
109 lookahead_config: list[int] | None = None,
110 debug_mode: bool = False,
111 lora_ckpt_source: str = "hf",
112 use_gpu_direct_storage: bool = False,
113 gpu_weights_percent: float = 1,
114 max_tokens_in_paged_kv_cache: int | None = None,
115 kv_cache_enable_block_reuse: bool = False,
116 enable_chunked_context: bool = False,
117 is_enc_dec: bool = False,
118 multi_block_mode: bool = True,
119 enable_context_fmha_fp32_acc: Optional[bool] = None,
120 cuda_graph_mode: Optional[bool] = None,
121 logits_processor_map: Optional[Dict[str, LogitsProcessor]] = None,
122 device_ids: List[int] | None = None,
123 is_orchestrator_mode: bool = False,
124 use_runtime_defaults: bool = True,
125 gather_generation_logits: bool = False,
126 use_variable_beam_width_search: bool = False,
127 mm_embedding_offloading: bool = False,
128 fail_fast_on_attention_window_too_large: bool = False,
129 ) -> 'ModelRunnerCpp':
130 """
131 Create a ModelRunnerCpp instance from an engine directory.
132
133 Args:
134 engine_dir (str):
135 The directory that contains the serialized engine files and config files.
136 lora_dir (str):
137 The directory that contains LoRA weights.
138 rank (int):
139 The runtime rank id.
140 max_batch_size (int):
141 The runtime batch size limit. If max_batch_size is not None, it should not
142 be larger than the engine's max_batch_size; otherwise, the engine's max_batch_size
143 will be used.
144 max_input_len (int):
145 The runtime input length limit. If max_input_len is not None, it should not
146 be larger than the engine's max_input_len; otherwise, the engine's max_input_len
147 will be used.

Callers

nothing calls this directly

Calls 13

load_from_ckpt · Method · 0.95
_world_config_to_mapping · Function · 0.85
LoraManager · Class · 0.85
mpi_broadcast · Function · 0.85
from_cpp · Method · 0.80
from_model_config_cpp · Method · 0.80
from_json_file · Method · 0.45
start · Method · 0.45
warning · Method · 0.45
stop · Method · 0.45
elapsed_time_in_sec · Method · 0.45

Tested by

no test coverage detected