MCPcopy
hub / github.com/NVIDIA/TensorRT-LLM / from_dir

Method from_dir

tensorrt_llm/runtime/model_runner.py:639–782  ·  view source on GitHub ↗

Create a ModelRunner instance from an engine directory. Args: engine_dir (str): The directory that contains the serialized engine files and config files. max_output_len (Optional[int]): max_output_len; this arg might be available only at loading time, and generate will still check it when disable_kv_cache is enabled. …

(
        cls,
        engine_dir: str,
        *,
        max_output_len: Optional[int] = None,
        lora_dir: Optional[List[str]] = None,
        rank: int = 0,
        debug_mode: bool = False,
        lora_ckpt_source: str = "hf",
        medusa_choices: List[List[int]] = None,
        stream: torch.cuda.Stream = None,
        gpu_weights_percent: float = 1,
        enable_context_fmha_fp32_acc: Optional[bool] = None,
        multi_block_mode: Optional[bool] = None,
        fail_fast_on_attention_window_too_large: bool = False,
    )

Source from the content-addressed store, hash-verified

637
638 @classmethod
639 def from_dir(
640 cls,
641 engine_dir: str,
642 *,
643 max_output_len: Optional[int] = None,
644 lora_dir: Optional[List[str]] = None,
645 rank: int = 0,
646 debug_mode: bool = False,
647 lora_ckpt_source: str = "hf",
648 medusa_choices: List[List[int]] = None,
649 stream: torch.cuda.Stream = None,
650 gpu_weights_percent: float = 1,
651 enable_context_fmha_fp32_acc: Optional[bool] = None,
652 multi_block_mode: Optional[bool] = None,
653 fail_fast_on_attention_window_too_large: bool = False,
654 ) -> 'ModelRunner':
655 """
656 Create a ModelRunner instance from an engine directory.
657
658 Args:
659 engine_dir (str):
660 The directory that contains the serialized engine files and config files.
661 max_output_len (Optional[int]):
662 max_output_len; this arg might be available only at loading time, and generate will still check it when disable_kv_cache is enabled.
663 lora_dir (Optional[List[str]]):
664 The directories that contain LoRA weights.
665 rank (int):
666 The runtime rank id.
667 debug_mode (bool):
668 Whether or not to turn on the debug mode.
669 medusa_choices (List[List[int]]):
670 Medusa choices to use when Medusa decoding is enabled.
671 stream (torch.cuda.Stream):
672 Stream to use.
673 multi_block_mode (bool):
674 Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel.
675 fail_fast_on_attention_window_too_large (bool):
676 Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache.
677 Note: This parameter is only applicable to C++ runtime (ModelRunnerCpp).
678 Returns:
679 ModelRunner: An instance of ModelRunner.
680 """
681 engine_version = get_engine_version(engine_dir)
682 profiler.start('load tensorrt_llm engine')
683 # the old engine format
684 if engine_version is None:
685 engine_dir = Path(engine_dir)
686 config_path = engine_dir / "config.json"
687 model_config, other_config = read_config(config_path)
688 world_size = other_config.pop('world_size')
689 tp_size = other_config.pop('tp_size')
690 pp_size = other_config.pop('pp_size')
691 max_batch_size = other_config.pop('max_batch_size')
692 max_input_len = other_config.pop('max_input_len')
693 max_output_len = other_config.pop('max_output_len')
694 max_beam_width = other_config.pop('max_beam_width')
695 runtime_mapping = Mapping(world_size=world_size,
696 rank=rank,

Callers 2

_load_engine_bufferMethod · 0.45
init_llmMethod · 0.45

Calls 13

load_from_ckptMethod · 0.95
get_engine_versionFunction · 0.85
MappingClass · 0.85
LoraManagerClass · 0.85
popMethod · 0.80
read_configFunction · 0.70
get_engine_nameFunction · 0.70
startMethod · 0.45
_set_weight_streamingMethod · 0.45
stopMethod · 0.45
elapsed_time_in_secMethod · 0.45
infoMethod · 0.45

Tested by

no test coverage detected