Create a ModelRunner instance from an engine directory. Args: engine_dir (str): The directory that contains the serialized engine files and config files. max_output_len (Optional[int]): max_output_len, this arg might be availa
(
cls,
engine_dir: str,
*,
max_output_len: Optional[int] = None,
lora_dir: Optional[List[str]] = None,
rank: int = 0,
debug_mode: bool = False,
lora_ckpt_source: str = "hf",
medusa_choices: List[List[int]] = None,
stream: torch.cuda.Stream = None,
gpu_weights_percent: float = 1,
enable_context_fmha_fp32_acc: Optional[bool] = None,
multi_block_mode: Optional[bool] = None,
fail_fast_on_attention_window_too_large: bool = False,
)
| 637 | |
| 638 | @classmethod |
| 639 | def from_dir( |
| 640 | cls, |
| 641 | engine_dir: str, |
| 642 | *, |
| 643 | max_output_len: Optional[int] = None, |
| 644 | lora_dir: Optional[List[str]] = None, |
| 645 | rank: int = 0, |
| 646 | debug_mode: bool = False, |
| 647 | lora_ckpt_source: str = "hf", |
| 648 | medusa_choices: List[List[int]] = None, |
| 649 | stream: torch.cuda.Stream = None, |
| 650 | gpu_weights_percent: float = 1, |
| 651 | enable_context_fmha_fp32_acc: Optional[bool] = None, |
| 652 | multi_block_mode: Optional[bool] = None, |
| 653 | fail_fast_on_attention_window_too_large: bool = False, |
| 654 | ) -> 'ModelRunner': |
| 655 | """ |
| 656 | Create a ModelRunner instance from an engine directory. |
| 657 | |
| 658 | Args: |
| 659 | engine_dir (str): |
| 660 | The directory that contains the serialized engine files and config files. |
| 661 | max_output_len (Optional[int]): |
| 662 | The maximum output length. This argument might be available only at loading time; generate will still check it when disable_kv_cache is enabled. |
| 663 | lora_dir (Optional[List[str]]): |
| 664 | The directories that contain LoRA weights. |
| 665 | rank (int): |
| 666 | The runtime rank id. |
| 667 | debug_mode (bool): |
| 668 | Whether or not to turn on the debug mode. |
| 669 | medusa_choices (List[List[int]]): |
| 670 | Medusa choices to use during Medusa decoding. |
| 671 | stream (torch.cuda.Stream): |
| 672 | Stream to use. |
| 673 | multi_block_mode (bool): |
| 674 | Whether to distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel. |
| 675 | fail_fast_on_attention_window_too_large (bool): |
| 676 | Exit with runtime error when attention window is too large to fit even a single sequence in the KV cache. |
| 677 | Note: This parameter is only applicable to C++ runtime (ModelRunnerCpp). |
| 678 | Returns: |
| 679 | ModelRunner: An instance of ModelRunner. |
| 680 | """ |
| 681 | engine_version = get_engine_version(engine_dir) |
| 682 | profiler.start('load tensorrt_llm engine') |
| 683 | # the old engine format |
| 684 | if engine_version is None: |
| 685 | engine_dir = Path(engine_dir) |
| 686 | config_path = engine_dir / "config.json" |
| 687 | model_config, other_config = read_config(config_path) |
| 688 | world_size = other_config.pop('world_size') |
| 689 | tp_size = other_config.pop('tp_size') |
| 690 | pp_size = other_config.pop('pp_size') |
| 691 | max_batch_size = other_config.pop('max_batch_size') |
| 692 | max_input_len = other_config.pop('max_input_len') |
| 693 | max_output_len = other_config.pop('max_output_len') |
| 694 | max_beam_width = other_config.pop('max_beam_width') |
| 695 | runtime_mapping = Mapping(world_size=world_size, |
| 696 | rank=rank, |
no test coverage detected