Create a ModelRunnerCpp instance from an engine directory. Args: `engine_dir` (str) — the directory that contains the serialized engine files and config files; `lora_dir` (str) — the directory that contains LoRA weights.
(
cls,
engine_dir: str,
*,
lora_dir: Optional[str] = None,
rank: int = 0,
max_batch_size: Optional[int] = None,
max_input_len: Optional[int] = None,
max_output_len: Optional[int] = None,
max_beam_width: Optional[int] = None,
max_attention_window_size: Optional[list[int]] = None,
sink_token_length: Optional[int] = None,
kv_cache_free_gpu_memory_fraction: Optional[float] = None,
cross_kv_cache_fraction: Optional[float] = None,
medusa_choices: list[list[int]] | None = None,
eagle_choices: list[list[int]] | None = None,
eagle_posterior_threshold: float | None = None,
eagle_use_dynamic_tree: bool = False,
eagle_dynamic_tree_max_top_k: Optional[int] = None,
lookahead_config: list[int] | None = None,
debug_mode: bool = False,
lora_ckpt_source: str = "hf",
use_gpu_direct_storage: bool = False,
gpu_weights_percent: float = 1,
max_tokens_in_paged_kv_cache: int | None = None,
kv_cache_enable_block_reuse: bool = False,
enable_chunked_context: bool = False,
is_enc_dec: bool = False,
multi_block_mode: bool = True,
enable_context_fmha_fp32_acc: Optional[bool] = None,
cuda_graph_mode: Optional[bool] = None,
logits_processor_map: Optional[Dict[str, LogitsProcessor]] = None,
device_ids: List[int] | None = None,
is_orchestrator_mode: bool = False,
use_runtime_defaults: bool = True,
gather_generation_logits: bool = False,
use_variable_beam_width_search: bool = False,
mm_embedding_offloading: bool = False,
fail_fast_on_attention_window_too_large: bool = False,
)
| 88 | |
| 89 | @classmethod |
| 90 | def from_dir( |
| 91 | cls, |
| 92 | engine_dir: str, |
| 93 | *, |
| 94 | lora_dir: Optional[str] = None, |
| 95 | rank: int = 0, |
| 96 | max_batch_size: Optional[int] = None, |
| 97 | max_input_len: Optional[int] = None, |
| 98 | max_output_len: Optional[int] = None, |
| 99 | max_beam_width: Optional[int] = None, |
| 100 | max_attention_window_size: Optional[list[int]] = None, |
| 101 | sink_token_length: Optional[int] = None, |
| 102 | kv_cache_free_gpu_memory_fraction: Optional[float] = None, |
| 103 | cross_kv_cache_fraction: Optional[float] = None, |
| 104 | medusa_choices: list[list[int]] | None = None, |
| 105 | eagle_choices: list[list[int]] | None = None, |
| 106 | eagle_posterior_threshold: float | None = None, |
| 107 | eagle_use_dynamic_tree: bool = False, |
| 108 | eagle_dynamic_tree_max_top_k: Optional[int] = None, |
| 109 | lookahead_config: list[int] | None = None, |
| 110 | debug_mode: bool = False, |
| 111 | lora_ckpt_source: str = "hf", |
| 112 | use_gpu_direct_storage: bool = False, |
| 113 | gpu_weights_percent: float = 1, |
| 114 | max_tokens_in_paged_kv_cache: int | None = None, |
| 115 | kv_cache_enable_block_reuse: bool = False, |
| 116 | enable_chunked_context: bool = False, |
| 117 | is_enc_dec: bool = False, |
| 118 | multi_block_mode: bool = True, |
| 119 | enable_context_fmha_fp32_acc: Optional[bool] = None, |
| 120 | cuda_graph_mode: Optional[bool] = None, |
| 121 | logits_processor_map: Optional[Dict[str, LogitsProcessor]] = None, |
| 122 | device_ids: List[int] | None = None, |
| 123 | is_orchestrator_mode: bool = False, |
| 124 | use_runtime_defaults: bool = True, |
| 125 | gather_generation_logits: bool = False, |
| 126 | use_variable_beam_width_search: bool = False, |
| 127 | mm_embedding_offloading: bool = False, |
| 128 | fail_fast_on_attention_window_too_large: bool = False, |
| 129 | ) -> 'ModelRunnerCpp': |
| 130 | """ |
| 131 | Create a ModelRunnerCpp instance from an engine directory. |
| 132 | |
| 133 | Args: |
| 134 | engine_dir (str): |
| 135 | The directory that contains the serialized engine files and config files. |
| 136 | lora_dir (str): |
| 137 | The directory that contains LoRA weights. |
| 138 | rank (int): |
| 139 | The runtime rank id. |
| 140 | max_batch_size (int): |
| 141 | The runtime batch size limit. If max_batch_size is not None, it should not |
| 142 | be larger than the engine's max_batch_size; otherwise, the engine's max_batch_size |
| 143 | will be used. |
| 144 | max_input_len (int): |
| 145 | The runtime input length limit. If max_input_len is not None, it should not |
| 146 | be larger than the engine's max_input_len; otherwise, the engine's max_input_len |
| 147 | will be used. |
Nothing calls this method directly.
No test coverage was detected for this method.