Configuration class for TensorRT LLM engine building parameters. This class contains all the configuration parameters needed to build a TensorRT LLM engine, including sequence length limits, batch sizes, optimization settings, and various features.
| 451 | |
| 452 | |
| 453 | class BuildConfig(BaseModel): |
| 454 | """Configuration class for TensorRT LLM engine building parameters. |
| 455 | |
| 456 | This class contains all the configuration parameters needed to build a TensorRT LLM engine, |
| 457 | including sequence length limits, batch sizes, optimization settings, and various features. |
| 458 | """ |
| 459 | max_input_len: int = Field(default=1024, |
| 460 | description="Maximum length of input sequences.") |
| 461 | max_seq_len: Optional[int] = Field( |
| 462 | default=None, |
| 463 | description= |
| 464 | "The maximum possible sequence length for a single request, including both input and generated " |
| 465 | "output tokens.") |
| 466 | opt_batch_size: int = Field( |
| 467 | default=8, description="Optimal batch size for engine optimization.") |
| 468 | max_batch_size: int = Field( |
| 469 | default=2048, description="Maximum batch size the engine can handle.") |
| 470 | max_beam_width: int = Field( |
| 471 | default=1, description="Maximum beam width for beam search decoding.") |
| 472 | max_num_tokens: int = Field( |
| 473 | default=8192, |
| 474 | description="Maximum number of batched input tokens after padding is " |
| 475 | "removed in each batch.") |
| 476 | opt_num_tokens: Optional[int] = Field( |
| 477 | default=None, |
| 478 | description= |
| 479 | "Optimal number of batched input tokens for engine optimization.") |
| 480 | max_prompt_embedding_table_size: int = Field( |
| 481 | default=0, |
| 482 | description="Maximum size of prompt embedding table for prompt tuning.") |
| 483 | kv_cache_type: Optional[KVCacheType] = Field( |
| 484 | default=None, |
| 485 | description= |
| 486 | "Type of KV cache to use (CONTINUOUS or PAGED). If None, defaults to PAGED." |
| 487 | ) |
| 488 | gather_context_logits: bool = Field( |
| 489 | default=False, |
| 490 | description="Whether to gather logits during context phase.") |
| 491 | gather_generation_logits: bool = Field( |
| 492 | default=False, |
| 493 | description="Whether to gather logits during generation phase.") |
| 494 | strongly_typed: bool = Field(default=True, |
| 495 | description="Whether to use strongly_typed.") |
| 496 | force_num_profiles: Optional[int] = Field( |
| 497 | default=None, |
| 498 | description= |
| 499 | "Force a specific number of optimization profiles. If None, auto-determined." |
| 500 | ) |
| 501 | profiling_verbosity: str = Field( |
| 502 | default='layer_names_only', |
| 503 | description= |
| 504 | "Verbosity level for TensorRT profiling ('layer_names_only', 'detailed', 'none')." |
| 505 | ) |
| 506 | enable_debug_output: bool = Field( |
| 507 | default=False, |
| 508 | description="Whether to enable debug output during building.") |
| 509 | max_draft_len: int = Field( |
| 510 | default=0, |