Config of key value cache.
| 88 | |
@dataclass
class CacheConfig:
    """Configuration of the key/value cache.

    Two fields are normalized in ``__post_init__``:

    * ``kernel_block_size`` -- the default ``-1`` is replaced by
      ``block_size``.
    * ``enable_prefix_caching`` -- forced to ``False`` (with a warning) when
      ``window_size`` is greater than 1.
    """

    max_batches: int
    block_size: int
    num_cpu_blocks: int
    num_gpu_blocks: int
    # -1 means "use block_size"; resolved in __post_init__.
    kernel_block_size: int = -1
    window_size: int = -1
    cache_max_entry_count: float = 0.8
    max_prefill_token_num: int = 8192
    enable_prefix_caching: bool = False
    quant_policy: QuantPolicy = QuantPolicy.NONE
    device_type: str = 'cuda'
    # NOTE(review): annotated as int but the default is None.
    num_state_caches: int = None
    states_shapes: list[tuple] = field(default_factory=list)

    # Blocks reserved for dummy inputs; defaults to 0 so unit tests need not set it.
    num_reserved_gpu_blocks: int = 0

    # Settings for PD Disaggregation.
    role: EngineRole = EngineRole.Hybrid
    migration_backend: MigrationBackend = MigrationBackend.DLSlime

    def __post_init__(self):
        """Reconcile dependent fields once the dataclass has been built."""
        # Prefix caching is switched off whenever window attention is in use.
        conflicts_with_window = self.window_size > 1 and self.enable_prefix_caching
        if conflicts_with_window:
            logger.warning('Prefix caching is not available for window attention.')
            self.enable_prefix_caching = False

        # Fall back to the cache block size when no kernel block size was given.
        if self.kernel_block_size == -1:
            self.kernel_block_size = self.block_size
| 121 | |
| 122 | |
| 123 | class TPMode(enum.Enum): |
no outgoing calls