Configuration for the KV cache.
| 1626 | |
| 1627 | @PybindMirror.mirror_pybind_fields(_KvCacheConfig) |
| 1628 | class KvCacheConfig(StrictBaseModel, PybindMirror): |
| 1629 | """ |
| 1630 | Configuration for the KV cache. |
| 1631 | """ |
| 1632 | enable_block_reuse: bool = Field( |
| 1633 | default=True, |
| 1634 | description= |
| 1635 | "Controls if KV cache blocks can be reused for different requests.") |
| 1636 | max_tokens: Optional[int] = Field( |
| 1637 | default=None, |
| 1638 | description= |
| 1639 | "The maximum number of tokens that should be stored in the KV cache. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used." |
| 1640 | ) |
| 1641 | max_attention_window: Optional[List[int]] = Field( |
| 1642 | default=None, |
| 1643 | description= |
| 1644 | "Size of the attention window for each sequence. Only the last tokens will be stored in the KV cache. If the number of elements in `max_attention_window` is less than the number of layers, `max_attention_window` will be repeated as many times as needed to cover all layers." |
| 1645 | ) |
| 1646 | sink_token_length: Optional[int] = Field( |
| 1647 | default=None, |
| 1648 | description= |
| 1649 | "Number of sink tokens (tokens to always keep in attention window).") |
| 1650 | free_gpu_memory_fraction: Optional[float] = Field( |
| 1651 | default=0.9, |
| 1652 | description= |
| 1653 | "The fraction of GPU memory that should be allocated for the KV cache. Default is 90%. If both `max_tokens` and `free_gpu_memory_fraction` are specified, memory corresponding to the minimum will be used." |
| 1654 | ) |
| 1655 | host_cache_size: Optional[int] = Field( |
| 1656 | default=None, |
| 1657 | description= |
| 1658 | "Size of the host cache in bytes. If both `max_tokens` and `host_cache_size` are specified, memory corresponding to the minimum will be used." |
| 1659 | ) |
| 1660 | onboard_blocks: bool = Field( |
| 1661 | default=True, description="Controls if blocks are onboarded.") |
| 1662 | cross_kv_cache_fraction: Optional[float] = Field( |
| 1663 | default=None, |
| 1664 | description= |
| 1665 | "The fraction of the KV Cache memory that should be reserved for cross attention. If set to p, self attention will use 1-p of KV Cache memory and cross attention will use p of KV Cache memory. Default is 50%. Should only be set when using an encoder-decoder model." |
| 1666 | ) |
| 1667 | secondary_offload_min_priority: Optional[int] = Field( |
| 1668 | default=None, |
| 1669 | description= |
| 1670 | "Only blocks with priority > `secondary_offload_min_priority` can be offloaded to secondary memory." |
| 1671 | ) |
| 1672 | event_buffer_max_size: int = Field( |
| 1673 | default=0, |
| 1674 | description= |
| 1675 | "Maximum size of the event buffer. If set to 0, the event buffer will not be used." |
| 1676 | ) |
| 1677 | attention_dp_events_gather_period_ms: int = Field( |
| 1678 | default=5, |
| 1679 | description= |
| 1680 | "The period in milliseconds to gather attention DP events across ranks." |
| 1681 | ) |
| 1682 | enable_partial_reuse: bool = Field( |
| 1683 | default=True, |
| 1684 | description= |
| 1685 | "Whether blocks that are only partially matched can be reused.") |