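Initialize the CacheConfig class from `args`, a mapping of option names to values; each key that matches an existing attribute overrides that attribute's default, and other keys are ignored. Recognized keys include: block_size (int): Size of a cache block in number of tokens. Default is 64. gpu_memory_utilization (float): Fraction of GPU memory to use. Default is 0.9. cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.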
(self, args)
| 1395 | """ |
| 1396 | |
| 1397 | def __init__(self, args): |
| 1398 | """ |
| 1399 | Initialize the CacheConfig class. |
| 1400 | |
| 1401 | Args: |
| 1402 | block_size (int): Size of a cache block in number of tokens. |
| 1403 | gpu_memory_utilization (float): Fraction of GPU memory to use. |
| 1404 | cache_dtype (str): Data type for cache storage. Default is 'bfloat16'. |
| 1405 | num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks. |
| 1406 | num_cpu_blocks (Optional[int]): Number of CPU blocks. |
| 1407 | kv_cache_ratio (float): Ratio for max block calculation. |
| 1408 | enc_dec_block_num (int): Number of encoder-decoder blocks. |
| 1409 | prealloc_dec_block_slot_num_threshold (int): Number of token slot threshold to allocate next blocks for decoding, used when ENABLE_V1_KVCACHE_SCHEDULER=1. |
| 1410 | enable_prefix_caching (bool): Enable prefix caching. |
| 1411 | max_encoder_cache(int): Maximum number of tokens in the encoder cache. |
| 1412 | max_processor_cache(int): Maximum number of bytes in the processor cache. |
| 1413 | """ |
| 1414 | self.block_size = 64 |
| 1415 | self.gpu_memory_utilization = 0.9 |
| 1416 | self.num_gpu_blocks_override = None |
| 1417 | if envs.ENABLE_V1_KVCACHE_SCHEDULER: |
| 1418 | self.kv_cache_ratio = 1.0 |
| 1419 | else: |
| 1420 | self.kv_cache_ratio = 0.75 |
| 1421 | self.enc_dec_block_num = envs.FD_ENC_DEC_BLOCK_NUM |
| 1422 | self.prealloc_dec_block_slot_num_threshold = 12 |
| 1423 | self.cache_dtype = "bfloat16" |
| 1424 | self.model_cfg = None |
| 1425 | self.enable_chunked_prefill = False |
| 1426 | self.rdma_comm_ports = None |
| 1427 | self.local_rdma_comm_ports = None |
| 1428 | self.cache_transfer_protocol = None |
| 1429 | self.pd_comm_port = None |
| 1430 | self.local_pd_comm_port = None |
| 1431 | self.enable_prefix_caching = False |
| 1432 | self.enable_ssd_cache = False |
| 1433 | self.cache_queue_port = None |
| 1434 | self.local_cache_queue_port = None |
| 1435 | self.swap_space = None |
| 1436 | self.max_encoder_cache = None |
| 1437 | self.max_processor_cache = None |
| 1438 | self.enable_output_caching = False |
| 1439 | self.disable_chunked_mm_input = False |
| 1440 | self.kvcache_storage_backend = None |
| 1441 | self.write_policy = None |
| 1442 | self.num_cpu_blocks = None |
| 1443 | self.use_mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN" |
| 1444 | |
| 1445 | for key, value in args.items(): |
| 1446 | if hasattr(self, key): |
| 1447 | setattr(self, key, value) |
| 1448 | |
| 1449 | self.cache_queue_port = parse_ports(self.cache_queue_port) |
| 1450 | self.rdma_comm_ports = parse_ports(self.rdma_comm_ports) |
| 1451 | self.pd_comm_port = parse_ports(self.pd_comm_port) |
| 1452 | |
| 1453 | if self.model_cfg is not None: |
| 1454 | if self.model_cfg.quantization is not None and isinstance(self.model_cfg.quantization, dict): |
nothing calls this directly
no test coverage detected