MCPcopy Index your code
hub / github.com/PaddlePaddle/FastDeploy / __init__

Method __init__

fastdeploy/config.py:1397–1479  ·  view source on GitHub ↗

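Initialize the CacheConfig class from `args`, a mapping of option names to values; each key that matches an existing attribute overrides that attribute's default. Args: block_size (int): Size of a cache block in number of tokens. gpu_memory_utilization (float): Fraction of GPU memory to use. cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.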

(self, args)

Source from the content-addressed store, hash-verified

1395 """
1396
1397 def __init__(self, args):
1398 """
1399 Initialize the CacheConfig class.
1400
1401 Args:
1402 block_size (int): Size of a cache block in number of tokens.
1403 gpu_memory_utilization (float): Fraction of GPU memory to use.
1404 cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
1405 num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
1406 num_cpu_blocks (Optional[int]): Number of CPU blocks.
1407 kv_cache_ratio (float): Ratio for max block calculation.
1408 enc_dec_block_num (int): Number of encoder-decoder blocks.
1409 prealloc_dec_block_slot_num_threshold (int): Number of token slot threshold to allocate next blocks for decoding, used when ENABLE_V1_KVCACHE_SCHEDULER=1.
1410 enable_prefix_caching (bool): Enable prefix caching.
1411 max_encoder_cache(int): Maximum number of tokens in the encoder cache.
1412 max_processor_cache(int): Maximum number of bytes in the processor cache.
1413 """
1414 self.block_size = 64
1415 self.gpu_memory_utilization = 0.9
1416 self.num_gpu_blocks_override = None
1417 if envs.ENABLE_V1_KVCACHE_SCHEDULER:
1418 self.kv_cache_ratio = 1.0
1419 else:
1420 self.kv_cache_ratio = 0.75
1421 self.enc_dec_block_num = envs.FD_ENC_DEC_BLOCK_NUM
1422 self.prealloc_dec_block_slot_num_threshold = 12
1423 self.cache_dtype = "bfloat16"
1424 self.model_cfg = None
1425 self.enable_chunked_prefill = False
1426 self.rdma_comm_ports = None
1427 self.local_rdma_comm_ports = None
1428 self.cache_transfer_protocol = None
1429 self.pd_comm_port = None
1430 self.local_pd_comm_port = None
1431 self.enable_prefix_caching = False
1432 self.enable_ssd_cache = False
1433 self.cache_queue_port = None
1434 self.local_cache_queue_port = None
1435 self.swap_space = None
1436 self.max_encoder_cache = None
1437 self.max_processor_cache = None
1438 self.enable_output_caching = False
1439 self.disable_chunked_mm_input = False
1440 self.kvcache_storage_backend = None
1441 self.write_policy = None
1442 self.num_cpu_blocks = None
1443 self.use_mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
1444
1445 for key, value in args.items():
1446 if hasattr(self, key):
1447 setattr(self, key, value)
1448
1449 self.cache_queue_port = parse_ports(self.cache_queue_port)
1450 self.rdma_comm_ports = parse_ports(self.rdma_comm_ports)
1451 self.pd_comm_port = parse_ports(self.pd_comm_port)
1452
1453 if self.model_cfg is not None:
1454 if self.model_cfg.quantization is not None and isinstance(self.model_cfg.quantization, dict):

Callers

nothing calls this directly

Calls 5

get_cache_bytes · Method · 0.95
_verify_args · Method · 0.95
parse_ports · Function · 0.90
items · Method · 0.80
get · Method · 0.45

Tested by

no test coverage detected