hub / github.com/PaddlePaddle/FastDeploy / __init__

Method `__init__`

fastdeploy/config.py:1397–1479 · view source on GitHub ↗

Initialize the CacheConfig class. Args: block_size (int): Size of a cache block in number of tokens. gpu_memory_utilization (float): Fraction of GPU memory to use. cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.

(self, args)

Source from the content-addressed store, hash-verified

1395	"""
1396
1397	def __init__(self, args):
1398	"""
1399	Initialize the CacheConfig class.
1400
1401	Args:
1402	block_size (int): Size of a cache block in number of tokens.
1403	gpu_memory_utilization (float): Fraction of GPU memory to use.
1404	cache_dtype (str): Data type for cache storage. Default is 'bfloat16'.
1405	num_gpu_blocks_override (Optional[int]): Override for number of GPU blocks.
1406	num_cpu_blocks (Optional[int]): Number of CPU blocks.
1407	kv_cache_ratio (float): Ratio for max block calculation.
1408	enc_dec_block_num (int): Number of encoder-decoder blocks.
1409	prealloc_dec_block_slot_num_threshold (int): Number of token slot threshold to allocate next blocks for decoding, used when ENABLE_V1_KVCACHE_SCHEDULER=1.
1410	enable_prefix_caching (bool): Enable prefix caching.
1411	max_encoder_cache(int): Maximum number of tokens in the encoder cache.
1412	max_processor_cache(int): Maximum number of bytes in the processor cache.
1413	"""
1414	self.block_size = 64
1415	self.gpu_memory_utilization = 0.9
1416	self.num_gpu_blocks_override = None
1417	if envs.ENABLE_V1_KVCACHE_SCHEDULER:
1418	self.kv_cache_ratio = 1.0
1419	else:
1420	self.kv_cache_ratio = 0.75
1421	self.enc_dec_block_num = envs.FD_ENC_DEC_BLOCK_NUM
1422	self.prealloc_dec_block_slot_num_threshold = 12
1423	self.cache_dtype = "bfloat16"
1424	self.model_cfg = None
1425	self.enable_chunked_prefill = False
1426	self.rdma_comm_ports = None
1427	self.local_rdma_comm_ports = None
1428	self.cache_transfer_protocol = None
1429	self.pd_comm_port = None
1430	self.local_pd_comm_port = None
1431	self.enable_prefix_caching = False
1432	self.enable_ssd_cache = False
1433	self.cache_queue_port = None
1434	self.local_cache_queue_port = None
1435	self.swap_space = None
1436	self.max_encoder_cache = None
1437	self.max_processor_cache = None
1438	self.enable_output_caching = False
1439	self.disable_chunked_mm_input = False
1440	self.kvcache_storage_backend = None
1441	self.write_policy = None
1442	self.num_cpu_blocks = None
1443	self.use_mla_cache = envs.FD_ATTENTION_BACKEND == "MLA_ATTN"
1444
1445	for key, value in args.items():
1446	if hasattr(self, key):
1447	setattr(self, key, value)
1448
1449	self.cache_queue_port = parse_ports(self.cache_queue_port)
1450	self.rdma_comm_ports = parse_ports(self.rdma_comm_ports)
1451	self.pd_comm_port = parse_ports(self.pd_comm_port)
1452
1453	if self.model_cfg is not None:
1454	if self.model_cfg.quantization is not None and isinstance(self.model_cfg.quantization, dict):

Callers

nothing calls this directly

Calls 5

get_cache_bytesMethod · 0.95

_verify_argsMethod · 0.95

parse_portsFunction · 0.90

itemsMethod · 0.80

getMethod · 0.45

Tested by

no test coverage detected

Method __init__

Source from the content-addressed store, hash-verified

Callers

Calls 5

Tested by

Method `__init__`