hub / github.com/mudler/LocalAI / _search

Method _search

backend/python/mlx/mlx_cache.py:76–127 · view source on GitHub ↗

Search the cache for a prompt cache. Return exact or close match. The cache is organized as a trie where each node is keyed by a token. This allows efficient prefix matching.

(self, model, tokens: List[int])

Source from the content-addressed store, hash-verified

74	self._trim_fn = trim_fn
75
76	def _search(self, model, tokens: List[int]) -> SearchResult:
77	"""
78	Search the cache for a prompt cache. Return exact or close match.
79
80	The cache is organized as a trie where each node is keyed by a token.
81	This allows efficient prefix matching.
82	"""
83	if model not in self._cache:
84	return SearchResult(model, None, None, None, 0)
85
86	current = self._cache[model]
87	last_cache_index = -1
88	index = 0
89
90	# Traverse the trie following the token sequence
91	while index < len(tokens) and tokens[index] in current:
92	current = current[tokens[index]]
93	if "cache" in current:
94	last_cache_index = index
95	index += 1
96
97	# Exact match - no need to search for longer or shorter caches
98	if last_cache_index == len(tokens) - 1:
99	return SearchResult(model, tuple(tokens), None, None, 0)
100
101	# Find the shorter cache (a prefix that has a cache)
102	# Note: Uses > 0 (not >= 0) to match upstream mlx_lm/server.py behavior.
103	# Single-token prefixes are not matched, which allows longer cached
104	# sequences to be preferred for trimming. This is acceptable because
105	# real prompts with chat templates are always many tokens.
106	shorter = None
107	if last_cache_index > 0:
108	shorter = tuple(tokens[: last_cache_index + 1])
109
110	# Check for caches that are longer than our token sequence
111	longer = None
112	common_prefix = index
113	if index > 0 and last_cache_index <= 0:
114	best = None
115	stack = [(current, [])]
116	while stack:
117	current, extra = stack.pop()
118	if "cache" in current:
119	if best is None or len(extra) < len(best):
120	best = extra
121	else:
122	for tok in current:
123	stack.append((current[tok], extra + [tok]))
124	if best is not None:
125	longer = tuple(tokens[:index] + best)
126
127	return SearchResult(model, None, shorter, longer, common_prefix)
128
129	def _get(self, model, tokens: Tuple[int, ...]) -> CacheEntry:
130	"""Get a cache entry by traversing the trie."""

Callers 1

fetch_nearest_cacheMethod · 0.95

Calls 3

popMethod · 0.80

appendMethod · 0.80

SearchResultClass · 0.70

Tested by

no test coverage detected