Insert a cache entry after generation completes. Thread-safe. Handles LRU eviction if max_size is exceeded. Args: model: Model identifier (used to namespace caches); tokens: The full token sequence (prompt + generated); prompt_cache: The KV cache to store.
(
self, model, tokens: List[int], prompt_cache: List[Any]
)
| 214 | return None, list(tokens) |
| 215 | |
def insert_cache(
    self, model, tokens: List[int], prompt_cache: List[Any]
) -> None:
    """
    Record the KV cache produced for a finished generation.

    The whole operation runs while holding ``self._lock``. Entries live in
    a per-model trie keyed token by token; the entry itself sits under the
    string key ``"cache"`` at the node reached after the last token.

    If an entry already exists for this exact token sequence, its ``count``
    is incremented and the stored cache is left as it is (the
    ``prompt_cache`` passed in is not stored). Otherwise a new
    ``CacheEntry(prompt_cache, 1)`` is created.

    The ``(model, tokens)`` key is then moved to the most-recent end of
    ``self._lru``; when that pushes the LRU past ``self.max_size``, the
    oldest key is popped and handed to ``self._delete``.

    Args:
        model: Model identifier (used to namespace caches)
        tokens: The full token sequence (prompt + generated)
        prompt_cache: The KV cache to store
    """
    with self._lock:
        # Tuple form so the key is hashable/comparable inside the LRU.
        lru_key = (model, tuple(tokens))

        # Walk the model's trie, creating missing levels along the way.
        node = self._cache.setdefault(model, {})
        for token in lru_key[1]:
            node = node.setdefault(token, {})

        if "cache" not in node:
            # First time this exact sequence is stored.
            node["cache"] = CacheEntry(prompt_cache, 1)
        else:
            # Same sequence inserted again: bump its count and drop the
            # stale LRU position so it can be re-appended as most recent.
            node["cache"].count += 1
            self._lru.remove(lru_key)

        # Newest entries sit at the right end of the LRU.
        self._lru.append(lru_key)

        if len(self._lru) <= self.max_size:
            return

        # Over capacity: evict the least recently used entry (left end).
        # NOTE(review): _delete is defined outside this view; presumably it
        # removes the entry from the trie - confirm.
        oldest_model, oldest_tokens = self._lru.popleft()
        self._delete(oldest_model, oldest_tokens)
| 256 | |
| 257 | def clear(self) -> None: |
| 258 | """Clear all cache entries. Thread-safe.""" |