Method test_cache_reuse

backend/python/mlx/test.py:179–209 · view source on GitHub ↗

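This method tests that repeated prompts reuse cached KV states: it sends the same prompt twice, so the second request should benefit from the cached prompt processing. Note that the test only asserts on the model load result and on each response's message; it does not measure whether the cache was actually used.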

(self)

Source from the content-addressed store, hash-verified

177	self.tearDown()
178
def test_cache_reuse(self):
    """
    This method tests that repeated prompts reuse cached KV states.

    The same prompt is sent twice; the second request should benefit from
    the cached prompt processing. Whether the cache was actually hit is not
    measured here: the test only checks that the model loads and that both
    requests return a non-empty message.
    """
    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            response = stub.LoadModel(backend_pb2.ModelOptions(Model="mlx-community/Llama-3.2-1B-Instruct-4bit"))
            self.assertTrue(response.success)

            prompt = "The quick brown fox jumps over the lazy dog. "

            # First request - populates cache
            req1 = backend_pb2.PredictOptions(Prompt=prompt, Tokens=10)
            resp1 = stub.Predict(req1)
            # A protobuf scalar field is never None, so an "is not None"
            # check cannot fail; assert on the content instead.
            self.assertTrue(resp1.message, "first request returned an empty message")

            # Second request with same prompt - should reuse cache
            req2 = backend_pb2.PredictOptions(Prompt=prompt, Tokens=10)
            resp2 = stub.Predict(req2)
            self.assertTrue(resp2.message, "second request returned an empty message")

            print(f"Cache reuse test passed: first={len(resp1.message)} bytes, second={len(resp2.message)} bytes")

    except Exception as err:
        print(err)
        # Surface the underlying cause in the failure message rather than
        # only on stdout, so the test report shows what went wrong.
        self.fail(f"Cache reuse test failed: {err}")
    finally:
        self.tearDown()
210
211	def test_prefix_cache_reuse(self):
212	"""

nothing calls this directly

setUpMethod · 0.95

tearDownMethod · 0.95

LoadModelMethod · 0.65

PredictMethod · 0.65

no test coverage detected