This method tests that repeated prompts reuse cached KV states. The second request should benefit from the cached prompt processing.
(self)
| 177 | self.tearDown() |
| 178 | |
def test_cache_reuse(self):
    """
    Send the same prompt to the backend twice in a row.

    After loading the model over gRPC, two identical Predict requests
    (10 tokens each) are issued back to back. The second one is meant to
    hit whatever prompt/KV cache the first one populated; note that this
    test only asserts that both responses carry a message and does not
    measure cache usage itself.
    """
    try:
        # The fixture is started explicitly here and stopped in `finally`.
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as conn:
            client = backend_pb2_grpc.BackendStub(conn)

            model_options = backend_pb2.ModelOptions(
                Model="mlx-community/Llama-3.2-1B-Instruct-4bit"
            )
            load_result = client.LoadModel(model_options)
            self.assertTrue(load_result.success)

            repeated_prompt = "The quick brown fox jumps over the lazy dog. "

            # Initial call: expected to fill the cache for this prompt.
            resp1 = client.Predict(
                backend_pb2.PredictOptions(Prompt=repeated_prompt, Tokens=10)
            )
            self.assertIsNotNone(resp1.message)

            # Identical follow-up call: expected to be served from the cache.
            resp2 = client.Predict(
                backend_pb2.PredictOptions(Prompt=repeated_prompt, Tokens=10)
            )
            self.assertIsNotNone(resp2.message)

            print(f"Cache reuse test passed: first={len(resp1.message)} bytes, second={len(resp2.message)} bytes")

    except Exception as exc:
        # Any error (including a failed assertion above) is printed and
        # reported as a generic test failure.
        print(exc)
        self.fail("Cache reuse test failed")
    finally:
        self.tearDown()
| 210 | |
| 211 | def test_prefix_cache_reuse(self): |
| 212 | """ |