This method tests that prompts sharing a common prefix benefit from cached KV states.
(self)
| 209 | self.tearDown() |
| 210 | |
def test_prefix_cache_reuse(self):
    """
    Send two prompts that share a common prefix and check both are answered.

    The model is loaded over gRPC, a base prompt is submitted, and then the
    same prompt with extra text appended is submitted. The test asserts that
    the model loads successfully and that each Predict reply carries a
    message; it does not itself measure whether cached KV states were reused.
    """
    shared_prefix = "Once upon a time in a land far away, "
    continuation = "there lived a brave knight who "

    try:
        self.setUp()
        with grpc.insecure_channel("localhost:50051") as conn:
            client = backend_pb2_grpc.BackendStub(conn)
            model_opts = backend_pb2.ModelOptions(Model="mlx-community/Llama-3.2-1B-Instruct-4bit")
            load_result = client.LoadModel(model_opts)
            self.assertTrue(load_result.success)

            # Base prompt goes first so the extended prompt can build on it.
            # NOTE(review): if `message` is a protobuf scalar field it is never
            # None, which would make these checks vacuous -- confirm against the
            # .proto definition.
            base_reply = client.Predict(
                backend_pb2.PredictOptions(Prompt=shared_prefix, Tokens=10)
            )
            self.assertIsNotNone(base_reply.message)

            # Same prefix with additional text appended.
            extended_reply = client.Predict(
                backend_pb2.PredictOptions(Prompt=shared_prefix + continuation, Tokens=10)
            )
            self.assertIsNotNone(extended_reply.message)

            base_size = len(base_reply.message)
            extended_size = len(extended_reply.message)
            print(f"Prefix cache test passed: base={base_size} bytes, extended={extended_size} bytes")

    except Exception as exc:
        # Any error, including a failed assertion above, is printed and then
        # reported as a single test failure.
        print(exc)
        self.fail("Prefix cache reuse test failed")
    finally:
        self.tearDown()
| 241 | |
| 242 | |
| 243 | def test_tokenize_string(self): |