(self, request, context)
| 513 | return backend_pb2.Result(success=False, message=f"LoadModel failed: {exc}") |
| 514 | |
async def Predict(self, request, context):
    """Run one non-streaming completion and return it as a single Reply.

    The prompt is rendered from ``request``, every piece produced by
    ``self._generate_tokens`` is collected, and the joined text is handed
    to ``self.tool_parser`` to split it into content, tool calls and
    (for the Hermes parser only) reasoning.

    Args:
        request: Incoming request message. ``Tokens`` and ``Temperature``
            are read here; the whole message is passed to
            ``self._render_prompt``.
        context: gRPC servicer context, used only to report a status
            code and details on failure.

    Returns:
        A ``backend_pb2.Reply``. On success it carries the UTF-8 encoded
        content, the number of generated pieces, the generation time in
        seconds (``time.monotonic`` delta) and one ``ChatDelta``. When no
        model is loaded, or when any exception is raised, an empty
        ``Reply`` is returned and the status is set on ``context``
        (``FAILED_PRECONDITION`` / ``INTERNAL`` respectively).
    """
    # Guard: nothing can be generated before a model has been loaded.
    if self.llm_model is None:
        context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
        context.set_details("LLM not loaded")
        return backend_pb2.Reply()

    try:
        prompt = self._render_prompt(request)

        # Non-positive values are treated as "unset" and replaced by
        # defaults. NOTE(review): this means a requested temperature of 0
        # is silently turned into 0.7 - confirm that is intended.
        token_budget = request.Tokens
        if not token_budget > 0:
            token_budget = 256
        sampling_temp = request.Temperature
        if not sampling_temp > 0:
            sampling_temp = 0.7

        # NOTE(review): the generator is drained synchronously inside an
        # async handler, so the event loop is presumably blocked for the
        # whole generation - confirm this is acceptable for this server.
        started = time.monotonic()
        chunks: list[str] = [
            text
            for _, text in self._generate_tokens(prompt, token_budget, sampling_temp)
        ]
        duration = time.monotonic() - started

        # One count per yielded piece.
        generated = len(chunks)
        raw_output = "".join(chunks)

        # Imported lazily, right before it is needed for the type check.
        from tool_parsers.hermes import HermesToolParser
        if not isinstance(self.tool_parser, HermesToolParser):
            # Other parsers only return content and tool calls.
            content, calls = self.tool_parser.parse(raw_output)
            reasoning = ""
        else:
            parsed = self.tool_parser.parse_full(raw_output)
            content = parsed.content
            calls = parsed.tool_calls
            reasoning = parsed.reasoning

        call_deltas = [
            backend_pb2.ToolCallDelta(
                index=call.index,
                id=call.id,
                name=call.name,
                arguments=call.arguments,
            )
            for call in calls
        ]
        chat_delta = backend_pb2.ChatDelta(
            content=content,
            reasoning_content=reasoning,
            tool_calls=call_deltas,
        )
        return backend_pb2.Reply(
            message=content.encode("utf-8"),
            tokens=generated,
            timing_token_generation=duration,
            chat_deltas=[chat_delta],
        )
    except Exception as exc:
        # RPC boundary: dump the traceback and report the failure through
        # the gRPC status instead of letting the exception propagate.
        import traceback
        traceback.print_exc()
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(f"Predict failed: {exc}")
        return backend_pb2.Reply()
| 563 | |
| 564 | async def PredictStream(self, request, context): |
| 565 | if self.llm_model is None: |
nothing calls this directly
no test coverage detected