(self, request, context)
| 562 | return backend_pb2.Reply() |
| 563 | |
async def PredictStream(self, request, context):
    """Stream generated text, then emit any parsed tool calls / reasoning.

    Async generator: yields one ``backend_pb2.Reply`` per text fragment
    produced by ``self._generate_tokens``. Once generation finishes, the
    complete text is handed to ``self.tool_parser``; if that yields tool
    calls or reasoning, one extra final ``Reply`` carrying them is emitted.

    Args:
        request: Incoming request. ``Tokens`` and ``Temperature`` are read
            here; non-positive values fall back to 256 and 0.7.
        context: gRPC servicer context, used only to report status codes.

    Yields:
        ``backend_pb2.Reply`` messages as described above.

    Errors are not raised to the caller: when no LLM is loaded the stream
    ends immediately with ``FAILED_PRECONDITION``; any exception raised
    while rendering, generating or parsing is printed and reported as
    ``INTERNAL``.
    """
    if self.llm_model is None:
        context.set_code(grpc.StatusCode.FAILED_PRECONDITION)
        context.set_details("LLM not loaded")
        return

    try:
        prompt = self._render_prompt(request)
        max_new = request.Tokens if request.Tokens > 0 else 256
        temperature = request.Temperature if request.Temperature > 0 else 0.7

        # Collect fragments in a list and join once afterwards; repeated
        # ``str +=`` in the loop can degrade to quadratic copying.
        pieces = []
        for _, text in self._generate_tokens(prompt, max_new, temperature):
            pieces.append(text)
            yield backend_pb2.Reply(
                message=text.encode("utf-8"),
                chat_deltas=[backend_pb2.ChatDelta(content=text)],
            )
        buffer = "".join(pieces)

        # Final emission carries the extracted tool calls (vLLM semantics).
        # Imported at call time, as in the original implementation.
        from tool_parsers.hermes import HermesToolParser
        if isinstance(self.tool_parser, HermesToolParser):
            # The Hermes parser is the only branch that also supplies
            # reasoning text alongside the tool calls.
            result = self.tool_parser.parse_full(buffer)
            calls = result.tool_calls
            reasoning = result.reasoning
        else:
            _, calls = self.tool_parser.parse(buffer)
            reasoning = ""

        # Skip the trailing message entirely when there is nothing to add.
        if calls or reasoning:
            yield backend_pb2.Reply(
                chat_deltas=[backend_pb2.ChatDelta(
                    reasoning_content=reasoning,
                    tool_calls=[
                        backend_pb2.ToolCallDelta(index=c.index, id=c.id, name=c.name, arguments=c.arguments)
                        for c in calls
                    ],
                )],
            )
    except Exception as exc:
        # Top-level RPC boundary: dump the traceback and surface the
        # failure through the gRPC status instead of propagating.
        import traceback
        traceback.print_exc()
        context.set_code(grpc.StatusCode.INTERNAL)
        context.set_details(f"PredictStream failed: {exc}")
| 608 | |
| 609 | async def Embedding(self, request, context): |
| 610 | if self.llm_model is None or self.llm_tokenizer is None: |
nothing calls this directly
no test coverage detected