Build a ChatDelta + token counts + logprobs from accumulated output. Returns ``(content, reasoning_content, tool_calls_proto, prompt_token_count, completion_token_count, logprobs_bytes)``.
(self, request, generated_text, last_response)
| 603 | ) |
| 604 | |
| 605 | def _finalize_output(self, request, generated_text, last_response): |
| 606 | """Build a ChatDelta + token counts + logprobs from accumulated output. |
| 607 | |
| 608 | Returns ``(content, reasoning_content, tool_calls_proto, |
| 609 | prompt_token_count, completion_token_count, logprobs_bytes)``. |
| 610 | """ |
| 611 | content = generated_text |
| 612 | reasoning_content = "" |
| 613 | |
| 614 | if getattr(self.tokenizer, "has_thinking", False): |
| 615 | think_start = getattr(self.tokenizer, "think_start", "") or "" |
| 616 | think_end = getattr(self.tokenizer, "think_end", "") or "" |
| 617 | reasoning_content, content = split_reasoning(content, think_start, think_end) |
| 618 | |
| 619 | tool_calls_proto: List[backend_pb2.ToolCallDelta] = [] |
| 620 | tool_module = None |
| 621 | if getattr(self.tokenizer, "has_tool_calling", False): |
| 622 | tool_module = self._tool_module_from_tokenizer() |
| 623 | if tool_module is not None: |
| 624 | parsed_tools = None |
| 625 | if request.Tools: |
| 626 | try: |
| 627 | parsed_tools = json.loads(request.Tools) |
| 628 | except json.JSONDecodeError: |
| 629 | parsed_tools = None |
| 630 | calls, content = parse_tool_calls(content, tool_module, parsed_tools) |
| 631 | for c in calls: |
| 632 | tool_calls_proto.append( |
| 633 | backend_pb2.ToolCallDelta( |
| 634 | index=c["index"], |
| 635 | id=c["id"], |
| 636 | name=c["name"], |
| 637 | arguments=c["arguments"], |
| 638 | ) |
| 639 | ) |
| 640 | |
| 641 | prompt_token_count = int(getattr(last_response, "prompt_tokens", 0) or 0) if last_response else 0 |
| 642 | completion_token_count = int(getattr(last_response, "generation_tokens", 0) or 0) if last_response else 0 |
| 643 | |
| 644 | logprobs_bytes = b"" |
| 645 | # Logprobs extraction — only when the request asked for them. |
| 646 | if last_response is not None and int(getattr(request, "Logprobs", 0) or 0) > 0: |
| 647 | try: |
| 648 | lp = getattr(last_response, "logprobs", None) |
| 649 | if lp is not None: |
| 650 | # GenerationResponse.logprobs on the last chunk is the |
| 651 | # logprob distribution of the final token. Without a |
| 652 | # per-token history we at minimum surface the last token's |
| 653 | # top-1 logprob so clients get a non-empty field. |
| 654 | token_id = int(getattr(last_response, "token", 0) or 0) |
| 655 | token_text = self.tokenizer.decode([token_id]) if token_id else "" |
| 656 | top_logprob = float(lp[token_id]) if hasattr(lp, "__getitem__") else 0.0 |
| 657 | logprobs_bytes = json.dumps( |
| 658 | { |
| 659 | "content": [ |
| 660 | {"token": token_text, "logprob": top_logprob} |
| 661 | ] |
| 662 | } |
no test coverage detected