(self, request, generated_text, last_response)
| 487 | return text[:earliest] if earliest < len(text) else text |
| 488 | |
def _finalize_output(self, request, generated_text, last_response):
    """Turn a finished generation into the pieces of the final reply.

    Args:
        request: Incoming request. ``Tools`` (JSON text) is read when a tool
            module is available, and ``Logprobs`` decides whether a logprobs
            payload is produced.
        generated_text: Complete text produced by the model.
        last_response: Final generation step object, or ``None``. Its
            ``prompt_tokens``, ``generation_tokens``, ``token`` and
            ``logprobs`` attributes are read when present.

    Returns:
        A tuple ``(content, reasoning_content, tool_calls_proto,
        prompt_token_count, completion_token_count, logprobs_bytes)``.
        ``logprobs_bytes`` is UTF-8 encoded JSON, or ``b""`` when logprobs
        were not requested, are unavailable, or could not be extracted.
    """
    content = generated_text
    reasoning_content = ""
    if getattr(self.tokenizer, "has_thinking", False):
        think_start = getattr(self.tokenizer, "think_start", "") or ""
        think_end = getattr(self.tokenizer, "think_end", "") or ""
        reasoning_content, content = split_reasoning(content, think_start, think_end)

    tool_calls_proto: List[backend_pb2.ToolCallDelta] = []
    tool_module = None
    if getattr(self.tokenizer, "has_tool_calling", False):
        tool_module = self._tool_module_from_tokenizer()
    if tool_module is not None:
        parsed_tools = None
        if request.Tools:
            try:
                parsed_tools = json.loads(request.Tools)
            except json.JSONDecodeError:
                # Malformed tool definitions: parse the calls without them.
                parsed_tools = None
        calls, content = parse_tool_calls(content, tool_module, parsed_tools)
        tool_calls_proto.extend(
            backend_pb2.ToolCallDelta(
                index=c["index"], id=c["id"], name=c["name"], arguments=c["arguments"],
            )
            for c in calls
        )

    # Use the same "is not None" presence test as the logprobs branch below;
    # the getattr defaults already cover objects that lack the counters.
    has_response = last_response is not None
    prompt_token_count = int(getattr(last_response, "prompt_tokens", 0) or 0) if has_response else 0
    completion_token_count = int(getattr(last_response, "generation_tokens", 0) or 0) if has_response else 0

    logprobs_bytes = b""
    if has_response and int(getattr(request, "Logprobs", 0) or 0) > 0:
        try:
            lp = getattr(last_response, "logprobs", None)
            raw_token = getattr(last_response, "token", None)
            # Token id 0 is a legitimate vocabulary entry, so "missing" is
            # detected with None rather than truthiness. Without a token there
            # is nothing meaningful to report, so the payload stays empty
            # instead of pairing an empty token with the logprob of id 0.
            if lp is not None and raw_token is not None:
                token_id = int(raw_token)
                token_text = self.tokenizer.decode([token_id])
                top_logprob = float(lp[token_id]) if hasattr(lp, "__getitem__") else 0.0
                logprobs_bytes = json.dumps(
                    {"content": [{"token": token_text, "logprob": top_logprob}]}
                ).encode("utf-8")
        except Exception as e:
            # Best effort: a logprobs failure must not lose the generated reply.
            print(f"[Rank 0] Logprobs extraction failed: {e}", file=sys.stderr)

    return content, reasoning_content, tool_calls_proto, prompt_token_count, completion_token_count, logprobs_bytes
| 534 | |
| 535 | def _build_generation_params(self, request, default_max_tokens=200): |
| 536 | import mlx.core as mx |
no test coverage detected