| 509 | return backend_pb2.ScoreResponse() |
| 510 | |
| 511 | async def _predict(self, request, context, streaming=False): |
| 512 | # Build the sampling parameters |
| 513 | # NOTE: this must stay in sync with the vllm backend |
| 514 | request_to_sampling_params = { |
| 515 | "N": "n", |
| 516 | "PresencePenalty": "presence_penalty", |
| 517 | "FrequencyPenalty": "frequency_penalty", |
| 518 | "RepetitionPenalty": "repetition_penalty", |
| 519 | "Temperature": "temperature", |
| 520 | "TopP": "top_p", |
| 521 | "TopK": "top_k", |
| 522 | "MinP": "min_p", |
| 523 | "Seed": "seed", |
| 524 | "StopPrompts": "stop", |
| 525 | "StopTokenIds": "stop_token_ids", |
| 526 | "BadWords": "bad_words", |
| 527 | "IncludeStopStrInOutput": "include_stop_str_in_output", |
| 528 | "IgnoreEOS": "ignore_eos", |
| 529 | "Tokens": "max_tokens", |
| 530 | "MinTokens": "min_tokens", |
| 531 | "Logprobs": "logprobs", |
| 532 | "PromptLogprobs": "prompt_logprobs", |
| 533 | "SkipSpecialTokens": "skip_special_tokens", |
| 534 | "SpacesBetweenSpecialTokens": "spaces_between_special_tokens", |
| 535 | "TruncatePromptTokens": "truncate_prompt_tokens", |
| 536 | } |
| 537 | |
| 538 | sampling_params = SamplingParams(top_p=0.9, max_tokens=200) |
| 539 | |
| 540 | for request_field, param_field in request_to_sampling_params.items(): |
| 541 | if hasattr(request, request_field): |
| 542 | value = getattr(request, request_field) |
| 543 | if value not in (None, 0, [], False): |
| 544 | setattr(sampling_params, param_field, value) |
| 545 | |
| 546 | # Structured-output decoding: use Grammar field to pass JSON schema or BNF |
| 547 | if HAS_GUIDED_DECODING and request.Grammar: |
| 548 | try: |
| 549 | json.loads(request.Grammar) # valid JSON = JSON schema |
| 550 | sampling_params.structured_outputs = StructuredOutputsParams(json=request.Grammar) |
| 551 | except json.JSONDecodeError: |
| 552 | sampling_params.structured_outputs = StructuredOutputsParams(grammar=request.Grammar) |
| 553 | |
| 554 | # Extract image paths and process images |
| 555 | prompt = request.Prompt |
| 556 | |
| 557 | image_paths = request.Images |
| 558 | image_data = [self.load_image(img_path) for img_path in image_paths] |
| 559 | |
| 560 | videos_path = request.Videos |
| 561 | video_data = [self.load_video(video_path) for video_path in videos_path] |
| 562 | |
| 563 | # If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template |
| 564 | if not request.Prompt and request.UseTokenizerTemplate and request.Messages: |
| 565 | messages_dicts = self._messages_to_dicts(request.Messages) |
| 566 | template_kwargs = {"tokenize": False, "add_generation_prompt": True} |
| 567 | |
| 568 | # Pass tools for tool calling |