Asynchronous version of `__call__` that properly handles the async OpenAI client.
(
self,
tokenized_prediction: list[int],
prediction: str,
label: str,
query: str,
rollout_state: dict | None = None,
)
| 748 | ) / 1_000_000 |
| 749 | |
| 750 | async def async_call( |
| 751 | self, |
| 752 | tokenized_prediction: list[int], |
| 753 | prediction: str, |
| 754 | label: str, |
| 755 | query: str, |
| 756 | rollout_state: dict | None = None, |
| 757 | ) -> VerificationResult: |
| 758 | """ |
| 759 | Asynchronous version of __call__ that properly handles the async OpenAI client. |
| 760 | """ |
| 761 | final_answer = extract_final_answer(prediction) |
| 762 | prompt = self.prompt_template.format(input=query, output=final_answer, label=label) |
| 763 | |
| 764 | max_retries = 3 # for rate limits |
| 765 | retry_delay = 1.0 |
| 766 | |
| 767 | for attempt in range(max_retries): |
| 768 | # judges the quality of a response |
| 769 | try: |
| 770 | messages = build_messages(prompt) |
| 771 | |
| 772 | # Check if the request would exceed context window |
| 773 | if not context_window_checker.check_context_window_limit( |
| 774 | messages=messages, |
| 775 | max_completion_tokens=self.verifier_config.llm_judge_max_tokens, |
| 776 | model_name=self.verifier_config.llm_judge_model, |
| 777 | max_context_length=self.verifier_config.llm_judge_max_context_length, # Adjust based on your model |
| 778 | safety_margin=150, |
| 779 | ): |
| 780 | # Try to truncate messages to fit |
| 781 | messages = context_window_checker.truncate_messages_to_fit_context( |
| 782 | messages=messages, |
| 783 | max_completion_tokens=self.verifier_config.llm_judge_max_tokens, |
| 784 | model_name=self.verifier_config.llm_judge_model, |
| 785 | max_context_length=self.verifier_config.llm_judge_max_context_length, |
| 786 | safety_margin=200, |
| 787 | ) |
| 788 | |
| 789 | # Check again after truncation |
| 790 | if not context_window_checker.check_context_window_limit( |
| 791 | messages=messages, |
| 792 | max_completion_tokens=self.verifier_config.llm_judge_max_tokens, |
| 793 | model_name=self.verifier_config.llm_judge_model, |
| 794 | max_context_length=self.verifier_config.llm_judge_max_context_length, |
| 795 | safety_margin=150, |
| 796 | ): |
| 797 | logger.error("Cannot fit request within context window even after truncation.") |
| 798 | return VerificationResult(score=0.0, cost=0.0, reasoning="Error: Context window exceeded") |
| 799 | # end of Faeze's context window check |
| 800 | response = await run_litellm_async_raw( |
| 801 | model_name=self.verifier_config.llm_judge_model, |
| 802 | messages=messages, |
| 803 | temperature=self.verifier_config.llm_judge_temperature, |
| 804 | max_completion_tokens=self.verifier_config.llm_judge_max_tokens, |
| 805 | seed=self.verifier_config.seed, |
| 806 | timeout=self.verifier_config.llm_judge_timeout, |
| 807 | ) |
no test coverage detected