Rank the responses using the LLM.
(prompt: str, responses: List[Dict[str, Any]])
| 37 | } |
| 38 | |
def _parse_ranking(ranking_str: str, num_responses: int) -> List[int]:
    """Turn raw LLM ranking text into a complete list of response indices.

    Every run of digits in ``ranking_str`` is read as a candidate index, so
    output such as ``"[1, 0, 2]"``, ``"1,0,2,"`` or ``"Ranking: 1, 0, 2."`` is
    accepted. Indices outside ``range(num_responses)`` and repeated indices
    are dropped; indices the model omitted are appended at the end in their
    original order so the result is always a permutation.

    Raises:
        ValueError: if no valid index can be found in ``ranking_str``.
    """
    import re  # local import: the file's import section is not part of this block

    ranking: List[int] = []
    seen = set()
    for token in re.findall(r"\d+", ranking_str):
        idx = int(token)
        if idx < num_responses and idx not in seen:
            seen.add(idx)
            ranking.append(idx)

    if not ranking:
        # Keep the exception type the plain int() parse raised on junk output.
        raise ValueError(f"Could not parse a ranking from: {ranking_str!r}")

    # Responses the model forgot to mention are ranked last.
    ranking.extend(i for i in range(num_responses) if i not in seen)
    return ranking


async def rank_responses(prompt: str, responses: List[Dict[str, Any]]) -> List[int]:
    """Rank the responses using the LLM.

    Args:
        prompt: The prompt the responses were generated for.
        responses: Candidate responses; each dict must have a ``'content'`` key.

    Returns:
        Indices into ``responses`` ordered from best to worst. An empty
        ``responses`` list yields ``[]`` without calling the model.

    Raises:
        ValueError: if the model's reply contains no usable index.
    """
    if not responses:
        # Nothing to rank, so skip the API call.
        return []

    instructions = (
        f"Given the following prompt:\n\n{prompt}\n\n"
        "Rank the following responses from best to worst, considering "
        "accuracy, completeness, and relevance. Provide the ranking as a "
        "comma-separated list of indices (0-indexed). Do not add any "
        "explanations or any other text other than the comma-separated "
        "list.\n\n"
    )
    ranking_prompt = instructions + "".join(
        f"Response {i}:\n{response['content']}\n\n"
        for i, response in enumerate(responses)
    )

    # The context manager closes the client's HTTP resources after the call.
    async with AsyncOpenAI() as client:
        ranking_response = await client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": ranking_prompt}],
        )

    # The message content is optional in the API response; treat None as empty.
    ranking_str = (ranking_response.choices[0].message.content or "").strip()
    print(f"Ranking str: {ranking_str}")
    return _parse_ranking(ranking_str, len(responses))
| 53 | |
| 54 | async def process_sample(sample: Dict[str, Any]) -> Dict[str, Any]: |
| 55 | """Process a single sample from the dataset.""" |
no test coverage detected