Rate response quality 0-10 via LLM-as-judge.
(context: str, question: str, response: str)
| 104 | # PRM Judge (one judge request per response; token cap set via max_completion_tokens, not max_tokens) # |
| 105 | # ------------------------------------------------------------------ # |
def _parse_judge_reply(text: str) -> dict:
    """Convert the judge model's raw reply into a ``{"score", "reasoning"}`` dict.

    The reply is expected to contain one JSON object.  Anything surrounding
    it (Markdown code fences, a ``json`` language tag, prose before or after)
    is discarded by slicing from the first ``{`` to the last ``}``.

    Returns:
        ``{"score": <float in [0, 1]>, "reasoning": <value from the reply>}``.
        A missing ``score`` key defaults to 5 (i.e. 0.5 after scaling); a
        missing ``reasoning`` key defaults to ``""``.

    Raises:
        ValueError: the text is not valid JSON, or the score is NaN.
        TypeError / AttributeError: the JSON is not an object, or the score
            is not convertible to ``float``.
    """
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]
    result = json.loads(text)

    raw = float(result.get("score", 5))
    # json.loads accepts the non-standard NaN literal; NaN compares unequal to
    # itself and would otherwise survive the clamp below.
    if raw != raw:
        raise ValueError("judge returned a NaN score")

    # The prompt asks for 0-10, but the model is not guaranteed to comply;
    # clamp so the normalized score always lies in [0, 1].
    clamped = min(max(raw, 0.0), 10.0)
    return {"score": clamped / 10.0, "reasoning": result.get("reasoning", "")}


async def prm_judge(context: str, question: str, response: str) -> dict:
    """Rate response quality 0-10 via LLM-as-judge, normalized to 0.0-1.0.

    Sends one chat-completions request asking the model to grade ``response``
    (only its first 800 characters are included in the prompt) and parses the
    JSON verdict.  The request is attempted up to three times, retrying only
    on HTTP 429 with a 15 s, then 30 s pause between attempts.

    Args:
        context: Text inserted in the prompt as context from previous sessions.
        question: The user question the response is answering.
        response: The AI response to grade.

    Returns:
        A dict with two keys:

        * ``"score"`` -- the judge's score divided by 10 and clamped to
          ``[0.0, 1.0]``, or the neutral fallback ``0.5`` on any failure.
        * ``"reasoning"`` -- the judge's explanation on success, otherwise a
          failure tag: ``"rate_limited"`` (every attempt returned 429),
          ``"http_error_<status>"`` (any other HTTP error status), or
          ``"parse_error"`` (any other exception).

    Exceptions derived from ``Exception`` are logged and converted into the
    fallback result rather than propagated.
    """
    prompt = (
        "Rate this AI response on a scale of 0 to 10.\n"
        f"Context from previous sessions: {context}\n"
        f"User question: {question}\n"
        f"Response: {response[:800]}\n"
        'Reply ONLY with JSON: {"score": <0-10>, "reasoning": "<brief>"}'
    )
    max_attempts = 3

    for attempt in range(max_attempts):
        try:
            async with httpx.AsyncClient(timeout=60.0) as client:
                resp = await client.post(
                    f"{AZURE_API_BASE}/chat/completions",
                    json={
                        "model": AZURE_MODEL,
                        "messages": [{"role": "user", "content": prompt}],
                        "temperature": 0.1,
                        "max_completion_tokens": 150,
                    },
                    headers={
                        "Authorization": f"Bearer {AZURE_API_KEY}",
                        "Content-Type": "application/json",
                    },
                )
                resp.raise_for_status()
                text = resp.json()["choices"][0]["message"]["content"].strip()
                return _parse_judge_reply(text)
        except httpx.HTTPStatusError as e:
            status = e.response.status_code
            if status != 429:
                logger.warning(" PRM judge HTTP error: %s", e)
                return {"score": 0.5, "reasoning": f"http_error_{status}"}
            if attempt == max_attempts - 1:
                # No retry follows the last attempt, so sleeping here would
                # only delay the fallback result.
                logger.warning(
                    " PRM judge rate limited, giving up after %d attempts",
                    max_attempts,
                )
                break
            wait = 15 * (attempt + 1)
            logger.warning(" PRM judge rate limited, waiting %ds...", wait)
            await asyncio.sleep(wait)
        except Exception as e:
            # Also reached for transport failures (timeouts, connection
            # errors) and unexpected payload shapes; the "parse_error" tag is
            # kept for all of them so existing callers see the same values.
            logger.warning(" PRM judge error: %s", e)
            return {"score": 0.5, "reasoning": "parse_error"}

    return {"score": 0.5, "reasoning": "rate_limited"}
| 151 | |
| 152 | |
| 153 | # ------------------------------------------------------------------ # |
no test coverage detected