( input: JudgeCommitResultInput, )
| 212 | } |
| 213 | |
| 214 | export async function judgeCommitResult( |
| 215 | input: JudgeCommitResultInput, |
| 216 | ): Promise<JudgingResult> { |
| 217 | const { commit, contextFiles, agentDiff, error, finalCheckOutputs } = input |
| 218 | |
| 219 | const { prompt, fileDiffs } = commit |
| 220 | |
| 221 | const groundTruthDiffs = fileDiffs |
| 222 | .map(({ path, diff }) => { |
| 223 | return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\`` |
| 224 | }) |
| 225 | .join('\n\n') |
| 226 | |
| 227 | const contextFilesContent = Object.entries(contextFiles) |
| 228 | .map(([filePath, content]) => { |
| 229 | return `### ${filePath}\n\`\`\`\n${content}\n\`\`\`` |
| 230 | }) |
| 231 | .join('\n\n') |
| 232 | |
| 233 | const judgePrompt = `## User Prompt (What the agent was asked to do) |
| 234 | ${prompt} |
| 235 | |
| 236 | ## Context Files (from parent commit) |
| 237 | ${contextFilesContent || '(No context files)'} |
| 238 | |
| 239 | ## Ground Truth Changes (One valid implementation) |
| 240 | ${groundTruthDiffs} |
| 241 | |
| 242 | ## Agent's Changes (What the agent actually did) |
| 243 | \`\`\`diff |
| 244 | ${agentDiff || '(No changes made)'} |
| 245 | \`\`\` |
| 246 | ${error ? `\n## Error Encountered\n${error}` : ''} |
| 247 | ${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''}` |
| 248 | |
| 249 | // Run 2 judges in parallel |
| 250 | const judgePromises = [ |
| 251 | runSingleJudge(input, judgePrompt, 'judge-gpt'), |
| 252 | runSingleJudge(input, judgePrompt, 'judge-gemini'), |
| 253 | ] |
| 254 | |
| 255 | const judgeResults = await Promise.all(judgePromises) |
| 256 | const validResults = judgeResults.filter( |
| 257 | (result): result is JudgingResult => result !== null, |
| 258 | ) |
| 259 | |
| 260 | if (validResults.length === 0) { |
| 261 | console.error('All judges failed to provide results') |
| 262 | return { |
| 263 | analysis: 'Error running judge agent - all judges failed', |
| 264 | strengths: [], |
| 265 | weaknesses: ['All judges failed to provide structured output'], |
| 266 | completionScore: 0, |
| 267 | codeQualityScore: 0, |
| 268 | overallScore: 0, |
| 269 | } |
| 270 | } |
| 271 |
no test coverage detected