MCP · copy · Index your code
hub / github.com/CodebuffAI/codebuff / judgeCommitResult

Function judgeCommitResult

evals/buffbench/judge.ts:214–303  ·  view source on GitHub ↗
(
  input: JudgeCommitResultInput,
)

Source from the content-addressed store, hash-verified

212}
213
214export async function judgeCommitResult(
215 input: JudgeCommitResultInput,
216): Promise<JudgingResult> {
217 const { commit, contextFiles, agentDiff, error, finalCheckOutputs } = input
218
219 const { prompt, fileDiffs } = commit
220
221 const groundTruthDiffs = fileDiffs
222 .map(({ path, diff }) => {
223 return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\``
224 })
225 .join('\n\n')
226
227 const contextFilesContent = Object.entries(contextFiles)
228 .map(([filePath, content]) => {
229 return `### ${filePath}\n\`\`\`\n${content}\n\`\`\``
230 })
231 .join('\n\n')
232
233 const judgePrompt = `## User Prompt (What the agent was asked to do)
234${prompt}
235
236## Context Files (from parent commit)
237${contextFilesContent || '(No context files)'}
238
239## Ground Truth Changes (One valid implementation)
240${groundTruthDiffs}
241
242## Agent's Changes (What the agent actually did)
243\`\`\`diff
244${agentDiff || '(No changes made)'}
245\`\`\`
246${error ? `\n## Error Encountered\n${error}` : ''}
247${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''}`
248
249 // Run 2 judges in parallel
250 const judgePromises = [
251 runSingleJudge(input, judgePrompt, 'judge-gpt'),
252 runSingleJudge(input, judgePrompt, 'judge-gemini'),
253 ]
254
255 const judgeResults = await Promise.all(judgePromises)
256 const validResults = judgeResults.filter(
257 (result): result is JudgingResult => result !== null,
258 )
259
260 if (validResults.length === 0) {
261 console.error('All judges failed to provide results')
262 return {
263 analysis: 'Error running judge agent - all judges failed',
264 strengths: [],
265 weaknesses: ['All judges failed to provide structured output'],
266 completionScore: 0,
267 codeQualityScore: 0,
268 overallScore: 0,
269 }
270 }
271

Callers 1

runTask (Function) · 0.90

Calls 1

runSingleJudge (Function) · 0.85

Tested by

no test coverage detected