hub / github.com/CodebuffAI/codebuff / judgeCommitResult

Function judgeCommitResult

evals/buffbench/judge.ts:214–303 · view source on GitHub ↗

(
  input: JudgeCommitResultInput,
)

Source from the content-addressed store, hash-verified

212	}
213
214	export async function judgeCommitResult(
215	input: JudgeCommitResultInput,
216	): Promise<JudgingResult> {
217	const { commit, contextFiles, agentDiff, error, finalCheckOutputs } = input
218
219	const { prompt, fileDiffs } = commit
220
221	const groundTruthDiffs = fileDiffs
222	.map(({ path, diff }) => {
223	return `### ${path}\n\`\`\`diff\n${diff}\n\`\`\``
224	})
225	.join('\n\n')
226
227	const contextFilesContent = Object.entries(contextFiles)
228	.map(([filePath, content]) => {
229	return `### ${filePath}\n\`\`\`\n${content}\n\`\`\``
230	})
231	.join('\n\n')
232
233	const judgePrompt = `## User Prompt (What the agent was asked to do)
234	${prompt}
235
236	## Context Files (from parent commit)
237	${contextFilesContent \|\| '(No context files)'}
238
239	## Ground Truth Changes (One valid implementation)
240	${groundTruthDiffs}
241
242	## Agent's Changes (What the agent actually did)
243	\`\`\`diff
244	${agentDiff \|\| '(No changes made)'}
245	\`\`\`
246	${error ? `\n## Error Encountered\n${error}` : ''}
247	${finalCheckOutputs ? `\n## Final Check Command Outputs\n${finalCheckOutputs}` : ''}`
248
249	// Run 2 judges in parallel
250	const judgePromises = [
251	runSingleJudge(input, judgePrompt, 'judge-gpt'),
252	runSingleJudge(input, judgePrompt, 'judge-gemini'),
253	]
254
255	const judgeResults = await Promise.all(judgePromises)
256	const validResults = judgeResults.filter(
257	(result): result is JudgingResult => result !== null,
258	)
259
260	if (validResults.length === 0) {
261	console.error('All judges failed to provide results')
262	return {
263	analysis: 'Error running judge agent - all judges failed',
264	strengths: [],
265	weaknesses: ['All judges failed to provide structured output'],
266	completionScore: 0,
267	codeQualityScore: 0,
268	overallScore: 0,
269	}
270	}
271

Callers 1

runTask Function · 0.90

Calls 1

runSingleJudge Function · 0.85

Tested by

no test coverage detected