Function runSingleJudge

evals/buffbench/judge.ts:150–212 · view source on GitHub ↗

(
  input: JudgeCommitResultInput,
  judgePrompt: string,
  judgeAgentId: string,
)

Source from the content-addressed store, hash-verified

148	}
149
150	async function runSingleJudge(
151	input: JudgeCommitResultInput,
152	judgePrompt: string,
153	judgeAgentId: string,
154	): Promise<JudgingResult | null> {
155	const { client } = input
156
157	const judgeAgent = judgeAgents[judgeAgentId]
158	const agentOutput: string[] = []
159	try {
160	const judgeResult = await withTimeout(
161	client.run({
162	agent: judgeAgent.id,
163	prompt: judgePrompt,
164	agentDefinitions: Object.values(judgeAgents),
165	handleEvent: (event) => {
166	if (event.type === 'text') {
167	agentOutput.push(event.text)
168	} else if (event.type === 'tool_call') {
169	agentOutput.push(JSON.stringify(event, null, 2))
170	} else if (event.type === 'error') {
171	console.warn(`[Judge ${judgeAgentId}] Error event:`, event.message)
172	}
173	},
174	}),
175	20 * 60 * 1000,
176	'Judge agent timed out after 20 minutes',
177	)
178
179	if (judgeResult.output.type !== 'structuredOutput') {
180	console.error(
181	`Judge ${judgeAgentId} - not structured output`,
182	JSON.stringify(judgeResult.output, null, 2),
183	)
184	console.error(
185	'Judge agent output:',
186	JSON.stringify(judgeResult.output, null, 2),
187	'Judge agent output trace:',
188	agentOutput.join(''),
189	)
190	if (DEBUG_ERROR) {
191	fs.writeFileSync(
192	path.join(
193	__dirname,
194	'..',
195	`${input.commit.id}-${judgeAgentId}-agent-output-error.json`,
196	),
197	JSON.stringify(
198	{ output: judgeResult.output, trace: agentOutput },
199	null,
200	2,
201	),
202	)
203	}
204	return null
205	}
206
207	return judgeResult.output.value as JudgingResult

judgeCommitResult · Function · 0.85

withTimeout · Function · 0.90

run · Method · 0.45

no test coverage detected