()
| 221 | } |
| 222 | |
| 223 | async function main() { |
| 224 | // Load the eval file |
| 225 | const evalFilePath = path.join( |
| 226 | __dirname, |
| 227 | '..', |
| 228 | 'git-evals', |
| 229 | 'eval-codebuff2.json', |
| 230 | ) |
| 231 | const evalData: EvalData = JSON.parse(fs.readFileSync(evalFilePath, 'utf-8')) |
| 232 | |
| 233 | const { repoUrl, initCommand, evalCommits } = evalData |
| 234 | |
| 235 | const client = new CodebuffClient({ |
| 236 | apiKey: process.env[API_KEY_ENV_VAR] || getUserCredentials()?.authToken, |
| 237 | }) |
| 238 | |
| 239 | const agentsPath = path.join(__dirname, '../../.agents') |
| 240 | const localAgentDefinitions = Object.values( |
| 241 | await loadLocalAgents({ |
| 242 | agentsPath, |
| 243 | }), |
| 244 | ) |
| 245 | |
| 246 | const allResults = [] as Array<{ |
| 247 | sha: string |
| 248 | spec: string |
| 249 | agentOutput: string |
| 250 | judgingResults: { |
| 251 | reasoning: string |
| 252 | pros: string |
| 253 | cons: string |
| 254 | overallScore: number |
| 255 | } |
| 256 | plannerLatencyMs: number |
| 257 | }> |
| 258 | |
| 259 | // Track statistics |
| 260 | const stats = { |
| 261 | total: evalCommits.length, |
| 262 | completed: 0, |
| 263 | failed: 0, |
| 264 | scores: [] as number[], |
| 265 | plannerLatencies: [] as number[], |
| 266 | } |
| 267 | |
| 268 | // Loop through each eval task |
| 269 | for (const evalCommit of evalCommits) { |
| 270 | const { sha, parentSha, spec, fileStates } = evalCommit |
| 271 | |
| 272 | console.log(`\n=== Running eval for commit ${sha} ===`) |
| 273 | console.log(`Spec: ${spec.substring(0, 100)}...\n`) |
| 274 | |
| 275 | try { |
| 276 | const result = await evalPlannerAgent({ |
| 277 | client, |
| 278 | agentId: 'implementation-planner', |
| 279 | agentDefinitions: localAgentDefinitions, |
| 280 | spec, |
no test coverage detected