(rejudge: boolean)
| 507 | } |
| 508 | |
| 509 | async function judgeLatestSession(rejudge: boolean): Promise<void> { |
| 510 | const files = listRunLogFiles() |
| 511 | const payloads = files |
| 512 | .map((path) => ({ path, payload: readRunLog(path) })) |
| 513 | .filter( |
| 514 | (entry): entry is { path: string; payload: RunLogPayload } => |
| 515 | !!entry.payload, |
| 516 | ) |
| 517 | |
| 518 | if (payloads.length === 0) { |
| 519 | console.error('[models-eval] No run logs found to judge.') |
| 520 | process.exit(1) |
| 521 | } |
| 522 | |
| 523 | const latestSession = payloads |
| 524 | .map((entry) => entry.payload.sessionStartedAt) |
| 525 | .sort() |
| 526 | .at(-1) |
| 527 | if (!latestSession) { |
| 528 | console.error('[models-eval] Could not determine latest session.') |
| 529 | process.exit(1) |
| 530 | } |
| 531 | |
| 532 | const sessionPayloads = payloads.filter( |
| 533 | (entry) => entry.payload.sessionStartedAt === latestSession, |
| 534 | ) |
| 535 | const rows: Array<EvalRow> = [] |
| 536 | |
| 537 | for (const entry of sessionPayloads) { |
| 538 | const payload = entry.payload |
| 539 | if (payload.error) { |
| 540 | rows.push(toEvalRow(payload)) |
| 541 | continue |
| 542 | } |
| 543 | if (payload.judge && !rejudge) { |
| 544 | rows.push(toEvalRow(payload)) |
| 545 | continue |
| 546 | } |
| 547 | |
| 548 | const computed = computeMetrics(payload.messages) |
| 549 | const judged = await judgeReports({ |
| 550 | query: payload.query, |
| 551 | goldReport: payload.goldReport, |
| 552 | candidateReport: payload.candidateReport, |
| 553 | typescriptEvidence: formatTypescriptEvidence(computed.typeScriptAttempts), |
| 554 | }) |
| 555 | payload.judge = judged |
| 556 | payload.noJudge = false |
| 557 | |
| 558 | writeFileSync(entry.path, `${JSON.stringify(payload, null, 2)}\n`, 'utf8') |
| 559 | rows.push(toEvalRow(payload)) |
| 560 | console.log(` ✓ judged from log: ${payload.entry.model}`) |
| 561 | } |
| 562 | |
| 563 | finalizeRows( |
| 564 | rows, |
| 565 | payloads.map((p) => p.payload), |
| 566 | ) |
no test coverage detected