MCPcopy Index your code
hub / github.com/TanStack/ai / judgeLatestSession

Function judgeLatestSession

packages/ai-code-mode/models-eval/run-eval.ts:509–576  ·  view source on GitHub ↗
(rejudge: boolean)

Source from the content-addressed store, hash-verified

507}
508
509async function judgeLatestSession(rejudge: boolean): Promise<void> {
510 const files = listRunLogFiles()
511 const payloads = files
512 .map((path) => ({ path, payload: readRunLog(path) }))
513 .filter(
514 (entry): entry is { path: string; payload: RunLogPayload } =>
515 !!entry.payload,
516 )
517
518 if (payloads.length === 0) {
519 console.error('[models-eval] No run logs found to judge.')
520 process.exit(1)
521 }
522
523 const latestSession = payloads
524 .map((entry) => entry.payload.sessionStartedAt)
525 .sort()
526 .at(-1)
527 if (!latestSession) {
528 console.error('[models-eval] Could not determine latest session.')
529 process.exit(1)
530 }
531
532 const sessionPayloads = payloads.filter(
533 (entry) => entry.payload.sessionStartedAt === latestSession,
534 )
535 const rows: Array<EvalRow> = []
536
537 for (const entry of sessionPayloads) {
538 const payload = entry.payload
539 if (payload.error) {
540 rows.push(toEvalRow(payload))
541 continue
542 }
543 if (payload.judge && !rejudge) {
544 rows.push(toEvalRow(payload))
545 continue
546 }
547
548 const computed = computeMetrics(payload.messages)
549 const judged = await judgeReports({
550 query: payload.query,
551 goldReport: payload.goldReport,
552 candidateReport: payload.candidateReport,
553 typescriptEvidence: formatTypescriptEvidence(computed.typeScriptAttempts),
554 })
555 payload.judge = judged
556 payload.noJudge = false
557
558 writeFileSync(entry.path, `${JSON.stringify(payload, null, 2)}\n`, 'utf8')
559 rows.push(toEvalRow(payload))
560 console.log(` ✓ judged from log: ${payload.entry.model}`)
561 }
562
563 finalizeRows(
564 rows,
565 payloads.map((p) => p.payload),
566 )

Callers 1

main · Function · 0.85

Calls 13

computeMetrics · Function · 0.90
judgeReports · Function · 0.90
formatTypescriptEvidence · Function · 0.90
listRunLogFiles · Function · 0.85
readRunLog · Function · 0.85
toEvalRow · Function · 0.85
finalizeRows · Function · 0.85
writeResultsSummary · Function · 0.85
writeCanonicalResults · Function · 0.85
printTable · Function · 0.85
log · Method · 0.80
error · Method · 0.65

Tested by

no test coverage detected