Function main

packages/ai-code-mode/models-eval/run-eval.ts:731–958 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

729	}
730
731	async function main() {
732	const {
733	ollamaOnly,
734	noJudge,
735	captureOnly,
736	judgeLatest,
737	rejudge,
738	modelFilter,
739	} = parseArgs(process.argv.slice(2))
740
741	mkdirSync(LOG_DIR, { recursive: true })
742	if (judgeLatest) {
743	await judgeLatestSession(rejudge)
744	return
745	}
746
747	const effectiveNoJudge = noJudge || captureOnly
748	const sessionStartedAt = new Date().toISOString()
749	console.log(
750	`[models-eval] Run logs → ${relative(process.cwd(), LOG_DIR) || LOG_DIR} (gitignored)`,
751	)
752
753	if (!effectiveNoJudge && !process.env.ANTHROPIC_API_KEY) {
754	console.warn(
755	'[models-eval] ANTHROPIC_API_KEY is not set; judging will fail. Use --no-judge to skip, or set the key.',
756	)
757	}
758
759	const gold = loadGoldFile()
760	const query = gold.prompt
761	const goldReport = extractReportTextFromGold(gold.messages)
762
763	const { tool, systemPrompt } = getCodeModeTools()
764	const tools = [tool, getSchemaInfoTool]
765	const systemPrompts = [DATABASE_DEMO_SYSTEM_PROMPT, systemPrompt]
766
767	let models = ollamaOnly
768	? EVAL_MODELS.filter((m) => m.model.startsWith('ollama:'))
769	: EVAL_MODELS
770
771	if (modelFilter && modelFilter.size > 0) {
772	models = models.filter((m) => modelFilter.has(m.model))
773	if (models.length === 0) {
774	console.error(
775	'[models-eval] No models matched --models filter. Check eval-config model ids.',
776	)
777	process.exit(1)
778	}
779	}
780
781	const rows: Array<EvalRow> = []
782
783	for (const entry of models) {
784	const { provider, modelId } = parseModelId(entry.model)
785	const modelCategory = getModelCategory(entry.model)
786	console.log(`\n→ Running: ${entry.name} (${entry.model})`)
787	const adapter = getTextAdapter(provider, modelId)
788	const t0 = Date.now()

Callers 1

run-eval.ts · File · 0.70

Calls 15

process · Method · 0.95

getMessages · Method · 0.95

parseModelId · Function · 0.90

getModelCategory · Function · 0.90

normalizeToUIMessage · Function · 0.90

chat · Function · 0.90

maxIterations · Function · 0.90

computeMetrics · Function · 0.90

judgeReports · Function · 0.90

formatTypescriptEvidence · Function · 0.90

judgeLatestSession · Function · 0.85

loadGoldFile · Function · 0.85

Tested by

no test coverage detected