()
| 729 | } |
| 730 | |
| 731 | async function main() { |
| 732 | const { |
| 733 | ollamaOnly, |
| 734 | noJudge, |
| 735 | captureOnly, |
| 736 | judgeLatest, |
| 737 | rejudge, |
| 738 | modelFilter, |
| 739 | } = parseArgs(process.argv.slice(2)) |
| 740 | |
| 741 | mkdirSync(LOG_DIR, { recursive: true }) |
| 742 | if (judgeLatest) { |
| 743 | await judgeLatestSession(rejudge) |
| 744 | return |
| 745 | } |
| 746 | |
| 747 | const effectiveNoJudge = noJudge || captureOnly |
| 748 | const sessionStartedAt = new Date().toISOString() |
| 749 | console.log( |
| 750 | `[models-eval] Run logs → ${relative(process.cwd(), LOG_DIR) || LOG_DIR} (gitignored)`, |
| 751 | ) |
| 752 | |
| 753 | if (!effectiveNoJudge && !process.env.ANTHROPIC_API_KEY) { |
| 754 | console.warn( |
| 755 | '[models-eval] ANTHROPIC_API_KEY is not set; judging will fail. Use --no-judge to skip, or set the key.', |
| 756 | ) |
| 757 | } |
| 758 | |
| 759 | const gold = loadGoldFile() |
| 760 | const query = gold.prompt |
| 761 | const goldReport = extractReportTextFromGold(gold.messages) |
| 762 | |
| 763 | const { tool, systemPrompt } = getCodeModeTools() |
| 764 | const tools = [tool, getSchemaInfoTool] |
| 765 | const systemPrompts = [DATABASE_DEMO_SYSTEM_PROMPT, systemPrompt] |
| 766 | |
| 767 | let models = ollamaOnly |
| 768 | ? EVAL_MODELS.filter((m) => m.model.startsWith('ollama:')) |
| 769 | : EVAL_MODELS |
| 770 | |
| 771 | if (modelFilter && modelFilter.size > 0) { |
| 772 | models = models.filter((m) => modelFilter.has(m.model)) |
| 773 | if (models.length === 0) { |
| 774 | console.error( |
| 775 | '[models-eval] No models matched --models filter. Check eval-config model ids.', |
| 776 | ) |
| 777 | process.exit(1) |
| 778 | } |
| 779 | } |
| 780 | |
| 781 | const rows: Array<EvalRow> = [] |
| 782 | |
| 783 | for (const entry of models) { |
| 784 | const { provider, modelId } = parseModelId(entry.model) |
| 785 | const modelCategory = getModelCategory(entry.model) |
| 786 | console.log(`\n→ Running: ${entry.name} (${entry.model})`) |
| 787 | const adapter = getTextAdapter(provider, modelId) |
| 788 | const t0 = Date.now() |
no test coverage detected