Evaluate runs evaluations with a specified run name. ttyOut is used for progress bar rendering (should be the console/TTY). out is used for results and status messages (can be tee'd to a log file).
(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName string, runConfig *config.RuntimeConfig, cfg Config)
| 64 | // ttyOut is used for progress bar rendering (should be the console/TTY). |
| 65 | // out is used for results and status messages (can be tee'd to a log file). |
| 66 | func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName string, runConfig *config.RuntimeConfig, cfg Config) (*EvalRun, error) { |
| 67 | agentSource, err := config.Resolve(cfg.AgentFilename, nil) |
| 68 | if err != nil { |
| 69 | return nil, fmt.Errorf("resolving agent: %w", err) |
| 70 | } |
| 71 | |
| 72 | // Create judge model provider for relevance checking |
| 73 | judgeModel, err := createJudgeModel(ctx, cfg.JudgeModel, runConfig) |
| 74 | if err != nil { |
| 75 | return nil, err |
| 76 | } |
| 77 | |
| 78 | runner := newRunner(agentSource, runConfig, judgeModel, cfg) |
| 79 | |
| 80 | fmt.Fprintf(out, "Evaluation run: %s\n", runName) |
| 81 | |
| 82 | startTime := time.Now() |
| 83 | results, err := runner.Run(ctx, ttyOut, out, isTTY) |
| 84 | duration := time.Since(startTime) |
| 85 | |
| 86 | summary := computeSummary(results) |
| 87 | printSummary(out, summary, duration) |
| 88 | |
| 89 | run := &EvalRun{ |
| 90 | Name: runName, |
| 91 | Timestamp: startTime, |
| 92 | Duration: duration, |
| 93 | Config: cfg, |
| 94 | Results: results, |
| 95 | Summary: summary, |
| 96 | } |
| 97 | |
| 98 | if err != nil { |
| 99 | return run, fmt.Errorf("running evaluations: %w", err) |
| 100 | } |
| 101 | |
| 102 | return run, nil |
| 103 | } |
| 104 | |
| 105 | // workItem represents a single evaluation to be processed. |
| 106 | type workItem struct { |
no test coverage detected