MCPcopy
hub / github.com/docker/docker-agent / Evaluate

Function Evaluate

pkg/evaluation/eval.go:66–103  ·  view source on GitHub ↗

Evaluate runs evaluations with a specified run name. ttyOut is used for progress bar rendering (should be the console/TTY). out is used for results and status messages (can be tee'd to a log file).

(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName string, runConfig *config.RuntimeConfig, cfg Config)

Source from the content-addressed store, hash-verified

64// ttyOut is used for progress bar rendering (should be the console/TTY).
65// out is used for results and status messages (can be tee'd to a log file).
66func Evaluate(ctx context.Context, ttyOut, out io.Writer, isTTY bool, runName string, runConfig *config.RuntimeConfig, cfg Config) (*EvalRun, error) {
67 agentSource, err := config.Resolve(cfg.AgentFilename, nil)
68 if err != nil {
69 return nil, fmt.Errorf("resolving agent: %w", err)
70 }
71
72 // Create judge model provider for relevance checking
73 judgeModel, err := createJudgeModel(ctx, cfg.JudgeModel, runConfig)
74 if err != nil {
75 return nil, err
76 }
77
78 runner := newRunner(agentSource, runConfig, judgeModel, cfg)
79
80 fmt.Fprintf(out, "Evaluation run: %s\n", runName)
81
82 startTime := time.Now()
83 results, err := runner.Run(ctx, ttyOut, out, isTTY)
84 duration := time.Since(startTime)
85
86 summary := computeSummary(results)
87 printSummary(out, summary, duration)
88
89 run := &EvalRun{
90 Name: runName,
91 Timestamp: startTime,
92 Duration: duration,
93 Config: cfg,
94 Results: results,
95 Summary: summary,
96 }
97
98 if err != nil {
99 return run, fmt.Errorf("running evaluations: %w", err)
100 }
101
102 return run, nil
103}
104
105// workItem represents a single evaluation to be processed.
106type workItem struct {

Callers 1

runEvalCommand (Method) · 0.92

Calls 7

Resolve (Function) · 0.92
createJudgeModel (Function) · 0.85
newRunner (Function) · 0.85
computeSummary (Function) · 0.85
printSummary (Function) · 0.85
Now (Method) · 0.80
Run (Method) · 0.65

Tested by

no test coverage detected