| 118 | // --- Main runner --- |
| 119 | |
| 120 | export async function runSkillTest(options: { |
| 121 | prompt: string; |
| 122 | workingDirectory: string; |
| 123 | maxTurns?: number; |
| 124 | allowedTools?: string[]; |
| 125 | timeout?: number; |
| 126 | testName?: string; |
| 127 | runId?: string; |
| 128 | /** Model to use. When omitted, falls back to the EVALS_MODEL env var, then to claude-sonnet-4-6. */ |
| 129 | model?: string; |
| 130 | /** Extra env vars merged into the spawned claude -p process. Useful for |
| 131 | * per-test GSTACK_HOME overrides so the test doesn't have to spell out |
| 132 | * env setup in the prompt itself. */ |
| 133 | env?: Record<string, string>; |
| 134 | }): Promise<SkillTestResult> { |
| 135 | const { |
| 136 | prompt, |
| 137 | workingDirectory, |
| 138 | maxTurns = 15, |
| 139 | allowedTools = ['Bash', 'Read', 'Write'], |
| 140 | timeout = 120_000, |
| 141 | testName, |
| 142 | runId, |
| 143 | env: extraEnv, |
| 144 | } = options; |
| 145 | const model = options.model ?? process.env.EVALS_MODEL ?? 'claude-sonnet-4-6'; |
| 146 | |
| 147 | const startTime = Date.now(); |
| 148 | const startedAt = new Date().toISOString(); |
| 149 | |
| 150 | // Set up per-run log directory if runId is provided |
| 151 | let runDir: string | null = null; |
| 152 | const safeName = testName ? sanitizeTestName(testName) : null; |
| 153 | if (runId) { |
| 154 | try { |
| 155 | runDir = path.join(PROJECT_DIR, 'e2e-runs', runId); |
| 156 | fs.mkdirSync(runDir, { recursive: true }); |
| 157 | } catch { /* non-fatal */ } |
| 158 | } |
| 159 | |
| 160 | // Spawn claude -p with streaming NDJSON output. Prompt piped via stdin to |
| 161 | // avoid shell escaping issues. --verbose is required for stream-json mode. |
| 162 | const args = [ |
| 163 | '-p', |
| 164 | '--model', model, |
| 165 | '--output-format', 'stream-json', |
| 166 | '--verbose', |
| 167 | '--dangerously-skip-permissions', |
| 168 | '--max-turns', String(maxTurns), |
| 169 | '--allowed-tools', ...allowedTools, |
| 170 | ]; |
| 171 | // Hermetic children get zero MCP servers (no --mcp-config is passed). |
| 172 | // Gated on the same call-time check as the env scrub so EVALS_HERMETIC=0 |
| 173 | // restores operator MCP along with the operator env. |
| 174 | if (isHermeticEnabled()) args.push('--strict-mcp-config'); |
| 175 | |
| 176 | // Write prompt to a temp file OUTSIDE workingDirectory to avoid race conditions |
| 177 | // where afterAll cleanup deletes the dir before cat reads the file (especially |