Shared planted-bug eval runner. Gives the agent concise bug-finding instructions (not the full QA workflow), then scores the report with an LLM outcome judge.
(fixture: string, groundTruthFile: string, label: string)
| 705 | * then scores the report with an LLM outcome judge. |
| 706 | */ |
| 707 | async function runPlantedBugEval(fixture: string, groundTruthFile: string, label: string) { |
| 708 | // Each test gets its own isolated working directory to prevent cross-contamination |
| 709 | // (agents reading previous tests' reports and hallucinating those bugs) |
| 710 | const testWorkDir = fs.mkdtempSync(path.join(os.tmpdir(), `skill-e2e-${label}-`)); |
| 711 | setupBrowseShims(testWorkDir); |
| 712 | const reportDir = path.join(testWorkDir, 'reports'); |
| 713 | fs.mkdirSync(path.join(reportDir, 'screenshots'), { recursive: true }); |
| 714 | const reportPath = path.join(reportDir, 'qa-report.md'); |
| 715 | |
| 716 | // Direct bug-finding with browse. Keep prompt concise — no reading long SKILL.md docs. |
| 717 | // "Write early, update later" pattern ensures report exists even if agent hits max turns. |
| 718 | const targetUrl = `${testServer.url}/${fixture}`; |
| 719 | const result = await runSkillTest({ |
| 720 | prompt: `Find bugs on this page: ${targetUrl} |
| 721 | |
| 722 | Browser binary: B="${browseBin}" |
| 723 | |
| 724 | PHASE 1 — Quick scan (5 commands max): |
| 725 | $B goto ${targetUrl} |
| 726 | $B console --errors |
| 727 | $B snapshot -i |
| 728 | $B snapshot -c |
| 729 | $B accessibility |
| 730 | |
| 731 | PHASE 2 — Write initial report to ${reportPath}: |
| 732 | Write every bug you found so far. Format each as: |
| 733 | - Category: functional / visual / accessibility / console |
| 734 | - Severity: high / medium / low |
| 735 | - Evidence: what you observed |
| 736 | |
| 737 | PHASE 3 — Interactive testing (targeted — max 15 commands): |
| 738 | - Test email: type "user@" (no domain) and blur — does it validate? |
| 739 | - Test quantity: clear the field entirely — check the total display |
| 740 | - Test credit card: type a 25-character string — check for overflow |
| 741 | - Submit the form with zip code empty — does it require zip? |
| 742 | - Submit a valid form and run $B console --errors |
| 743 | - After finding more bugs, UPDATE ${reportPath} with new findings |
| 744 | |
| 745 | PHASE 4 — Finalize report: |
| 746 | - UPDATE ${reportPath} with ALL bugs found across all phases |
| 747 | - Include console errors, form validation issues, visual overflow, missing attributes |
| 748 | |
| 749 | CRITICAL RULES: |
| 750 | - ONLY test the page at ${targetUrl} — do not navigate to other sites |
| 751 | - Write the report file in PHASE 2 before doing interactive testing |
| 752 | - The report MUST exist at ${reportPath} when you finish`, |
| 753 | workingDirectory: testWorkDir, |
| 754 | maxTurns: 50, |
| 755 | timeout: 300_000, |
| 756 | testName: `qa-${label}`, |
| 757 | runId, |
| 758 | }); |
| 759 | |
| 760 | logCost(`/qa ${label}`, result); |
| 761 | |
| 762 | // Phase 1: browse mechanics. Accept error_max_turns — agent may have written |
| 763 | // a partial report before running out of turns. What matters is detection rate. |
| 764 | if (result.browseErrors.length > 0) { |
no test coverage detected