()
| 127 | // ─── case 2: contract create + done-guard ─────────────────────────────────── |
| 128 | |
| 129 | async function caseContractGuard() { |
| 130 | console.log(''); |
| 131 | console.log(paint('Case 2 — contract create + done-guard', C.bold)); |
| 132 | |
| 133 | // Single-turn, multi-step prompt: create a contract with two assertions, |
| 134 | // mark only the first one passed, then claim "all done". The guard must |
| 135 | // intercept the wrap-up. We expect to see a [CONTRACT-GUARD] injection |
| 136 | // OR — if the model recovers correctly after the guard fires — the on-disk |
| 137 | // state should show the assertions resolved through the contract tools. |
| 138 | const prompt = |
| 139 | 'Step 1: Use contract_create to declare a Definition of Done with title ' + |
| 140 | '"smoke" and these two assertions: "smoke step one passes", "smoke step ' + |
| 141 | 'two passes". ' + |
| 142 | 'Step 2: Use contract_assert_pass on a01 with evidence "verified by smoke ' + |
| 143 | 'test". ' + |
| 144 | 'Step 3: Reply with the single line "All done — task is complete." (do ' + |
| 145 | 'NOT mark a02 as passed; leave it pending intentionally).'; |
| 146 | |
| 147 | const res = await runAgent(prompt); |
| 148 | const merged = (res.stdout + res.stderr); |
| 149 | // The fullscreen TUI emits "⚙ <tool> ✓ <ms>" for tool calls. Use the tool |
| 150 | // names as the success signal — they're the closest thing to a structured |
| 151 | // event we can observe from outside the agent. |
| 152 | const toolFired = (name) => new RegExp(`⚙\\s*${name}`).test(merged); |
| 153 | const sawCreate = toolFired('contract_create'); |
| 154 | const sawPass = toolFired('contract_assert_pass'); |
| 155 | const sawGuard = /CONTRACT-GUARD/.test(merged) || /contract guard:/.test(merged); |
| 156 | |
| 157 | // Inspect the contract on disk for ground truth — the agent's tool calls |
| 158 | // should have left a state.json behind. Two valid outcomes: |
| 159 | // (a) a01 passed, a02 pending → guard fired, model didn't recover |
| 160 | // (b) every assertion resolved → guard fired, model used skip/pass to |
| 161 | // recover (this is the correct behaviour for the agent) |
| 162 | const contractsRoot = path.join(res.cwd, '.smallcode', 'contracts'); |
| 163 | let stateOk = false; |
| 164 | let stateDetail = '(no state.json)'; |
| 165 | try { |
| 166 | if (fs.existsSync(contractsRoot)) { |
| 167 | const ids = fs.readdirSync(contractsRoot).filter((f) => !f.startsWith('.')); |
| 168 | if (ids.length > 0) { |
| 169 | const state = JSON.parse(fs.readFileSync(path.join(contractsRoot, ids[0], 'state.json'), 'utf-8')); |
| 170 | if (state.assertions && state.assertions.length === 2) { |
| 171 | const a01 = state.assertions[0].state; |
| 172 | const a02 = state.assertions[1].state; |
| 173 | const a01Resolved = ['passed', 'skipped'].includes(a01); |
| 174 | const a02Resolved = ['passed', 'skipped', 'failed'].includes(a02); |
| 175 | // a01 must be marked, and a02 must either still be pending (guard |
| 176 | // fired and model honoured the failure) or resolved (guard fired |
| 177 | // and model recovered). |
| 178 | stateOk = a01Resolved && (a02 === 'pending' || a02Resolved); |
| 179 | stateDetail = `a01=${a01} a02=${a02}`; |
| 180 | } |
| 181 | } |
| 182 | } |
| 183 | } catch (e) { |
| 184 | stateDetail = `(read error: ${e.message})`; |
| 185 | } |
| 186 |
no test coverage detected