| 147 | } |
| 148 | |
| 149 | export function combineVerdict(signals: LayerSignal[], opts: CombineVerdictOpts = {}): SecurityResult { |
| 150 | // Reduce to the strongest signal per layer. For transcript, we'll re-derive |
| 151 | // the label below; for scalar layers, max confidence is the layer signal. |
| 152 | const byLayerMax: Record<string, number> = {}; |
| 153 | const transcriptSignals: LayerSignal[] = []; |
| 154 | for (const s of signals) { |
| 155 | byLayerMax[s.layer] = Math.max(byLayerMax[s.layer] ?? 0, s.confidence); |
| 156 | if (s.layer === 'transcript_classifier') transcriptSignals.push(s); |
| 157 | } |
| 158 | const content = byLayerMax.testsavant_content ?? 0; |
| 159 | const deberta = byLayerMax.deberta_content ?? 0; |
| 160 | const transcriptMax = byLayerMax.transcript_classifier ?? 0; |
| 161 | const canary = byLayerMax.canary ?? 0; |
| 162 | |
| 163 | // Canary leak is deterministic. Never gated through ensemble. |
| 164 | if (canary >= 1.0) { |
| 165 | return { |
| 166 | verdict: 'block', |
| 167 | reason: 'canary_leaked', |
| 168 | signals, |
| 169 | confidence: 1.0, |
| 170 | }; |
| 171 | } |
| 172 | |
| 173 | // Transcript vote: pick the strongest signal (block > warn > none). |
| 174 | let transcriptVote: VoteStrength = 'none'; |
| 175 | for (const s of transcriptSignals) { |
| 176 | const v = classifyTranscript(s); |
| 177 | if (v === 'block') { transcriptVote = 'block'; break; } |
| 178 | if (v === 'warn' && transcriptVote !== 'block') transcriptVote = 'warn'; |
| 179 | } |
| 180 | |
| 181 | // Scalar-layer votes. |
| 182 | const contentBlockVote = content >= THRESHOLDS.WARN; |
| 183 | const debertaBlockVote = deberta >= THRESHOLDS.WARN; |
| 184 | |
| 185 | let blockVotes = 0; |
| 186 | if (contentBlockVote) blockVotes++; |
| 187 | if (debertaBlockVote) blockVotes++; |
| 188 | if (transcriptVote === 'block') blockVotes++; |
| 189 | |
| 190 | // Ensemble: 2-of-N block-votes trigger BLOCK. |
| 191 | if (blockVotes >= 2) { |
| 192 | // Report confidence as the min of the contributing signals (weakest link), |
| 193 | // matching v1 behavior for consistency with the review banner. |
| 194 | const contributing: number[] = []; |
| 195 | if (contentBlockVote) contributing.push(content); |
| 196 | if (debertaBlockVote) contributing.push(deberta); |
| 197 | if (transcriptVote === 'block') contributing.push(transcriptMax); |
| 198 | return { |
| 199 | verdict: 'block', |
| 200 | reason: 'ensemble_agreement', |
| 201 | signals, |
| 202 | confidence: Math.min(...contributing), |
| 203 | }; |
| 204 | } |
| 205 | |
| 206 | // Single-layer BLOCK. For tool-output, BLOCK directly; for user-input, |