( observations: CompressedObservation[], queries: LabeledQuery[], )
| 171 | } |
| 172 | |
| 173 | async function evalDualStream( |
| 174 | observations: CompressedObservation[], |
| 175 | queries: LabeledQuery[], |
| 176 | ): Promise<SystemMetrics> { |
| 177 | const kv = mockKV(); |
| 178 | const bm25 = new SearchIndex(); |
| 179 | const vector = new VectorIndex(); |
| 180 | const dims = 384; |
| 181 | |
| 182 | for (const obs of observations) { |
| 183 | bm25.add(obs); |
| 184 | const text = [obs.title, obs.narrative, ...obs.concepts, ...obs.facts].join(" "); |
| 185 | vector.add(obs.id, obs.sessionId, deterministicEmbedding(text, dims)); |
| 186 | await kv.set(`mem:obs:${obs.sessionId}`, obs.id, obs); |
| 187 | } |
| 188 | |
| 189 | const mockEmbed: any = { |
| 190 | name: "deterministic", |
| 191 | dimensions: dims, |
| 192 | embed: async (text: string) => deterministicEmbedding(text, dims), |
| 193 | embedBatch: async (texts: string[]) => texts.map(t => deterministicEmbedding(t, dims)), |
| 194 | }; |
| 195 | |
| 196 | const hybrid = new HybridSearch(bm25, vector, mockEmbed, kv as never, 0.4, 0.6, 0); |
| 197 | const perQuery: QualityMetrics[] = []; |
| 198 | |
| 199 | for (const q of queries) { |
| 200 | const relevant = new Set(q.relevantObsIds); |
| 201 | const start = performance.now(); |
| 202 | const results = await hybrid.search(q.query, 20); |
| 203 | const latency = performance.now() - start; |
| 204 | |
| 205 | const retrieved = results.map(r => r.observation.id); |
| 206 | perQuery.push({ |
| 207 | query: q.query, |
| 208 | category: q.category, |
| 209 | recall_at_5: recall(retrieved, relevant, 5), |
| 210 | recall_at_10: recall(retrieved, relevant, 10), |
| 211 | recall_at_20: recall(retrieved, relevant, 20), |
| 212 | precision_at_5: precision(retrieved, relevant, 5), |
| 213 | precision_at_10: precision(retrieved, relevant, 10), |
| 214 | ndcg_at_10: ndcg(retrieved, relevant, 10), |
| 215 | mrr: mrr(retrieved, relevant), |
| 216 | relevant_count: relevant.size, |
| 217 | retrieved_count: results.length, |
| 218 | latency_ms: latency, |
| 219 | }); |
| 220 | } |
| 221 | |
| 222 | const avgResultTokens = perQuery.reduce((sum, q) => { |
| 223 | return sum + q.retrieved_count; |
| 224 | }, 0) / perQuery.length; |
| 225 | const avgObsTokens2 = observations.slice(0, 50).reduce((s, o) => s + estimateTokens(JSON.stringify(o)), 0) / 50; |
| 226 | |
| 227 | return { |
| 228 | system: "Dual-stream (BM25+Vector)", |
| 229 | avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)), |
| 230 | avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)), |
no test coverage detected