| 121 | } |
| 122 | |
/**
 * Evaluates retrieval using only `SearchIndex` and reports the result under
 * the system label "BM25-only".
 *
 * Every observation is added to a fresh index, each labeled query is searched
 * for its top 20 results, and ranking metrics are computed per query against
 * `relevantObsIds`. Per-query values are then averaged with `avg`.
 *
 * `total_tokens_per_query` is an estimate: the mean `estimateTokens` size of a
 * sample of up to the first 50 observations (serialized with
 * `JSON.stringify`), multiplied by the mean number of results returned per
 * query. It is 0 when there are no queries or no observations.
 *
 * @param observations - Observations to index; also sampled for the token estimate.
 * @param queries - Labeled queries providing the query text, category and relevant observation ids.
 * @returns Averaged metrics plus the per-query breakdown.
 */
async function evalBm25Only(
  observations: CompressedObservation[],
  queries: LabeledQuery[],
): Promise<SystemMetrics> {
  const searchLimit = 20;
  const tokenSampleSize = 50;

  const index = new SearchIndex();
  for (const obs of observations) index.add(obs);

  const perQuery: QualityMetrics[] = [];

  for (const q of queries) {
    const relevant = new Set(q.relevantObsIds);

    // Only the search call is timed; metric computation is excluded.
    const start = performance.now();
    const results = index.search(q.query, searchLimit);
    const latency = performance.now() - start;

    const retrieved = results.map(r => r.obsId);
    perQuery.push({
      query: q.query,
      category: q.category,
      recall_at_5: recall(retrieved, relevant, 5),
      recall_at_10: recall(retrieved, relevant, 10),
      recall_at_20: recall(retrieved, relevant, 20),
      precision_at_5: precision(retrieved, relevant, 5),
      precision_at_10: precision(retrieved, relevant, 10),
      ndcg_at_10: ndcg(retrieved, relevant, 10),
      mrr: mrr(retrieved, relevant),
      relevant_count: relevant.size,
      retrieved_count: results.length,
      latency_ms: latency,
    });
  }

  // Mean number of results per query; guarded so an empty query set yields 0
  // instead of NaN (0 / 0).
  const avgRetrieved =
    perQuery.length > 0
      ? perQuery.reduce((sum, q) => sum + q.retrieved_count, 0) / perQuery.length
      : 0;

  // Divide by the actual sample size: with fewer than `tokenSampleSize`
  // observations, dividing by the cap would under-report the average.
  const tokenSample = observations.slice(0, tokenSampleSize);
  const avgObsTokens =
    tokenSample.length > 0
      ? tokenSample.reduce((s, o) => s + estimateTokens(JSON.stringify(o)), 0) / tokenSample.length
      : 0;

  return {
    system: "BM25-only",
    avg_recall_at_5: avg(perQuery.map(q => q.recall_at_5)),
    avg_recall_at_10: avg(perQuery.map(q => q.recall_at_10)),
    avg_recall_at_20: avg(perQuery.map(q => q.recall_at_20)),
    avg_precision_at_5: avg(perQuery.map(q => q.precision_at_5)),
    avg_precision_at_10: avg(perQuery.map(q => q.precision_at_10)),
    avg_ndcg_at_10: avg(perQuery.map(q => q.ndcg_at_10)),
    avg_mrr: avg(perQuery.map(q => q.mrr)),
    avg_latency_ms: avg(perQuery.map(q => q.latency_ms)),
    total_tokens_per_query: Math.round(avgObsTokens * avgRetrieved),
    per_query: perQuery,
  };
}
| 172 | |
| 173 | async function evalDualStream( |
| 174 | observations: CompressedObservation[], |