( inputs: string[], ubatchSize: number, charsPerToken = DEFAULT_CHARS_PER_TOKEN )
| 228 | } |
| 229 | |
/**
 * Groups embedding inputs into consecutive batches whose estimated token
 * total stays within a safety-margined share of the micro-batch size.
 *
 * Input order is preserved. Each returned batch records the index of its
 * first element within `inputs` as `offset`. An input whose estimated token
 * count exceeds the per-batch limit is truncated to that limit before being
 * placed.
 *
 * @param inputs - Texts to embed, in the order results are expected.
 * @param ubatchSize - Micro-batch size; scaled by `UBATCH_SAFETY_MARGIN`
 *   to obtain the per-batch token limit.
 * @param charsPerToken - Characters-per-token ratio forwarded to the token
 *   estimation and truncation helpers.
 * @returns Non-empty batches covering every input exactly once; an empty
 *   array when `inputs` is empty.
 * @throws Error when `ubatchSize` is NaN or smaller than
 *   `Math.ceil(1 / UBATCH_SAFETY_MARGIN)`.
 */
export function buildEmbedBatches(
  inputs: string[],
  ubatchSize: number,
  charsPerToken = DEFAULT_CHARS_PER_TOKEN
): EmbedBatch[] {
  // NaN compares false against everything, so it would pass the minimum
  // check below, turn `safeLimit` into NaN, and disable both truncation and
  // batch splitting. Reject it explicitly.
  if (Number.isNaN(ubatchSize)) {
    throw new Error(`ubatch_size (${ubatchSize}) must be a valid number`)
  }

  const minUbatchSize = Math.ceil(1 / UBATCH_SAFETY_MARGIN)
  if (ubatchSize < minUbatchSize) {
    throw new Error(
      `ubatch_size (${ubatchSize}) is too small. Minimum required: ${minUbatchSize}`
    )
  }

  // Token limit per batch after applying the safety margin.
  const safeLimit = Math.floor(ubatchSize * UBATCH_SAFETY_MARGIN)

  const batches: EmbedBatch[] = []
  let current: string[] = []
  let currentTokens = 0
  let offset = 0

  // Flushes the in-progress batch. The length guard means an empty batch is
  // never appended, so no post-hoc emptiness check is required.
  const push = () => {
    if (current.length) {
      batches.push({ batch: current, offset })
      offset += current.length
      current = []
      currentTokens = 0
    }
  }

  for (const raw of inputs) {
    // Estimate once and reuse the result unless the text has to be truncated.
    // NOTE(review): assumes estimateTokensFromText is pure — confirm.
    const rawTokens = estimateTokensFromText(raw, charsPerToken)

    let text = raw
    let estTokens = rawTokens
    if (rawTokens > safeLimit) {
      text = truncateToTokenBudget(raw, safeLimit, charsPerToken)
      estTokens = estimateTokensFromText(text, charsPerToken)
    }

    // Start a new batch when adding this text would exceed the limit. The
    // length guard lets a single text occupy a batch on its own regardless
    // of its estimate.
    if (currentTokens + estTokens > safeLimit && current.length) {
      push()
    }

    current.push(text)
    currentTokens += estTokens
  }

  // Flush whatever remains after the final input.
  push()

  return batches
}
| 281 | |
| 282 | export function mergeEmbedResponses( |
| 283 | model: string, |
no test coverage detected