MCPcopy Index your code
hub / github.com/simstudioai/sim / batchByTokenLimit

Function batchByTokenLimit

apps/sim/lib/tokenization/estimators.ts:134–173  ·  view source on GitHub ↗
(
  texts: string[],
  maxTokensPerBatch: number,
  modelName = 'text-embedding-3-small'
)

Source from the content-addressed store, hash-verified

132 * Returns an array of batches in which each batch's total token count is <= maxTokensPerBatch
133 */
export function batchByTokenLimit(
  texts: string[],
  maxTokensPerBatch: number,
  modelName = 'text-embedding-3-small'
): string[][] {
  // Finished batches, in the same relative order as the input texts.
  const result: string[][] = []
  // The batch currently being filled and the running sum of its token counts.
  let pending: string[] = []
  let pendingTokens = 0

  // Moves the in-progress batch into `result` (when it holds anything) and
  // starts a fresh, empty one.
  const flush = (): void => {
    if (pending.length === 0) {
      return
    }
    result.push(pending)
    pending = []
    pendingTokens = 0
  }

  for (const text of texts) {
    const size = getAccurateTokenCount(text, modelName)

    if (size > maxTokensPerBatch) {
      // A single text larger than the budget can never share a batch: close
      // whatever is pending first so ordering is kept, then emit the text,
      // truncated to the budget, as a batch of its own.
      flush()
      result.push([truncateToTokenLimit(text, maxTokensPerBatch, modelName)])
      continue
    }

    // Adding this text would push a non-empty batch over the budget, so close
    // that batch and let the text open the next one.
    if (pending.length > 0 && pendingTokens + size > maxTokensPerBatch) {
      flush()
    }

    pending.push(text)
    pendingTokens += size
  }

  // Emit the trailing partial batch, if any.
  flush()

  return result
}
174
175/**
176 * Clean up cached encodings (call when shutting down)

Callers 1

generateEmbeddingsFunction · 0.90

Calls 3

getAccurateTokenCountFunction · 0.85
truncateToTokenLimitFunction · 0.85
pushMethod · 0.45

Tested by

no test coverage detected