Function batchByTokenLimit

apps/sim/lib/tokenization/estimators.ts:134–173 · view source on GitHub ↗

(
  texts: string[],
  maxTokensPerBatch: number,
  modelName = 'text-embedding-3-small'
)

Source from the content-addressed store, hash-verified

132	* Returns array of batches where each batch's total tokens <= maxTokensPerBatch
133	*/
export function batchByTokenLimit(
  texts: string[],
  maxTokensPerBatch: number,
  modelName = 'text-embedding-3-small'
): string[][] {
  // Greedy, order-preserving packing: texts are visited in input order and
  // appended to the open batch until the next one would push the running
  // token total past maxTokensPerBatch.
  const result: string[][] = []
  let pending: string[] = []
  let pendingTokens = 0

  // Closes the open batch (when it holds anything) and starts a fresh one.
  const flushPending = (): void => {
    if (pending.length === 0) {
      return
    }
    result.push(pending)
    pending = []
    pendingTokens = 0
  }

  for (const entry of texts) {
    const entryTokens = getAccurateTokenCount(entry, modelName)

    if (entryTokens > maxTokensPerBatch) {
      // A single text over the limit can never share a batch: close whatever
      // is open, then emit the text alone after passing it through
      // truncateToTokenLimit with the same limit and model.
      flushPending()
      const shortened = truncateToTokenLimit(entry, maxTokensPerBatch, modelName)
      result.push([shortened])
      continue
    }

    // Only a non-empty batch can overflow; an empty one always accepts the
    // entry, because the entry alone is already known to be within the limit.
    const wouldOverflow =
      pending.length > 0 && pendingTokens + entryTokens > maxTokensPerBatch
    if (wouldOverflow) {
      flushPending()
    }

    pending.push(entry)
    pendingTokens += entryTokens
  }

  // Emit the trailing, partially filled batch.
  flushPending()

  return result
}
174
175	/**
176	* Clean up cached encodings (call when shutting down)

generateEmbeddingsFunction · 0.90

getAccurateTokenCountFunction · 0.85

truncateToTokenLimitFunction · 0.85

pushMethod · 0.45

no test coverage detected