MCPcopy
hub / github.com/FlowiseAI/Flowise / index

Function index

packages/components/src/indexing.ts:257–378  ·  view source on GitHub ↗
(args: IndexArgs)

Source from the content-addressed store, hash-verified

255 * @returns {Promise<IndexingResult>}
256 */
257export async function index(args: IndexArgs): Promise<IndexingResult> {
258 const { docsSource, recordManager, vectorStore, options } = args
259 const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, vectorStoreName } = options ?? {}
260
261 if (cleanup === 'incremental' && !sourceIdKey) {
262 throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.")
263 }
264
265 if (vectorStoreName) {
266 ;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName
267 }
268
269 const docs = _isBaseDocumentLoader(docsSource) ? await docsSource.load() : docsSource
270
271 const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null)
272
273 const indexStartDt = await recordManager.getTime()
274 let numAdded = 0
275 let addedDocs: Document[] = []
276 let numDeleted = 0
277 let numUpdated = 0
278 let numSkipped = 0
279 let totalKeys = 0
280
281 const batches = _batch<DocumentInterface>(batchSize ?? 100, docs)
282
283 for (const batch of batches) {
284 const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc)))
285
286 const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc))
287
288 if (cleanup === 'incremental') {
289 hashedDocs.forEach((_hashedDoc, index) => {
290 const source = sourceIds[index]
291 if (source === null) {
292 throw new Error('sourceIdKey must be provided when cleanup is incremental')
293 }
294 })
295 }
296
297 const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid))
298
299 const uids: string[] = []
300 const docsToIndex: DocumentInterface[] = []
301 const docsToUpdate: Array<{ uid: string; docId: string }> = []
302 const seenDocs = new Set<string>()
303 hashedDocs.forEach((hashedDoc, i) => {
304 const docExists = batchExists[i]
305 if (docExists) {
306 if (forceUpdate) {
307 seenDocs.add(hashedDoc.uid)
308 } else {
309 docsToUpdate.push({ uid: hashedDoc.uid, docId: hashedDoc.metadata.docId as string })
310 return
311 }
312 }
313 uids.push(hashedDoc.uid)
314 docsToIndex.push(hashedDoc.toDocument())

Callers 8

upsert Method · 0.90
upsert Method · 0.90
upsert Method · 0.90
upsert Method · 0.90
upsert Method · 0.90
upsert Method · 0.90
upsert Method · 0.90
upsert Method · 0.90

Calls 15

_isBaseDocumentLoader Function · 0.85
_getSourceIdAssigner Function · 0.85
_batch Function · 0.85
_deduplicateInOrder Function · 0.85
fromDocument Method · 0.80
load Method · 0.65
toDocument Method · 0.65
update Method · 0.65
getTime Method · 0.45
exists Method · 0.45
add Method · 0.45
addDocuments Method · 0.45

Tested by

no test coverage detected