(args: IndexArgs)
| 255 | * @returns {Promise<IndexingResult>} |
| 256 | */ |
| 257 | export async function index(args: IndexArgs): Promise<IndexingResult> { |
| 258 | const { docsSource, recordManager, vectorStore, options } = args |
| 259 | const { batchSize = 100, cleanup, sourceIdKey, cleanupBatchSize = 1000, forceUpdate = false, vectorStoreName } = options ?? {} |
| 260 | |
| 261 | if (cleanup === 'incremental' && !sourceIdKey) { |
| 262 | throw new Error("sourceIdKey is required when cleanup mode is incremental. Please provide through 'options.sourceIdKey'.") |
| 263 | } |
| 264 | |
| 265 | if (vectorStoreName) { |
| 266 | ;(recordManager as any).namespace = (recordManager as any).namespace + '_' + vectorStoreName |
| 267 | } |
| 268 | |
| 269 | const docs = _isBaseDocumentLoader(docsSource) ? await docsSource.load() : docsSource |
| 270 | |
| 271 | const sourceIdAssigner = _getSourceIdAssigner(sourceIdKey ?? null) |
| 272 | |
| 273 | const indexStartDt = await recordManager.getTime() |
| 274 | let numAdded = 0 |
| 275 | let addedDocs: Document[] = [] |
| 276 | let numDeleted = 0 |
| 277 | let numUpdated = 0 |
| 278 | let numSkipped = 0 |
| 279 | let totalKeys = 0 |
| 280 | |
| 281 | const batches = _batch<DocumentInterface>(batchSize ?? 100, docs) |
| 282 | |
| 283 | for (const batch of batches) { |
| 284 | const hashedDocs = _deduplicateInOrder(batch.map((doc) => _HashedDocument.fromDocument(doc))) |
| 285 | |
| 286 | const sourceIds = hashedDocs.map((doc) => sourceIdAssigner(doc)) |
| 287 | |
| 288 | if (cleanup === 'incremental') { |
| 289 | hashedDocs.forEach((_hashedDoc, index) => { |
| 290 | const source = sourceIds[index] |
| 291 | if (source === null) { |
| 292 | throw new Error('sourceIdKey must be provided when cleanup is incremental') |
| 293 | } |
| 294 | }) |
| 295 | } |
| 296 | |
| 297 | const batchExists = await recordManager.exists(hashedDocs.map((doc) => doc.uid)) |
| 298 | |
| 299 | const uids: string[] = [] |
| 300 | const docsToIndex: DocumentInterface[] = [] |
| 301 | const docsToUpdate: Array<{ uid: string; docId: string }> = [] |
| 302 | const seenDocs = new Set<string>() |
| 303 | hashedDocs.forEach((hashedDoc, i) => { |
| 304 | const docExists = batchExists[i] |
| 305 | if (docExists) { |
| 306 | if (forceUpdate) { |
| 307 | seenDocs.add(hashedDoc.uid) |
| 308 | } else { |
| 309 | docsToUpdate.push({ uid: hashedDoc.uid, docId: hashedDoc.metadata.docId as string }) |
| 310 | return |
| 311 | } |
| 312 | } |
| 313 | uids.push(hashedDoc.uid) |
| 314 | docsToIndex.push(hashedDoc.toDocument()) |
no test coverage detected