MCPcopy
hub / github.com/langroid/langroid / add_documents

Method add_documents

langroid/vector_store/lancedb.py:196–258  ·  view source on GitHub ↗
(self, documents: Sequence[Document])

Source from the content-addressed store, hash-verified

194 self.delete_collection(collection_name)
195
196 def add_documents(self, documents: Sequence[Document]) -> None:
197 super().maybe_add_ids(documents)
198 colls = self.list_collections(empty=True)
199 if len(documents) == 0:
200 return
201 embedding_vecs = self.embedding_fn([doc.content for doc in documents])
202 coll_name = self.config.collection_name
203 if coll_name is None:
204 raise ValueError("No collection name set, cannot ingest docs")
205 # self._maybe_set_doc_class_schema(documents[0])
206 table_exists = False
207 if (
208 coll_name in colls
209 and self.client.open_table(coll_name).head(1).shape[0] > 0
210 ):
211 # collection exists and is not empty:
212 # if replace_collection is True, we'll overwrite the existing collection,
213 # else we'll append to it.
214 if self.config.replace_collection:
215 self.client.drop_table(coll_name)
216 else:
217 table_exists = True
218
219 ids = [str(d.id()) for d in documents]
220 # don't insert all at once, batch in chunks of b,
221 # else we get an API error
222 b = self.config.batch_size
223
224 def make_batches() -> Generator[List[Dict[str, Any]], None, None]:
225 for i in range(0, len(ids), b):
226 batch = [
227 dict(
228 id=ids[i + j],
229 vector=embedding_vecs[i + j],
230 **doc.model_dump(),
231 )
232 for j, doc in enumerate(documents[i : i + b])
233 ]
234 yield batch
235
236 try:
237 if table_exists:
238 tbl = self.client.open_table(coll_name)
239 tbl.add(make_batches())
240 else:
241 batch_gen = make_batches()
242 batch = next(batch_gen)
243 # use first batch to create table...
244 tbl = self.client.create_table(
245 coll_name,
246 data=batch,
247 mode="create",
248 )
249 # ... and add the rest
250 tbl.add(batch_gen)
251 except Exception as e:
252 logger.error(
253 f"""

Callers 10

vecdb · Function · 0.95
test_lance_metadata · Function · 0.95
ingest_docs · Method · 0.45
ingest · Method · 0.45
test_vector_stores · Function · 0.45

Calls 5

list_collections · Method · 0.95
maybe_add_ids · Method · 0.80
id · Method · 0.80
embedding_fn · Method · 0.45
add · Method · 0.45

Tested by 8

vecdb · Function · 0.76
test_lance_metadata · Function · 0.76
test_vector_stores · Function · 0.36