Method split_simple

langroid/parsing/parser.py:204–227 · view source on GitHub ↗

(self, docs: List[Document])

Source from the content-addressed store, hash-verified

202	c.metadata.is_chunk = True
203
def split_simple(self, docs: List[Document]) -> List[Document]:
    """Split each document into chunks on the first configured separator.

    Documents whose content is empty or whitespace-only are skipped. For
    every remaining document the content is passed through
    ``remove_extra_whitespace`` and then split on
    ``self.config.separators[0]``; pieces that are blank after stripping
    are dropped.

    Args:
        docs: Documents to split.

    Returns:
        A flat list of chunk documents, in input order.

    Raises:
        ValueError: If ``self.config.separators`` is empty.
    """
    separators = self.config.separators
    if len(separators) == 0:
        raise ValueError("Must have at least one separator")
    primary_separator = separators[0]

    all_chunks: List[Document] = []
    for doc in docs:
        if not doc.content.strip():
            continue

        pieces = remove_extra_whitespace(doc.content).split(primary_separator)

        # Every chunk gets a COPY of the parent's metadata (flagged as a
        # chunk), so all chunks of one document share the same metadata --
        # in particular the same metadata.id, which add_window_ids relies on.
        doc_chunks = []
        for piece in pieces:
            if not piece.strip():
                continue
            chunk_metadata = doc.metadata.model_copy(update={"is_chunk": True})
            doc_chunks.append(Document(content=piece, metadata=chunk_metadata))

        self.add_window_ids(doc_chunks)
        all_chunks.extend(doc_chunks)
    return all_chunks
228
229	def split_para_sentence(self, docs: List[Document]) -> List[Document]:
230	chunks = docs

splitMethod · 0.95

add_window_idsMethod · 0.95

remove_extra_whitespaceFunction · 0.90

DocumentClass · 0.90

model_copyMethod · 0.80

splitMethod · 0.45

no test coverage detected