MCPcopy
hub / github.com/langroid/langroid / split_simple

Method split_simple

langroid/parsing/parser.py:204–227  ·  view source on GitHub ↗
(self, docs: List[Document])

Source from the content-addressed store, hash-verified

202 c.metadata.is_chunk = True
203
def split_simple(self, docs: List[Document]) -> List[Document]:
    """Split each document into chunks on the first configured separator.

    Documents whose content is blank are skipped. For every other document
    the content is passed through ``remove_extra_whitespace`` and split on
    ``self.config.separators[0]``; blank pieces are discarded. Each remaining
    piece becomes a ``Document`` holding a copy of the parent's metadata with
    ``is_chunk=True``. ``add_window_ids`` is called once per parent document,
    on that document's chunks only.

    Args:
        docs: The documents to split.

    Returns:
        The chunks of all documents, in the order the documents were given.

    Raises:
        ValueError: If ``self.config.separators`` is empty.
    """
    if len(self.config.separators) == 0:
        raise ValueError("Must have at least one separator")

    all_chunks = []
    for doc in docs:
        if doc.content.strip() == "":
            continue

        cleaned = remove_extra_whitespace(doc.content)
        pieces = cleaned.split(self.config.separators[0])

        doc_chunks = []
        for piece in pieces:
            if piece.strip() == "":
                continue
            # Each chunk gets its own COPY of the parent's metadata, so all
            # chunks of one document share the same metadata values (and in
            # particular the same metadata.id, which matters later for
            # add_window_ids).
            chunk_metadata = doc.metadata.model_copy(update=dict(is_chunk=True))
            doc_chunks.append(Document(content=piece, metadata=chunk_metadata))

        self.add_window_ids(doc_chunks)
        all_chunks.extend(doc_chunks)

    return all_chunks
228
229 def split_para_sentence(self, docs: List[Document]) -> List[Document]:
230 chunks = docs

Callers 1

splitMethod · 0.95

Calls 5

add_window_idsMethod · 0.95
remove_extra_whitespaceFunction · 0.90
DocumentClass · 0.90
model_copyMethod · 0.80
splitMethod · 0.45

Tested by

no test coverage detected