MCPcopy
hub / github.com/langroid/langroid / split_chunk_tokens

Method split_chunk_tokens

langroid/parsing/parser.py:282–313  ·  view source on GitHub ↗
(self, docs: List[Document])

Source from the content-addressed store, hash-verified

280 return final_chunks
281
def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
    """Split every document into chunk documents and return them all.

    A document is chunked with ``chunk_markdown`` when the configured
    splitter is ``Splitter.MARKDOWN``, and with ``self.chunk_tokens``
    otherwise. Chunks that are empty or whitespace-only are dropped.

    Args:
        docs: Documents whose ``content`` should be split into chunks.

    Returns:
        The chunk documents of all inputs, concatenated in input order.
    """
    all_chunk_docs = []
    for doc in docs:
        if self.config.splitter == Splitter.MARKDOWN:
            # The markdown chunker works in words rather than tokens, so
            # scale the token-based settings by a rough conversion factor.
            markdown_config = MarkdownChunkConfig(
                chunk_size=int(self.config.chunk_size * 0.75),
                overlap_tokens=int(self.config.overlap * 0.75),
                variation_percent=self.config.chunk_size_variation,
                rollup=True,
            )
            pieces = chunk_markdown(doc.content, markdown_config)
        else:
            pieces = self.chunk_tokens(doc.content)

        # Every chunk receives its own COPY of the parent's metadata, so all
        # chunks of one document share the same metadata (in particular the
        # same metadata.id, which matters later for add_window_ids).
        doc_chunks = []
        for piece in pieces:
            if not piece.strip():
                continue
            chunk_metadata = doc.metadata.model_copy(update={"is_chunk": True})
            doc_chunks.append(Document(content=piece, metadata=chunk_metadata))

        self.add_window_ids(doc_chunks)
        all_chunk_docs.extend(doc_chunks)
    return all_chunk_docs
314
315 def chunk_tokens(
316 self,

Callers 1

splitMethod · 0.95

Calls 6

chunk_tokensMethod · 0.95
add_window_idsMethod · 0.95
chunk_markdownFunction · 0.90
MarkdownChunkConfigClass · 0.90
DocumentClass · 0.90
model_copyMethod · 0.80

Tested by

no test coverage detected