Method split_chunk_tokens

langroid/parsing/parser.py:282–313 · view source on GitHub ↗

(self, docs: List[Document])

Source from the content-addressed store, hash-verified

280	return final_chunks
281
def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
    """Break every document into chunk documents and return them in one list.

    The chunking strategy is picked per document from ``self.config.splitter``:
    ``Splitter.MARKDOWN`` routes the content through ``chunk_markdown``;
    any other value uses ``self.chunk_tokens``. Chunks that are empty or
    whitespace-only are discarded.

    Args:
        docs: Documents whose ``content`` should be chunked.

    Returns:
        The chunk documents of all inputs, in input order. Each chunk carries
        a copy of its source document's metadata with ``is_chunk=True``, and
        each document's chunks have been passed to ``self.add_window_ids``.
    """
    collected = []
    for doc in docs:
        if self.config.splitter == Splitter.MARKDOWN:
            text = doc.content
            # The markdown chunker counts words rather than tokens, so the
            # token-based settings are scaled by a rough 0.75 factor.
            md_config = MarkdownChunkConfig(
                chunk_size=int(self.config.chunk_size * 0.75),
                overlap_tokens=int(self.config.overlap * 0.75),
                variation_percent=self.config.chunk_size_variation,
                rollup=True,
            )
            pieces = chunk_markdown(text, md_config)
        else:
            pieces = self.chunk_tokens(doc.content)

        # Every chunk gets its own COPY of the parent's metadata, so all
        # chunks of one document share the same metadata (notably the same
        # metadata.id, which add_window_ids relies on later).
        doc_chunks = []
        for piece in pieces:
            if not piece.strip():
                # skip blank / whitespace-only chunks
                continue
            chunk_meta = doc.metadata.model_copy(update={"is_chunk": True})
            doc_chunks.append(Document(content=piece, metadata=chunk_meta))

        self.add_window_ids(doc_chunks)
        collected.extend(doc_chunks)
    return collected
314
315	def chunk_tokens(
316	self,

splitMethod · 0.95

chunk_tokensMethod · 0.95

add_window_idsMethod · 0.95

chunk_markdownFunction · 0.90

MarkdownChunkConfigClass · 0.90

DocumentClass · 0.90

model_copyMethod · 0.80

no test coverage detected