hub / github.com/langroid/langroid / test_recursive_chunk_enhanced

Function test_recursive_chunk_enhanced

tests/main/test_md_parser.py:625–709 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

623
624
625	def test_recursive_chunk_enhanced():
626	config = MarkdownChunkConfig(
627	chunk_size=8,
628	overlap_tokens=2,
629	variation_percent=0.3,
630	)
631
632	# Construct a text with 2 paragraphs, each containing 2 sentences,
633	# plus paragraph markers
634	paragraph1 = (
635	"word1 word2 word3 word4 sentence1.\n"
636	"word5 word6 word7 word8 sentence2. PARA1"
637	)
638	paragraph2 = (
639	"cat1 cat2 cat3 cat4 sentence1.\n" "cat5 cat6 cat7 cat8 sentence2. PARA2"
640	)
641
642	text = paragraph1 + "\n\n" + paragraph2
643
644	# Now chunk it
645	chunks = recursive_chunk(text, config)
646
647	print("\n------------------ ENHANCED CHUNK TEST ------------------")
648	for i, c in enumerate(chunks, 1):
649	print(f"Chunk {i} ({len(c.split())} words):\n{c}\n")
650
651	# A. Check no chunk splits mid-sentence
652	for i, chunk in enumerate(chunks, 1):
653	# We expect every sentence boundary to remain intact:
654	# "sentence1." or "sentence2." should not be truncated in the middle
655	assert (
656	"sentence1." in chunk or "sentence2." in chunk or "PARA" in chunk
657	), f"Chunk {i} might have truncated a sentence or lost markers: {chunk}"
658
659	# B. Check paragraph markers do not get merged.
660	# We expect that "PARA1" and "PARA2" never appear in the same chunk.
661	for i, chunk in enumerate(chunks, 1):
662	assert not (
663	"PARA1" in chunk and "PARA2" in chunk
664	), "Found both PARA1 and PARA2 in the same chunk!"
665
666	# C. If there's overlap, ensure it's only from chunk (i) to chunk (i+1).
667	# A naive check: the last 2 tokens of chunk i = the first 2 tokens of chunk i+1,
668	# but chunk i+2 does not contain that same overlap at the start.
669	for i in range(len(chunks) - 1):
670	chunk_i_tokens = chunks[i].split()
671	chunk_i_plus_1_tokens = chunks[i + 1].split()
672
673	overlap_i = chunk_i_tokens[-2:] # last 2 tokens of chunk i
674	start_of_chunk_i_plus_1 = chunk_i_plus_1_tokens[
675	:2
676	] # first 2 tokens of chunk i+1
677	assert overlap_i == start_of_chunk_i_plus_1, (
678	f"Expected chunk {i+1} to start with overlap tokens from chunk {i}.\n"
679	f"Overlap {overlap_i}, got {start_of_chunk_i_plus_1}"
680	)
681
682	# Now check chunk (i+2) if it exists

Callers

nothing calls this directly

Calls 3

MarkdownChunkConfig (Class) · 0.90

recursive_chunk (Function) · 0.90

split (Method) · 0.45

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…