MCPcopy
hub / github.com/langroid/langroid / test_recursive_chunk_enhanced

Function test_recursive_chunk_enhanced

tests/main/test_md_parser.py:625–709  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

623
624
625def test_recursive_chunk_enhanced():
626 config = MarkdownChunkConfig(
627 chunk_size=8,
628 overlap_tokens=2,
629 variation_percent=0.3,
630 )
631
632 # Construct a text with 2 paragraphs, each containing 2 sentences,
633 # plus paragraph markers
634 paragraph1 = (
635 "word1 word2 word3 word4 sentence1.\n"
636 "word5 word6 word7 word8 sentence2. PARA1"
637 )
638 paragraph2 = (
639 "cat1 cat2 cat3 cat4 sentence1.\n" "cat5 cat6 cat7 cat8 sentence2. PARA2"
640 )
641
642 text = paragraph1 + "\n\n" + paragraph2
643
644 # Now chunk it
645 chunks = recursive_chunk(text, config)
646
647 print("\n------------------ ENHANCED CHUNK TEST ------------------")
648 for i, c in enumerate(chunks, 1):
649 print(f"Chunk {i} ({len(c.split())} words):\n{c}\n")
650
651 # A. Check no chunk splits mid-sentence
652 for i, chunk in enumerate(chunks, 1):
653 # We expect every sentence boundary to remain intact:
654 # "sentence1." or "sentence2." should not be truncated in the middle
655 assert (
656 "sentence1." in chunk or "sentence2." in chunk or "PARA" in chunk
657 ), f"Chunk {i} might have truncated a sentence or lost markers: {chunk}"
658
659 # B. Check paragraph markers do not get merged.
660 # We expect that "PARA1" and "PARA2" never appear in the same chunk.
661 for i, chunk in enumerate(chunks, 1):
662 assert not (
663 "PARA1" in chunk and "PARA2" in chunk
664 ), "Found both PARA1 and PARA2 in the same chunk!"
665
666 # C. If there's overlap, ensure it's only from chunk (i) to chunk (i+1).
667 # A naive check: the last 2 tokens of chunk i = the first 2 tokens of chunk i+1,
668 # but chunk i+2 does not contain that same overlap at the start.
669 for i in range(len(chunks) - 1):
670 chunk_i_tokens = chunks[i].split()
671 chunk_i_plus_1_tokens = chunks[i + 1].split()
672
673 overlap_i = chunk_i_tokens[-2:] # last 2 tokens of chunk i
674 start_of_chunk_i_plus_1 = chunk_i_plus_1_tokens[
675 :2
676 ] # first 2 tokens of chunk i+1
677 assert overlap_i == start_of_chunk_i_plus_1, (
678 f"Expected chunk {i+1} to start with overlap tokens from chunk {i}.\n"
679 f"Overlap {overlap_i}, got {start_of_chunk_i_plus_1}"
680 )
681
682 # Now check chunk (i+2) if it exists

Callers

nothing calls this directly

Calls 3

MarkdownChunkConfig (Class) · 0.90
recursive_chunk (Function) · 0.90
split (Method) · 0.45

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…