()
| 623 | |
| 624 | |
| 625 | def test_recursive_chunk_enhanced(): |
| 626 | config = MarkdownChunkConfig( |
| 627 | chunk_size=8, |
| 628 | overlap_tokens=2, |
| 629 | variation_percent=0.3, |
| 630 | ) |
| 631 | |
| 632 | # Construct a text with 2 paragraphs, each containing 2 sentences, |
| 633 | # plus paragraph markers |
| 634 | paragraph1 = ( |
| 635 | "word1 word2 word3 word4 sentence1.\n" |
| 636 | "word5 word6 word7 word8 sentence2. PARA1" |
| 637 | ) |
| 638 | paragraph2 = ( |
| 639 | "cat1 cat2 cat3 cat4 sentence1.\n" "cat5 cat6 cat7 cat8 sentence2. PARA2" |
| 640 | ) |
| 641 | |
| 642 | text = paragraph1 + "\n\n" + paragraph2 |
| 643 | |
| 644 | # Now chunk it |
| 645 | chunks = recursive_chunk(text, config) |
| 646 | |
| 647 | print("\n------------------ ENHANCED CHUNK TEST ------------------") |
| 648 | for i, c in enumerate(chunks, 1): |
| 649 | print(f"Chunk {i} ({len(c.split())} words):\n{c}\n") |
| 650 | |
| 651 | # A. Check no chunk splits mid-sentence |
| 652 | for i, chunk in enumerate(chunks, 1): |
| 653 | # We expect every sentence boundary to remain intact: |
| 654 | # "sentence1." or "sentence2." should not be truncated in the middle |
| 655 | assert ( |
| 656 | "sentence1." in chunk or "sentence2." in chunk or "PARA" in chunk |
| 657 | ), f"Chunk {i} might have truncated a sentence or lost markers: {chunk}" |
| 658 | |
| 659 | # B. Check paragraph markers do not get merged. |
| 660 | # We expect that "PARA1" and "PARA2" never appear in the same chunk. |
| 661 | for i, chunk in enumerate(chunks, 1): |
| 662 | assert not ( |
| 663 | "PARA1" in chunk and "PARA2" in chunk |
| 664 | ), "Found both PARA1 and PARA2 in the same chunk!" |
| 665 | |
| 666 | # C. If there's overlap, ensure it's only from chunk (i) to chunk (i+1). |
| 667 | # A naive check: the last 2 tokens of chunk i = the first 2 tokens of chunk i+1, |
| 668 | # but chunk i+2 does not contain that same overlap at the start. |
| 669 | for i in range(len(chunks) - 1): |
| 670 | chunk_i_tokens = chunks[i].split() |
| 671 | chunk_i_plus_1_tokens = chunks[i + 1].split() |
| 672 | |
| 673 | overlap_i = chunk_i_tokens[-2:] # last 2 tokens of chunk i |
| 674 | start_of_chunk_i_plus_1 = chunk_i_plus_1_tokens[ |
| 675 | :2 |
| 676 | ] # first 2 tokens of chunk i+1 |
| 677 | assert overlap_i == start_of_chunk_i_plus_1, ( |
| 678 | f"Expected chunk {i+1} to start with overlap tokens from chunk {i}.\n" |
| 679 | f"Overlap {overlap_i}, got {start_of_chunk_i_plus_1}" |
| 680 | ) |
| 681 | |
| 682 | # Now check chunk (i+2) if it exists |
nothing calls this directly
no test coverage detected
searching dependent graphs…