Tests that the chunker respects paragraph boundaries when possible, falls back to sentence boundaries otherwise, and splits within a sentence only when no other option is available under the given config.
(chunk_size, overlap_tokens, variation_percent)
| 559 | ], |
| 560 | ) |
| 561 | def test_recursive_chunk(chunk_size, overlap_tokens, variation_percent): |
| 562 | """ |
| 563 | Tests that the chunker respects paragraph boundaries when possible, |
| 564 | then sentence boundaries, and only splits sentences when no other option |
| 565 | is possible under the given config. |
| 566 | """ |
| 567 | # Generate some text with 2 paragraphs, each having 3 sentences of 10 words. |
| 568 | # ~ Each paragraph => 3 sentences => |
| 569 | # each sentence has ~10 words => ~30 words per paragraph. |
| 570 | # So total words ~60. This helps us see chunking behavior across boundaries. |
| 571 | paragraph1 = generate_paragraph( |
| 572 | sentence_count=3, words_per_sentence=10, paragraph_id=1 |
| 573 | ) |
| 574 | paragraph2 = generate_paragraph( |
| 575 | sentence_count=3, words_per_sentence=10, paragraph_id=2 |
| 576 | ) |
| 577 | |
| 578 | # Combine paragraphs with a double-newline |
| 579 | text = paragraph1 + "\n\n" + paragraph2 |
| 580 | |
| 581 | config = MarkdownChunkConfig( |
| 582 | chunk_size=chunk_size, |
| 583 | overlap_tokens=overlap_tokens, |
| 584 | variation_percent=variation_percent, |
| 585 | ) |
| 586 | |
| 587 | chunks = recursive_chunk(text, config) |
| 588 | |
| 589 | # Print a condensed view for manual inspection |
| 590 | print("\n===================================") |
| 591 | print( |
| 592 | f"Config: chunk_size={chunk_size}, " |
| 593 | f"overlap_tokens={overlap_tokens}, " |
| 594 | f"variation_percent={variation_percent}\n" |
| 595 | ) |
| 596 | print("Generated Text (first 30 words):") |
| 597 | print(" ".join(text.split()[:30]), "...") |
| 598 | print("\nChunks:") |
| 599 | print(condensed_chunk_view(chunks, max_words=5)) |
| 600 | print("===================================\n") |
| 601 | |
| 602 | # Basic asserts: |
| 603 | # 1. No chunk should exceed the upper bound in terms of word count |
| 604 | upper_bound = chunk_size * (1 + variation_percent) |
| 605 | for i, chunk in enumerate(chunks): |
| 606 | word_count_in_chunk = len(chunk.split()) |
| 607 | assert word_count_in_chunk <= upper_bound + 5, ( |
| 608 | f"Chunk {i+1} has {word_count_in_chunk} words, " |
| 609 | f"exceeds upper bound (~{upper_bound:.1f})." |
| 610 | ) |
| 611 | |
| 612 | # 2. Check that chunking doesn't produce empty chunks |
| 613 | for i, chunk in enumerate(chunks): |
| 614 | assert chunk.strip(), f"Chunk {i+1} is empty!" |
| 615 | |
| 616 | # 3. (Optional) If chunk_size is >= total words, we expect exactly 1 chunk |
| 617 | total_words = len(text.split()) |
| 618 | if total_words <= chunk_size * (1 + variation_percent): |
nothing calls this directly
no test coverage detected
searching dependent graphs…