Consecutive chunks share exactly chunk_overlap words.
(self)
| 56 | assert chunks[2]["chunk_index"] == 2 |
| 57 | |
def test_overlap(self):
    """Check that the first two chunks overlap by ``chunk_overlap`` words.

    Twenty distinct whitespace-separated tokens are chunked with
    ``chunk_size=10`` and ``chunk_overlap=3``; the last three tokens of the
    first chunk must equal the first three tokens of the second chunk.
    """
    # Distinct tokens make any misaligned overlap show up as a mismatch.
    tokens = [f"w{i}" for i in range(20)]
    chunker = TextChunker(
        config=ChunkingConfig(
            chunk_size=10,
            chunk_overlap=3,
            min_chunk_size=3,
            max_chunk_chars=None,
        )
    )

    chunks = chunker.load_parse_and_chunk(
        source=" ".join(tokens),
        source_id="doc1",
        source_column="text",
    )
    # At least two chunks are required for there to be a boundary to check.
    assert len(chunks) >= 2

    first, second = (chunk["text"].split() for chunk in chunks[:2])

    # Tail of the first chunk must reappear as the head of the second.
    assert first[-3:] == second[:3]
| 75 | |
| 76 | def test_min_chunk_size_filters_small_trailing(self): |
| 77 | """Trailing chunk smaller than min_chunk_size is dropped.""" |
nothing calls this directly
no test coverage detected