Given a Markdown document with sections and sub-sections, this test verifies that: - The tree is built correctly from the document. - The chunking process produces distinct chunks with enriched header context. - A header-only node does not duplicate the header in its own chunk.
(
sample_markdown,
markdown_sections,
chunk_size_factor: int,
rollup: bool,
)
| 231 | @pytest.mark.parametrize("chunk_size_factor", [1.2, 100]) |
| 232 | @pytest.mark.parametrize("rollup", [True, False]) |
| 233 | def test_markdown_chunking( |
| 234 | sample_markdown, |
| 235 | markdown_sections, |
| 236 | chunk_size_factor: int, |
| 237 | rollup: bool, |
| 238 | ): |
| 239 | """ |
| 240 | Given a Markdown document with sections and sub-sections, this test verifies that: |
| 241 | - The tree is built correctly from the document. |
| 242 | - The chunking process produces distinct chunks with enriched header context. |
| 243 | - A header-only node does not duplicate the header in its own chunk. |
| 244 | |
| 245 | The sample document has: |
| 246 | - Chapter 1 with a preamble. |
| 247 | - Section 1.1 with content. |
| 248 | - Section 1.2 with bullet content. |
| 249 | - Chapter 2 with its own content. |
| 250 | """ |
| 251 | |
| 252 | ch1, sec1_1, sec1_2, ch2 = markdown_sections |
| 253 | chunk_size = chunk_size_factor * count_words(ch1.content) |
| 254 | config = MarkdownChunkConfig( |
| 255 | chunk_size=chunk_size, |
| 256 | overlap_tokens=5, |
| 257 | variation_percent=0.2, |
| 258 | rollup=rollup, |
| 259 | ) |
| 260 | |
| 261 | # Structure-aware chunking of the text into enriched chunks. |
| 262 | chunks: List[str] = chunk_markdown(sample_markdown, config) |
| 263 | |
| 264 | if rollup and chunk_size > count_words(sample_markdown): |
| 265 | assert len(chunks) == 1, f"Expected 1 chunk, got {len(chunks)}" |
| 266 | assert ( |
| 267 | chunks[0].split() == sample_markdown.split() |
| 268 | ), "Chunk does not match original Markdown" |
| 269 | # check that line-breaks in each section content are preserved |
| 270 | for section in markdown_sections: |
| 271 | assert ( |
| 272 | section.content in chunks[0] |
| 273 | ), f"Section content not found in the chunk: {section.content}" |
| 274 | |
| 275 | if not rollup or chunk_size < count_words(sample_markdown): |
| 276 | # Based on our document structure, we expect four chunks: |
| 277 | # 1. Chapter 1's preamble content (enriched with prefix "# Chapter 1") |
| 278 | # 2. Section 1.1 content (enriched with prefix "# Chapter 1 \n\n # Section 1.1") |
| 279 | # 3. Section 1.2 content (enriched with prefix "# Chapter 1 \n\n # Section 1.2") |
| 280 | # 4. Chapter 2 content (enriched with prefix "# Chapter 2") |
| 281 | assert len(chunks) == 4, f"Expected 4 chunks, got {len(chunks)}" |
| 282 | |
| 283 | assert ( |
| 284 | chunks[0].split() == ch1.to_markdown().split() |
| 285 | ), "Chunk 1 does not match Chapter 1 preamble" |
| 286 | assert ( |
| 287 | ch1.content.strip() in chunks[0] |
| 288 | ), "Chapter 1 content not preserved in Chunk 1" |
| 289 | |
| 290 | assert chunks[1].split() == ( |
nothing calls this directly
no test coverage detected
searching dependent graphs…