MCPcopy
hub / github.com/langroid/langroid / test_markdown_chunking

Function test_markdown_chunking

tests/main/test_md_parser.py:233–309  ·  view source on GitHub ↗

Given a Markdown document with sections and sub-sections, this test verifies that: (1) the tree is built correctly from the document; (2) the chunking process produces distinct chunks with enriched header context; and (3) a header-only node does not duplicate the header in its own chunk.

(
    sample_markdown,
    markdown_sections,
    chunk_size_factor: int,
    rollup: bool,
)

Source from the content-addressed store, hash-verified

231@pytest.mark.parametrize("chunk_size_factor", [1.2, 100])
232@pytest.mark.parametrize("rollup", [True, False])
233def test_markdown_chunking(
234 sample_markdown,
235 markdown_sections,
236 chunk_size_factor: int,
237 rollup: bool,
238):
239 """
240 Given a Markdown document with sections and sub-sections, this test verifies that:
241 - The tree is built correctly from the document.
242 - The chunking process produces distinct chunks with enriched header context.
243 - A header-only node does not duplicate the header in its own chunk.
244
245 The sample document has:
246 - Chapter 1 with a preamble.
247 - Section 1.1 with content.
248 - Section 1.2 with bullet content.
249 - Chapter 2 with its own content.
250 """
251
252 ch1, sec1_1, sec1_2, ch2 = markdown_sections
253 chunk_size = chunk_size_factor * count_words(ch1.content)
254 config = MarkdownChunkConfig(
255 chunk_size=chunk_size,
256 overlap_tokens=5,
257 variation_percent=0.2,
258 rollup=rollup,
259 )
260
261 # Structure-aware chunking of the text into enriched chunks.
262 chunks: List[str] = chunk_markdown(sample_markdown, config)
263
264 if rollup and chunk_size > count_words(sample_markdown):
265 assert len(chunks) == 1, f"Expected 1 chunk, got {len(chunks)}"
266 assert (
267 chunks[0].split() == sample_markdown.split()
268 ), "Chunk does not match original Markdown"
269 # check that line-breaks in each section content are preserved
270 for section in markdown_sections:
271 assert (
272 section.content in chunks[0]
273 ), f"Section content not found in the chunk: {section.content}"
274
275 if not rollup or chunk_size < count_words(sample_markdown):
276 # Based on our document structure, we expect four chunks:
277 # 1. Chapter 1's preamble content (enriched with prefix "# Chapter 1")
278 # 2. Section 1.1 content (enriched with prefix "# Chapter 1 \n\n # Section 1.1")
279 # 3. Section 1.2 content (enriched with prefix "# Chapter 1 \n\n # Section 1.2")
280 # 4. Chapter 2 content (enriched with prefix "# Chapter 2")
281 assert len(chunks) == 4, f"Expected 4 chunks, got {len(chunks)}"
282
283 assert (
284 chunks[0].split() == ch1.to_markdown().split()
285 ), "Chunk 1 does not match Chapter 1 preamble"
286 assert (
287 ch1.content.strip() in chunks[0]
288 ), "Chapter 1 content not preserved in Chunk 1"
289
290 assert chunks[1].split() == (

Callers

nothing calls this directly

Calls 5

count_words — Function · 0.90
MarkdownChunkConfig — Class · 0.90
chunk_markdown — Function · 0.90
to_markdown — Method · 0.80
split — Method · 0.45

Tested by

no test coverage detected

Used in the wild: real call sites across dependent graphs

searching dependent graphs…