Test that the chunking logic produces chunks that: (1) have token counts between the lower and upper bounds (except possibly the final chunk); (2) include the header enrichment in each chunk's text; and (3) include the expected overlap between consecutive chunks.
(
chunk_size: int,
rollup: bool,
)
| 312 | @pytest.mark.parametrize("rollup", [False, True]) |
| 313 | @pytest.mark.parametrize("chunk_size", [20, 500]) |
| 314 | def test_chunking_sizes( |
| 315 | chunk_size: int, |
| 316 | rollup: bool, |
| 317 | ): |
| 318 | """ |
| 319 | Test that the chunking logic produces chunks that: |
| 320 | - Have token counts between the lower and upper bounds |
| 321 | (except possibly the final chunk) |
| 322 | - Include the header enrichment in each chunk's text |
| 323 | - Include the expected overlap between consecutive chunks |
| 324 | """ |
| 325 | # Create a long text consisting of 200 repeated tokens ("word") |
| 326 | long_text = " ".join(["word"] * 200) # 200 tokens |
| 327 | md_text = f"""# Chapter 1 |
| 328 | {long_text} |
| 329 | """ |
| 330 | |
| 331 | # Set chunking configuration. |
| 332 | # chunk_size is parametrized; with variation_percent=0.2 the nominal range is |
| 333 | # 0.8x to 1.2x chunk_size (e.g. chunk_size=50 gives between 40 and 60 tokens). |
| 334 | config = MarkdownChunkConfig( |
| 335 | chunk_size=chunk_size, rollup=rollup, overlap_tokens=5, variation_percent=0.2 |
| 336 | ) |
| 337 | |
| 338 | # Produce the enriched chunks from the tree. |
| 339 | chunks = chunk_markdown(md_text, config) |
| 340 | |
| 341 | # Compute the allowed bounds. |
| 342 | lower_bound = config.chunk_size * (1 - config.variation_percent) |
| 343 | upper_bound = config.chunk_size * (1 + config.variation_percent) |
| 344 | |
| 345 | # Verify each chunk's token count. |
| 346 | # For all chunks except possibly the final one, |
| 347 | # we expect at least lower_bound tokens. |
| 348 | for i, chunk in enumerate(chunks): |
| 349 | tokens = count_words(chunk) |
| 350 | if i < len(chunks) - 1: |
| 351 | assert ( |
| 352 | tokens >= lower_bound |
| 353 | ), f"Chunk {i} has {tokens} tokens, expected at least {lower_bound}" |
| 354 | assert ( |
| 355 | tokens <= 2 * upper_bound |
| 356 | ), ( # relaxed check |
| 357 | f"Chunk {i} has {tokens} tokens, expected at most {upper_bound}" |
| 358 | ) |
| 359 | |
| 360 | # Check that each chunk is enriched with the header context. |
| 361 | # Each chunk's text should contain "Chapter 1" since that is the header path. |
| 362 | for i, chunk in enumerate(chunks): |
| 363 | assert "Chapter 1" in chunk, f"Chunk {i} is missing header enrichment" |
| 364 | |
| 365 | # Verify that consecutive chunks share the expected overlap. |
| 366 | # For each consecutive pair of chunks, the last `overlap_tokens` |
| 367 | # tokens of the previous chunk |
| 368 | # should appear among the first tokens of the next chunk. |
| 369 | if len(chunks) > 1: |
| 370 | for i in range(len(chunks) - 1): |
| 371 | prev_tokens = chunks[i].split() |
nothing calls this directly
no test coverage detected
searching dependent graphs…