(
chunk_size: int, max_chunks: int, min_chunk_chars: int, discard_chunk_chars: int
)
| 77 | ], |
| 78 | ) |
def test_text_token_chunking(
    chunk_size: int, max_chunks: int, min_chunk_chars: int, discard_chunk_chars: int
):
    """Check that ``Parser.chunk_tokens`` honours the configured limits.

    A ``Parser`` is built from a ``ParsingConfig`` carrying the four
    parametrized values (plus a fixed token-encoding model), and is asked to
    chunk the text returned by ``generate_random_text(60)``.  The resulting
    chunks must satisfy:

    * there are no more than ``max_chunks`` of them;
    * none is shorter than ``discard_chunk_chars`` characters;
    * none exceeds ``chunk_size`` tokens (as counted by
      ``parser.num_tokens``) by more than 5.

    ``min_chunk_chars`` is forwarded to the config but is not asserted on
    directly here.
    """
    parser = Parser(
        ParsingConfig(
            chunk_size=chunk_size,
            max_chunks=max_chunks,
            min_chunk_chars=min_chunk_chars,
            discard_chunk_chars=discard_chunk_chars,
            token_encoding_model="text-embedding-3-small",
        )
    )

    pieces = parser.chunk_tokens(generate_random_text(60))

    # Upper bound on how many chunks may be produced.
    assert len(pieces) <= max_chunks

    # No chunk may fall below the discard threshold (in characters).
    assert not any(len(piece) < discard_chunk_chars for piece in pieces)

    # Token budget per chunk, with 5 tokens of slack.
    token_ceiling = chunk_size + 5
    assert not any(parser.num_tokens(piece) > token_ceiling for piece in pieces)
| 98 | |
| 99 | |
| 100 | def test_extract_content(): |
nothing calls this directly
no test coverage detected
searching dependent graphs…