(
splitter: Splitter,
chunk_size: int,
max_chunks: int,
min_chunk_chars: int,
discard_chunk_chars: int,
)
| 18 | ], |
| 19 | ) |
def test_parser(
    splitter: Splitter,
    chunk_size: int,
    max_chunks: int,
    min_chunk_chars: int,
    discard_chunk_chars: int,
):
    """Check that ``Parser.split`` respects the limits set in ``ParsingConfig``.

    Five random documents are split, then the resulting chunks are checked
    for token size, total count, minimum character length and the
    ``is_chunk`` metadata flag. A sixth document is split on its own to
    inspect the neighbour window of its middle chunk.

    Args:
        splitter: Splitting strategy handed to ``ParsingConfig``.
        chunk_size: Target chunk size; compared against token counts below.
        max_chunks: Per-document cap on the number of chunks.
        min_chunk_chars: Forwarded to ``ParsingConfig`` unchanged.
        discard_chunk_chars: Minimum character length every chunk must have.
    """
    config = ParsingConfig(
        splitter=splitter,
        n_neighbor_ids=2,
        chunk_size_variation=0.2,
        chunk_size=chunk_size,
        max_chunks=max_chunks,
        separators=["."],
        min_chunk_chars=min_chunk_chars,
        discard_chunk_chars=discard_chunk_chars,
        token_encoding_model="text-embedding-3-small",
    )

    parser = Parser(config)
    source_docs = [
        Document(content=generate_random_text(500), metadata={"id": doc_id})
        for doc_id in range(5)
    ]

    pieces = parser.split(source_docs)

    # The markdown splitter is allowed the configured relative variation;
    # every other splitter gets a fixed slack of 5 tokens.
    if splitter == Splitter.MARKDOWN:
        token_limit = chunk_size * (1 + config.chunk_size_variation)
    else:
        token_limit = chunk_size + 5

    assert not any(
        parser.num_tokens(piece.content) > token_limit for piece in pieces
    )
    assert len(pieces) <= max_chunks * len(source_docs)
    assert not any(len(piece.content) < discard_chunk_chars for piece in pieces)
    assert all(piece.metadata.is_chunk for piece in pieces)

    # test neighbor chunks
    single_doc = Document(content=generate_random_text(500), metadata={"id": 0})
    single_doc_pieces = parser.split([single_doc])
    piece_count = len(single_doc_pieces)
    full_window = 2 * config.n_neighbor_ids + 1
    # Only checked when there are more chunks than a full window holds;
    # otherwise the check is skipped.
    if piece_count > full_window:
        middle_piece = single_doc_pieces[piece_count // 2]
        assert len(middle_piece.metadata.window_ids) == full_window
| 65 | |
| 66 | |
| 67 | def length_fn(text): |
nothing calls this directly
no test coverage detected
searching dependent graphs…