(self, docs: List[Document])
| 280 | return final_chunks |
| 281 | |
def split_chunk_tokens(self, docs: List[Document]) -> List[Document]:
    """Break each document into chunk-level documents.

    For every input document the content is split either with
    ``chunk_markdown`` (when ``self.config.splitter`` is
    ``Splitter.MARKDOWN``) or with ``self.chunk_tokens`` (any other
    splitter). Chunks that are empty or whitespace-only are dropped; each
    remaining chunk becomes a new ``Document`` carrying its own copy of the
    source document's metadata with ``is_chunk=True``.

    Args:
        docs: Documents whose ``content`` should be split.

    Returns:
        A flat list of chunk documents, in the order of the input
        documents and, within each document, in the order the splitter
        produced the chunks.
    """
    all_chunk_docs: List[Document] = []
    for doc in docs:
        is_markdown = self.config.splitter == Splitter.MARKDOWN
        if not is_markdown:
            pieces = self.chunk_tokens(doc.content)
        else:
            text = doc.content
            # The markdown chunker works in words rather than tokens, so
            # the token-based settings are scaled by a rough
            # tokens-to-words adjustment factor.
            md_config = MarkdownChunkConfig(
                chunk_size=int(self.config.chunk_size * 0.75),
                overlap_tokens=int(self.config.overlap * 0.75),
                variation_percent=self.config.chunk_size_variation,
                rollup=True,
            )
            pieces = chunk_markdown(text, md_config)

        # Every chunk gets a COPY of the parent's metadata, so all chunks of
        # one document share the same metadata values (notably metadata.id,
        # which add_window_ids relies on afterwards).
        doc_chunks = []
        for piece in pieces:
            if piece.strip() == "":
                continue
            chunk_meta = doc.metadata.model_copy(update={"is_chunk": True})
            doc_chunks.append(Document(content=piece, metadata=chunk_meta))

        # Window ids are assigned per source document, before merging.
        self.add_window_ids(doc_chunks)
        all_chunk_docs.extend(doc_chunks)
    return all_chunk_docs
| 314 | |
| 315 | def chunk_tokens( |
| 316 | self, |
no test coverage detected