| 400 | return chunks |
| 401 | |
def split(self, docs: List[Document]) -> List[Document]:
    """Split the not-yet-chunked documents using the configured splitter.

    Documents whose ``metadata.is_chunk`` is truthy are passed through
    untouched; all other documents are handed to the splitting method
    selected by ``self.config.splitter``.

    Side effect: any document whose ``metadata.id`` is ``None`` or ``""``
    gets a fresh id from ``ObjectRegistry.new_id()`` assigned in place.

    Args:
        docs: Documents to process.

    Returns:
        The already-chunked documents (in input order) followed by the
        chunks produced from the remaining documents. An empty input
        yields an empty list.

    Raises:
        ValueError: If ``self.config.splitter`` matches none of the
            handled ``Splitter`` values.
    """
    if len(docs) == 0:
        return []

    # Every doc needs an id in its metadata: it is what lets us tell docs
    # apart later in add_window_ids.
    for doc in docs:
        if doc.metadata.id in (None, ""):
            doc.metadata.id = ObjectRegistry.new_id()

    # Docs flagged as chunks are already splits -- never split them again.
    already_chunked = []
    to_split = []
    for doc in docs:
        if doc.metadata.is_chunk:
            already_chunked.append(doc)
        else:
            to_split.append(doc)

    if len(to_split) == 0:
        return already_chunked

    # Dispatch on the configured splitter; MARKDOWN and TOKENS share the
    # token-based splitting method.
    splitter = self.config.splitter
    if splitter == Splitter.MARKDOWN or splitter == Splitter.TOKENS:
        new_chunks = self.split_chunk_tokens(to_split)
    elif splitter == Splitter.PARA_SENTENCE:
        new_chunks = self.split_para_sentence(to_split)
    elif splitter == Splitter.SIMPLE:
        new_chunks = self.split_simple(to_split)
    else:
        raise ValueError(f"Unknown splitter: {self.config.splitter}")

    return already_chunked + new_chunks