(self, docs: List[Document])
| 202 | c.metadata.is_chunk = True |
| 203 | |
def split_simple(self, docs: List[Document]) -> List[Document]:
    """Chunk each document by splitting on the first configured separator.

    Only ``self.config.separators[0]`` is used. Documents whose content is
    blank are skipped, and blank pieces produced by the split are dropped.

    Args:
        docs: Documents to split into chunks.

    Returns:
        A flat list of chunk documents, in input order. Each chunk carries
        a copy of its parent's metadata with ``is_chunk`` set to True.

    Raises:
        ValueError: If ``self.config.separators`` is empty.
    """
    if len(self.config.separators) == 0:
        raise ValueError("Must have at least one separator")

    all_chunks = []
    for doc in docs:
        if not doc.content.strip():
            continue  # nothing to split in a blank document

        cleaned = remove_extra_whitespace(doc.content)
        pieces = cleaned.split(self.config.separators[0])

        # Every chunk receives a COPY of the parent's metadata, so all
        # chunks of one document share the same metadata (notably the same
        # metadata.id, which add_window_ids depends on).
        doc_chunks = []
        for piece in pieces:
            if not piece.strip():
                continue
            chunk_metadata = doc.metadata.model_copy(update={"is_chunk": True})
            doc_chunks.append(Document(content=piece, metadata=chunk_metadata))

        self.add_window_ids(doc_chunks)
        all_chunks.extend(doc_chunks)
    return all_chunks
| 228 | |
| 229 | def split_para_sentence(self, docs: List[Document]) -> List[Document]: |
| 230 | chunks = docs |
no test coverage detected