Split incoming text and return chunks.
(self, text: str)
| 140 | self._disallowed_special = disallowed_special |
| 141 | |
def split_text(self, text: str) -> List[str]:
    """Split incoming text into overlapping, token-bounded chunks.

    The text is passed through ``text_filter``, encoded with
    ``self._tokenizer``, and walked with a window of ``self._chunk_size``
    tokens that advances by ``self._chunk_size - self._chunk_overlap``
    tokens per step. Each window is decoded, stripped, passed through
    ``self._cut_meaningless_head_tail`` and kept only if it is non-empty.

    Args:
        text: The raw text to split.

    Returns:
        The non-empty chunk strings, in the order they occur in ``text``.

    Raises:
        ValueError: If ``self._chunk_overlap`` is not smaller than
            ``self._chunk_size``, because the window could never advance.
    """
    stride = self._chunk_size - self._chunk_overlap
    if stride <= 0:
        # A zero or negative stride would keep start_idx from ever reaching
        # the end of the token list, so the loop below would never finish.
        raise ValueError(
            f"chunk_overlap ({self._chunk_overlap}) must be smaller than "
            f"chunk_size ({self._chunk_size})"
        )

    # Filter content with a large number of whitespace characters in the
    # input text to increase the proportion of effective content in chunks.
    text = text_filter(text)
    input_ids = self._tokenizer.encode(
        text,
        allowed_special=self._allowed_special,
        disallowed_special=self._disallowed_special,
    )
    total_tokens = len(input_ids)

    splits = []
    start_idx = 0
    while start_idx < total_tokens:
        cur_idx = min(start_idx + self._chunk_size, total_tokens)
        s = self._tokenizer.decode(input_ids[start_idx:cur_idx]).strip()
        if s:
            s = self._cut_meaningless_head_tail(s)
        if s:
            splits.append(s)
        if cur_idx == total_tokens:
            # This window already reached the last token; another step would
            # only re-emit overlap tokens contained in the chunk just built.
            break
        start_idx += stride
    logging.debug("finished split_text(): %s splits", len(splits))
    return splits
| 165 | |
| 166 | def _cut_meaningless_head_tail(self, text: str) -> str: |
| 167 | # Only split when there are multiple newlines, as parsing of PDF/Word often contains false newlines |
no test coverage detected