If a single sentence is too long, it must be forcibly split: break it at punctuation within the sentence, trying to keep links and other data that must stay complete intact.
(
self, text: str, url_strings: List[str] = []
)
| 631 | return chunks |
| 632 | |
def _force_split_to_chunks(
    self, text: str, url_strings: List[str] = []
) -> List[str]:
    """Split an over-long text into chunks at punctuation boundaries.

    The text is cut after every character found in
    ``self.intra_sentence_delimiters`` or ``self.sentence_terminators``.
    Consecutive sub-sentences are packed together while their combined
    token count stays within ``self._chunk_size - self._chunk_overlap``.
    A sub-sentence that exceeds that budget on its own is handed to
    ``self.safe_split`` (presumably a token-based split that keeps the
    given URL strings intact -- confirm against its definition).

    Args:
        text: The text to split.
        url_strings: Passed through unchanged to ``self.safe_split``.
            NOTE(review): the default is a shared mutable list; it is
            never mutated here, but ``safe_split`` should not mutate it
            either.

    Returns:
        ``[text]`` unchanged when its token count is below
        ``self._chunk_size``; otherwise the list of chunks in order. If
        the text does not end with a delimiter, the first sentence
        terminator is appended before splitting, so it shows up at the
        end of the last chunk.
    """
    # TODO: In the future, consider refining the forced splitting logic
    # so that links and other data that require complete information are
    # preserved when a single sentence has to be cut.

    def count_tokens(piece: str) -> int:
        # Always encode with the same special-token policy so every
        # length compared below is measured the same way.
        return len(
            self._tokenizer.encode(
                piece,
                allowed_special=self._allowed_special,
                disallowed_special=self._disallowed_special,
            )
        )

    if count_tokens(text) < self._chunk_size:
        return [text]

    # Built once instead of once per character.
    delimiters = self.intra_sentence_delimiters + self.sentence_terminators
    if text and text[-1] not in delimiters:
        # Guarantee the final sub-sentence is flushed by the loop below.
        text += self.sentence_terminators[0]

    budget = self._chunk_size - self._chunk_overlap
    splits: List[str] = []
    cur_sentence, cur_sentence_len = "", 0
    start = 0
    for end, c in enumerate(text, start=1):
        if c not in delimiters:
            continue

        sub_sentence = text[start:end]
        start = end
        sub_sentence_len = count_tokens(sub_sentence)

        if cur_sentence_len + sub_sentence_len <= budget:
            cur_sentence += sub_sentence
            cur_sentence_len += sub_sentence_len
            continue

        # The accumulated chunk cannot take this sub-sentence: emit it.
        if cur_sentence:
            splits.append(cur_sentence)
            cur_sentence, cur_sentence_len = "", 0

        if sub_sentence_len <= budget:
            cur_sentence, cur_sentence_len = sub_sentence, sub_sentence_len
            continue

        # The sub-sentence alone is over budget, so fall back to the
        # forced splitting logic. This now happens whether or not a chunk
        # was being accumulated; previously an over-long sub-sentence
        # that followed a non-empty chunk was emitted unsplit.
        pieces = self.safe_split(sub_sentence, url_strings)
        if pieces:
            splits.extend(pieces[:-1])
            cur_sentence = pieces[-1]
            # Track the remainder in tokens (not characters) so it is
            # comparable with the other lengths.
            cur_sentence_len = count_tokens(cur_sentence)

    if cur_sentence:
        splits.append(cur_sentence)

    return splits
| 677 | |
| 678 | def safe_split(self, sub_sentence: str, url_strings: List[str] = []) -> List[str]: |
| 679 | sub_sentence_tokens = self._tokenizer.encode(sub_sentence) |
no test coverage detected