MCPcopy
hub / github.com/mindverse/Second-Me / _force_split_to_chunks

Method _force_split_to_chunks

lpm_kernel/utils.py:633–676  ·  view source on GitHub ↗

Forcibly splits a single sentence that is too long: the sentence is split at punctuation inside it, while trying to keep links and other data that must stay whole intact.

(
        self, text: str, url_strings: List[str] = []
    )

Source from the content-addressed store, hash-verified

631 return chunks
632
def _force_split_to_chunks(
    self, text: str, url_strings: List[str] = []
) -> List[str]:
    """Forcibly split a single over-long sentence into smaller pieces.

    The text is cut at intra-sentence delimiters and sentence terminators,
    and consecutive fragments are packed together while their combined token
    count stays within ``self._chunk_size - self._chunk_overlap``. A fragment
    that exceeds that budget on its own is handed to ``self.safe_split``.

    Args:
        text: The sentence to split.
        url_strings: Forwarded unchanged to ``self.safe_split``. The default
            list object is shared between calls; this method never mutates
            it.

    Returns:
        ``[text]`` when the text has fewer than ``self._chunk_size`` tokens
        (or is empty), otherwise the packed pieces in their original order.
        If the text did not end with a delimiter, the first sentence
        terminator is appended to it before splitting.
    """

    def count_tokens(fragment: str) -> int:
        # Use the same special-token settings for every measurement so that
        # all counts in this method are comparable.
        # NOTE(review): the keyword arguments look like tiktoken's
        # ``Encoding.encode`` -- confirm against the tokenizer in use.
        return len(
            self._tokenizer.encode(
                fragment,
                allowed_special=self._allowed_special,
                disallowed_special=self._disallowed_special,
            )
        )

    if not text or count_tokens(text) < self._chunk_size:
        return [text]

    # Built once instead of being re-concatenated for every character.
    boundaries = self.intra_sentence_delimiters + self.sentence_terminators

    # Guarantee the trailing fragment is flushed by the loop below.
    if text[-1] not in boundaries:
        text += self.sentence_terminators[0]

    budget = self._chunk_size - self._chunk_overlap
    splits: List[str] = []
    cur_sentence, cur_sentence_len = "", 0
    start = 0
    for end, char in enumerate(text, start=1):
        if char not in boundaries:
            continue

        sub_sentence = text[start:end]
        start = end
        sub_sentence_len = count_tokens(sub_sentence)

        if cur_sentence_len + sub_sentence_len <= budget:
            cur_sentence += sub_sentence
            cur_sentence_len += sub_sentence_len
            continue

        # The fragment does not fit: emit what has been accumulated so far.
        if cur_sentence:
            splits.append(cur_sentence)
            cur_sentence, cur_sentence_len = "", 0

        if sub_sentence_len <= budget:
            cur_sentence, cur_sentence_len = sub_sentence, sub_sentence_len
            continue

        # The fragment alone exceeds the budget, so fall back to the forced
        # splitting done by safe_split. Its last piece stays open so that
        # following fragments can still be packed onto it.
        pieces = self.safe_split(sub_sentence, url_strings)
        if pieces:
            splits.extend(pieces[:-1])
            cur_sentence = pieces[-1]
            # Measured in tokens, like every other length in this method.
            cur_sentence_len = count_tokens(cur_sentence) if cur_sentence else 0

    if cur_sentence:
        splits.append(cur_sentence)

    return splits
677
678 def safe_split(self, sub_sentence: str, url_strings: List[str] = []) -> List[str]:
679 sub_sentence_tokens = self._tokenizer.encode(sub_sentence)

Callers 1

_split_to_sentencesMethod · 0.95

Calls 1

safe_splitMethod · 0.95

Tested by

no test coverage detected