MCPcopy
hub / github.com/mindverse/Second-Me / safe_split

Method safe_split

lpm_kernel/utils.py:678–708  ·  view source on GitHub ↗
(self, sub_sentence: str, url_strings: List[str] = [])

Source from the content-addressed store, hash-verified

676 return splits
677
def safe_split(self, sub_sentence: str, url_strings: List[str] = []) -> List[str]:
    """Split ``sub_sentence`` into token-sized chunks without cutting a protected string.

    The sentence is encoded with ``self._tokenizer``, walked in strides of
    ``self._chunk_size - self._chunk_overlap`` tokens, and every chunk is
    decoded back to text. When the remaining tokens fit within
    ``self._chunk_size`` they are emitted as one final chunk. Each chunk
    starts where the previous one ended, so consecutive chunks do not
    share tokens.

    Args:
        sub_sentence: Text to split.
        url_strings: Strings (e.g. URLs) that must stay inside a single
            chunk. A chunk whose end would fall inside an occurrence of
            one of them is extended to the end of that occurrence. The
            default list is only iterated, never mutated.

    Returns:
        The decoded chunks, in order. Empty when the sentence encodes to
        no tokens.
    """
    tokens = self._tokenizer.encode(sub_sentence)
    total = len(tokens)

    # Token intervals occupied by every protected string.
    # NOTE(review): assumes find_sublist_indices yields (begin, end) pairs
    # with an exclusive end -- confirm against its definition.
    protected = []
    for url_string in url_strings:
        protected.extend(
            find_sublist_indices(tokens, self._tokenizer.encode(url_string))
        )
    # Visit intervals left to right so that an extension caused by one
    # interval is still checked against every interval that begins later;
    # in url_strings order an earlier-listed interval could be missed.
    protected.sort(key=lambda span: span[0])

    # A non-positive stride (overlap >= chunk size) would never advance
    # the cursor and loop forever; fall back to whole-chunk steps.
    stride = self._chunk_size - self._chunk_overlap
    if stride <= 0:
        stride = max(self._chunk_size, 1)

    _splits = []
    i = 0
    while i < total:
        if i + self._chunk_size >= total:
            # The remainder fits in one chunk: take it all.
            slice_end = total
        else:
            slice_end = i + stride

        # Push the cut past any protected interval it would land inside.
        # Only the cut position matters, so an interval that begins
        # exactly at the chunk start is protected as well.
        for s_begin, s_end in protected:
            if s_begin < slice_end < s_end:
                slice_end = s_end

        _splits.append(self._tokenizer.decode(tokens[i:slice_end]))
        # The next chunk starts where this one ended.
        i = slice_end

    return _splits
709
710
711def get_summarize_title_keywords(responses):

Callers 1

Calls 1

find_sublist_indicesFunction · 0.85

Tested by

no test coverage detected