(self, sub_sentence: str, url_strings: List[str] = [])
| 676 | return splits |
| 677 | |
def safe_split(self, sub_sentence: str, url_strings: List[str] = []) -> List[str]:
    """Split ``sub_sentence`` into consecutive token chunks without cutting a URL.

    The text is encoded with ``self._tokenizer`` and walked left to right.
    While more than ``self._chunk_size`` tokens remain, a chunk nominally
    spans ``self._chunk_size - self._chunk_overlap`` tokens; once at most
    ``self._chunk_size`` tokens remain they all go into the final chunk.
    Chunks are contiguous: each one starts where the previous one ended.
    If a nominal boundary would land inside an occurrence of one of
    ``url_strings``, the boundary is pushed to the end of that occurrence,
    so such a chunk can be longer than the nominal size.

    Args:
        sub_sentence: Text to split.
        url_strings: Strings whose token sequences must stay inside a single
            chunk. The default list is only read, never mutated; ``None``
            is treated like an empty list.

    Returns:
        The decoded chunks in order; empty when the text encodes to no
        tokens.
    """
    tokens = self._tokenizer.encode(sub_sentence)
    total = len(tokens)

    # Token spans occupied by each protected string.
    # NOTE(review): the (start, end) pairs are used as half-open slice
    # bounds below, as the original code did -- confirm against
    # find_sublist_indices.
    protected = []
    for url_string in url_strings or ():
        protected.extend(
            find_sublist_indices(tokens, self._tokenizer.encode(url_string))
        )
    # Visiting spans by ascending start lets one pass chain through
    # overlapping spans no matter what order url_strings was given in.
    protected.sort(key=lambda span: span[0])

    # Always advance by at least one token; a non-positive step (overlap >=
    # chunk size) would otherwise keep the loop on the same position forever.
    step = max(1, self._chunk_size - self._chunk_overlap)

    chunks = []
    start = 0
    while start < total:
        if start + self._chunk_size >= total:
            # The remainder fits in one chunk, so nothing can be cut.
            end = total
        else:
            end = start + step
            for span_begin, span_end in protected:
                # The boundary falls strictly inside this span (including a
                # span that begins exactly at ``start``): move it to the
                # span's end instead of splitting the protected string.
                if span_begin < end < span_end:
                    end = span_end

        chunks.append(self._tokenizer.decode(tokens[start:end]))
        start = end

    return chunks
| 709 | |
| 710 | |
| 711 | def get_summarize_title_keywords(responses): |
no test coverage detected