Method safe_split

lpm_kernel/utils.py:678–708 · view source on GitHub ↗

(self, sub_sentence: str, url_strings: List[str] = [])

Source from the content-addressed store, hash-verified

676	return splits
677
def safe_split(self, sub_sentence: str, url_strings: List[str] = []) -> List[str]:
    """Split ``sub_sentence`` into token chunks without cutting a protected string.

    The sentence is encoded with ``self._tokenizer`` and walked in strides of
    ``self._chunk_size - self._chunk_overlap`` tokens. Once the remaining
    tokens fit within ``self._chunk_size``, they are emitted as the final
    chunk. A chunk boundary that would fall inside an occurrence of one of
    ``url_strings`` is pushed to the end of that occurrence, so such a chunk
    may be longer than the stride. Consecutive chunks never share tokens; the
    overlap setting only shortens the stride.

    Args:
        sub_sentence: Text to split.
        url_strings: Strings whose token occurrences must stay inside a single
            chunk. The default list is only read here, never mutated.

    Returns:
        The decoded chunks in order; an empty list when ``sub_sentence``
        encodes to no tokens.
    """
    tokens = self._tokenizer.encode(sub_sentence)
    total = len(tokens)

    # Collect the token intervals of every occurrence of every protected string.
    # NOTE(review): find_sublist_indices is defined elsewhere; it is consumed
    # here as (begin, end) pairs with ``end`` used as an exclusive slice bound
    # -- confirm against its implementation.
    protected = []
    for url_string in url_strings:
        protected.extend(
            find_sublist_indices(tokens, self._tokenizer.encode(url_string))
        )
    # Sort by start so that one left-to-right pass suffices: after the boundary
    # is pushed to the end of one interval, any interval straddling the new
    # boundary starts later and is therefore still ahead in the scan.
    protected.sort(key=lambda span: span[0])

    # Always advance by at least one token; with overlap >= chunk size the
    # raw stride would be <= 0 and the loop below would never terminate.
    stride = max(1, self._chunk_size - self._chunk_overlap)

    _splits = []
    i = 0
    while i < total:
        if i + self._chunk_size >= total:
            # Everything that is left fits into one chunk.
            slice_end = total
        else:
            slice_end = i + stride

        # Push the boundary past any protected interval it would cut. An
        # interval starting exactly at ``i`` counts too, otherwise a protected
        # string at the very start of a chunk could be split.
        for s_begin, s_end in protected:
            if i <= s_begin < slice_end < s_end:
                slice_end = s_end

        _splits.append(self._tokenizer.decode(tokens[i:slice_end]))
        # The next chunk starts where this one ended.
        i = slice_end

    return _splits
709
710
711	def get_summarize_title_keywords(responses):

_force_split_to_chunksMethod · 0.95

find_sublist_indicesFunction · 0.85

no test coverage detected