MCPcopy
hub / github.com/mindverse/Second-Me / _split_to_sentences

Method _split_to_sentences

lpm_kernel/utils.py:454–480  ·  view source on GitHub ↗
(self, text: str, url_strings: List[str] = [])

Source from the content-addressed store, hash-verified

452 return new_paragraphs
453
454 def _split_to_sentences(self, text: str, url_strings: List[str] = []) -> List[str]:
455 # Use capture groups to preserve sentence separators
456 pattern = (
457 f"({'|'.join(re.escape(symbol) for symbol in self.sentence_terminators)})+"
458 )
459 parts = re.split(pattern, text)
460 sentences = []
461 # Merge by skipping steps to ensure punctuation is added to the end of the corresponding sentence
462 if len(parts) % 2 == 1:
463 parts.append("")
464
465 sentences = ["".join(parts[i : i + 2]) for i in range(0, len(parts), 2)]
466
467 sentences = [s for s in sentences if s.strip()]
468
469 if not sentences:
470 return []
471
472 # Fix fragmented sentences, mainly for special cases such as numeric indices, floating-point numbers, etc., which may be separated
473 sentences = self.recombine_broken_sentences(sentences)
474
475 # Split sentences that are too long; in the short term, split directly by character length; future optimizations could consider splitting by punctuation within sentences
476 sentences_list = [
477 self._force_split_to_chunks(s, url_strings) for s in sentences
478 ]
479 sentences = list(chain.from_iterable(sentences_list))
480 return sentences
481
482 def recombine_broken_sentences(self, sentences: List[str]) -> List[str]:
483 """Fix fragmented sentences, mainly for special cases such as numeric indices, floating-point numbers, etc., which may be separated。"""

Callers 1

_split_to_chunksMethod · 0.95

Calls 3

splitMethod · 0.80

Tested by

no test coverage detected