(self, text: str, url_strings: List[str] = [])
| 452 | return new_paragraphs |
| 453 | |
def _split_to_sentences(self, text: str, url_strings: List[str] = []) -> List[str]:
    """Split ``text`` into sentences, keeping terminators attached.

    The text is split on runs of one or more symbols from
    ``self.sentence_terminators``. Each run is appended, in full, to the
    sentence that precedes it. Whitespace-only pieces are dropped, the
    remaining pieces are passed through ``self.recombine_broken_sentences``
    and every resulting sentence is handed to ``self._force_split_to_chunks``.

    Args:
        text: The text to split.
        url_strings: Forwarded unchanged to ``self._force_split_to_chunks``.

    Returns:
        The flattened list of chunks, in original order; ``[]`` when the
        text contains no non-whitespace sentence.
    """
    # NOTE(review): the ``[]`` default is one shared list object. It is only
    # read and forwarded here - confirm ``_force_split_to_chunks`` never
    # mutates it.

    # Empty terminators are ignored: an alternative that matches the empty
    # string would make re.split cut the text at every position.
    escaped = [re.escape(symbol) for symbol in self.sentence_terminators if symbol]

    if escaped:
        # The capture group wraps the WHOLE repeated run. With "(a|b)+" the
        # group only retains its last repetition, so "..." or "?!" would be
        # returned as a single character and the rest silently lost.
        pattern = f"((?:{'|'.join(escaped)})+)"
        parts = re.split(pattern, text)
    else:
        # No usable terminators: treat the text as one sentence.
        parts = [text]

    # re.split with one capture group yields [text, sep, text, sep, ..., text];
    # pad to an even length so every text piece pairs with a separator.
    if len(parts) % 2 == 1:
        parts.append("")

    # Glue each piece of text to the terminator run that follows it.
    sentences = ["".join(parts[i : i + 2]) for i in range(0, len(parts), 2)]
    sentences = [s for s in sentences if s.strip()]

    if not sentences:
        return []

    # Fix fragmented sentences, mainly for special cases such as numeric
    # indices and floating-point numbers, which may have been separated.
    sentences = self.recombine_broken_sentences(sentences)

    # Split sentences that are too long. For now this delegates to
    # _force_split_to_chunks; splitting on punctuation inside the sentence
    # is a possible future refinement.
    sentences_list = [
        self._force_split_to_chunks(s, url_strings) for s in sentences
    ]
    return list(chain.from_iterable(sentences_list))
| 481 | |
| 482 | def recombine_broken_sentences(self, sentences: List[str]) -> List[str]: |
| 483 | """Fix fragmented sentences, mainly for special cases such as numeric indices, floating-point numbers, etc., which may be separated。""" |
no test coverage detected