Currently splits the original document into paragraphs directly, based on the \n[any space]\n rule.
(
self, text: str, min_paragraph_length: int = 0
)
| 412 | return chunks |
| 413 | |
def _split_to_paragraphs(
    self, text: str, min_paragraph_length: int = 0
) -> List[str]:
    r"""Split ``text`` into paragraphs at blank lines and merge fragments.

    A paragraph boundary is one or more line-break characters, optionally
    followed by whitespace characters, and closed by one more line-break
    character (the ``\n[any space]\n`` rule). The characters come from
    ``self.line_break_characters`` and ``self.whitespace_characters``.

    Consecutive pieces are then merged until the accumulated paragraph
    both reaches ``min_paragraph_length`` and ends with one of
    ``self.sentence_terminators`` (trailing whitespace is ignored for
    that check).

    Args:
        text: The document text to split.
        min_paragraph_length: Minimum size a paragraph must reach before
            it may be emitted, measured as
            ``len(self._tokenizer.encode(...))`` summed over its pieces.

    Returns:
        The merged paragraphs, each stripped of surrounding whitespace.
        An empty list when ``text`` contains no non-whitespace content.
    """
    # Escape the configured characters so that regex metacharacters
    # (``]``, ``\``, ``^``, ``-``) cannot alter the character classes.
    line_break_class = re.escape("".join(self.line_break_characters))

    if not line_break_class:
        # Nothing to split on: the whole text is a single candidate.
        pieces = [text]
    else:
        whitespace_class = re.escape("".join(self.whitespace_characters))
        # An empty ``[]`` would swallow the following tokens into the
        # class, so drop the optional-whitespace part entirely instead.
        optional_whitespace = (
            f"[{whitespace_class}]*" if whitespace_class else ""
        )
        # NOTE(review): the capturing group itself is repeated, so
        # re.split reports only its final repetition as the separator;
        # earlier repetitions of a long separator run are discarded.
        pieces = re.split(
            f"([{line_break_class}]+{optional_whitespace}[{line_break_class}])+",
            text,
        )

    # re.split yields [text, sep, text, ...]; prepend an empty separator
    # so the items pair up as (separator, paragraph).
    if len(pieces) % 2 == 1:
        pieces = [""] + pieces
    pairs = [
        (sep, body)
        for sep, body in zip(pieces[0::2], pieces[1::2])
        if (sep + body).strip()
    ]

    terminators = tuple(self.sentence_terminators)
    merged: List[str] = []
    current, current_len = "", 0

    # Merge short or broken paragraphs.
    for sep, body in pairs:
        # Paragraphs are stored stripped, so trailing whitespace must not
        # make a finished sentence look broken. The unstripped check is
        # kept so a whitespace terminator, if configured, still matches.
        ends_sentence = current.endswith(terminators) or (
            current.rstrip().endswith(terminators)
        )
        if current_len >= min_paragraph_length and ends_sentence:
            merged.append(current.strip())
            current, current_len = "", 0

        chunk = sep + body
        current_len += len(self._tokenizer.encode(chunk))
        current += chunk

    if current:
        merged.append(current.strip())

    return merged
| 453 | |
| 454 | def _split_to_sentences(self, text: str, url_strings: List[str] = []) -> List[str]: |
| 455 | # Use capture groups to preserve sentence separators |