(self, text: str)
| 368 | self._disallowed_special = disallowed_special |
| 369 | |
def split_text(self, text: str) -> List[str]:
    """Split ``text`` into a flat list of chunk strings.

    The text is cleaned with ``text_filter``, its URLs are encoded with
    ``encode_urls``, and the result is cut into paragraphs using half of
    ``self._chunk_size`` as the minimum paragraph length. Every paragraph
    is then split into chunks, and the URLs are restored in each chunk
    with ``decode_urls`` before returning.

    Args:
        text: The raw text to split.

    Returns:
        The chunks of all paragraphs, in paragraph order, with URLs decoded.
    """
    # Clean up abnormal whitespace first, e.g. collapse 3 or more
    # consecutive newlines into exactly two.
    cleaned = text_filter(text)

    # Encode URLs so that symbols such as . / ? inside them do not
    # interfere with sentence splitting; the mapping is kept for decoding.
    encoded, string2url_dict = encode_urls(cleaned)
    url_strings = list(string2url_dict.keys())

    # Split by paragraphs according to the splitter's rules.
    paragraphs = self._split_to_paragraphs(
        encoded, min_paragraph_length=self._chunk_size // 2
    )

    encoded_chunks = []
    for position, paragraph in enumerate(paragraphs, start=1):
        pieces = self._split_to_chunks(paragraph, url_strings)
        total = len(paragraphs)
        logging.debug(
            "paragraph %s/%s %s characters: %s",
            position,
            total,
            len(paragraph),
            paragraph,
        )
        logging.debug(
            "paragraph %s/%s split into %s chunks: %s",
            position,
            total,
            len(pieces),
            pieces,
        )
        encoded_chunks.extend(pieces)

    # Restore the original URLs in every chunk.
    return [decode_urls(piece, string2url_dict) for piece in encoded_chunks]
| 406 | |
| 407 | def _split_to_chunks(self, text: str, url_strings: List[str] = []) -> List[str]: |
| 408 | sentences = self._split_to_sentences(text, url_strings) |
no test coverage detected