MCPcopy
hub / github.com/mindverse/Second-Me / _split_to_paragraphs

Method _split_to_paragraphs

lpm_kernel/utils.py:414–452  ·  view source on GitHub ↗

Currently splits the original document into paragraphs directly, based on the \n[any whitespace]\n rule.

(
        self, text: str, min_paragraph_length: int = 0
    )

Source from the content-addressed store, hash-verified

412 return chunks
413
414 def _split_to_paragraphs(
415 self, text: str, min_paragraph_length: int = 0
416 ) -> List[str]:
417 """Currently split the original document into paragraphs directly based on the \n[any space]\n rule."""
418 line_break_characters = "".join(self.line_break_characters)
419 whitespace_characters = "".join(self.whitespace_characters)
420 paragraphs = re.split(
421 f"([{line_break_characters}]+[{whitespace_characters}]*[{line_break_characters}])+",
422 text,
423 )
424 if len(paragraphs) % 2 == 1:
425 paragraphs = [""] + paragraphs
426 paragraphs = [
427 (paragraphs[i], paragraphs[i + 1])
428 for i in range(0, len(paragraphs), 2)
429 if (paragraphs[i] + paragraphs[i + 1]).strip()
430 ]
431
432 if not paragraphs:
433 return []
434
435 new_paragraphs = []
436 cur_paragraph, cur_paragraph_len = "", 0
437
438 # merge short or broken paragraphs
439 for sep, paragraph in paragraphs:
440 if cur_paragraph_len >= min_paragraph_length and any(
441 cur_paragraph.endswith(sym) for sym in self.sentence_terminators
442 ):
443 new_paragraphs.append(cur_paragraph.strip())
444 cur_paragraph, cur_paragraph_len = "", 0
445
446 cur_paragraph_len += len(self._tokenizer.encode(sep + paragraph))
447 cur_paragraph += sep + paragraph
448
449 if cur_paragraph:
450 new_paragraphs.append(cur_paragraph.strip())
451
452 return new_paragraphs
453
454 def _split_to_sentences(self, text: str, url_strings: List[str] = []) -> List[str]:
455 # Use capture groups to preserve sentence separators

Callers 1

split_textMethod · 0.95

Calls 1

splitMethod · 0.80

Tested by

no test coverage detected