Method _split_to_paragraphs

lpm_kernel/utils.py:414–452 · view source on GitHub ↗

Currently splits the original document into paragraphs directly, based on the \n[any whitespace]\n rule (a line break, optional whitespace, then another line break).

(
        self, text: str, min_paragraph_length: int = 0
    )

Source from the content-addressed store, hash-verified

412	return chunks
413
def _split_to_paragraphs(
    self, text: str, min_paragraph_length: int = 0
) -> List[str]:
    r"""Split ``text`` into paragraphs, merging short or unterminated ones.

    A paragraph boundary follows the ``\n[any space]\n`` rule: one or more
    line-break characters, optional whitespace characters, then another
    line-break character. The character sets are taken from
    ``self.line_break_characters`` and ``self.whitespace_characters``.

    After splitting, consecutive pieces are accumulated and only emitted
    once the accumulated text (a) has at least ``min_paragraph_length``
    tokens and (b) ends with one of ``self.sentence_terminators``.
    Anything left in the accumulator at the end is emitted as the last
    paragraph.

    Args:
        text: The document to split.
        min_paragraph_length: Minimum size, in tokens as counted by
            ``len(self._tokenizer.encode(...))``, the accumulated
            paragraph must reach before it may be emitted.

    Returns:
        The paragraphs in document order, each stripped of surrounding
        whitespace. Empty when ``text`` has no non-whitespace content.
    """
    # NOTE: the characters are interpolated unescaped into regex character
    # classes, so they must be valid inside ``[...]``.
    line_break_characters = "".join(self.line_break_characters)
    whitespace_characters = "".join(self.whitespace_characters)

    # The repetition sits in a non-capturing group *inside* the capture.
    # With ``(X)+`` re.split would only return the last repetition of X and
    # silently drop the rest of a multi-part separator run.
    pieces = re.split(
        f"((?:[{line_break_characters}]+[{whitespace_characters}]*[{line_break_characters}])+)",
        text,
    )
    # re.split with one capture group yields text, sep, text, ...; prepend
    # an empty separator so the list pairs up as (separator, text).
    if len(pieces) % 2 == 1:
        pieces = [""] + pieces
    pairs = [
        (pieces[i], pieces[i + 1])
        for i in range(0, len(pieces), 2)
        if (pieces[i] + pieces[i + 1]).strip()
    ]

    if not pairs:
        return []

    # Hoisted loop invariants; str.endswith accepts a tuple of suffixes.
    terminators = tuple(self.sentence_terminators)
    encode = self._tokenizer.encode

    new_paragraphs = []
    cur_paragraph, cur_paragraph_len = "", 0

    # merge short or broken paragraphs
    for sep, paragraph in pairs:
        # Check the terminator on the right-stripped text: paragraphs are
        # emitted stripped, so trailing whitespace before the blank line
        # must not make a finished paragraph look broken.
        if cur_paragraph_len >= min_paragraph_length and (
            cur_paragraph.rstrip().endswith(terminators)
        ):
            new_paragraphs.append(cur_paragraph.strip())
            cur_paragraph, cur_paragraph_len = "", 0

        cur_paragraph_len += len(encode(sep + paragraph))
        cur_paragraph += sep + paragraph

    if cur_paragraph:
        new_paragraphs.append(cur_paragraph.strip())

    return new_paragraphs
453
454	def _split_to_sentences(self, text: str, url_strings: List[str] = []) -> List[str]:
455	# Use capture groups to preserve sentence separators

split_textMethod · 0.95

splitMethod · 0.80

no test coverage detected