Method split_text

lpm_kernel/utils.py:370–405 · view source on GitHub ↗

(self, text: str)

Source from the content-addressed store, hash-verified

368	self._disallowed_special = disallowed_special
369
def split_text(self, text: str) -> List[str]:
    """Split ``text`` into a flat list of chunks, working paragraph by paragraph.

    The text is first passed through ``text_filter`` (whitespace clean-up,
    e.g. collapsing 3 or more consecutive newlines into two) and then
    through ``encode_urls``, which swaps URLs for placeholder strings so
    that characters such as ``.``, ``/`` and ``?`` inside URLs do not
    interfere with sentence splitting. The encoded text is divided into
    paragraphs, each paragraph is divided into chunks, and the URL
    placeholders are decoded back in every chunk before returning.

    Args:
        text: The raw text to split.

    Returns:
        The chunks of all paragraphs, in paragraph order, with the
        original URLs restored.
    """
    # Normalise abnormal whitespace before any splitting happens.
    cleaned = text_filter(text)

    # Hide URLs behind placeholders; the mapping is kept for decoding later.
    encoded, string2url_dict = encode_urls(cleaned)
    placeholders = list(string2url_dict.keys())

    # Paragraph splitting uses half of the configured chunk size as the
    # minimum paragraph length.
    paragraphs = self._split_to_paragraphs(
        encoded, min_paragraph_length=self._chunk_size // 2
    )
    total = len(paragraphs)

    encoded_chunks = []
    for position, paragraph in enumerate(paragraphs, start=1):
        pieces = self._split_to_chunks(paragraph, placeholders)
        logging.debug(
            "paragraph %s/%s %s characters: %s",
            position,
            total,
            len(paragraph),
            paragraph,
        )
        logging.debug(
            "paragraph %s/%s split into %s chunks: %s",
            position,
            total,
            len(pieces),
            pieces,
        )
        encoded_chunks += pieces

    # Restore the real URLs only after every paragraph has been chunked.
    return [decode_urls(piece, string2url_dict) for piece in encoded_chunks]
406
407	def _split_to_chunks(self, text: str, url_strings: List[str] = []) -> List[str]:
408	sentences = self._split_to_sentences(text, url_strings)

_summarize_title_abstract_keywordsMethod · 0.95

_split_to_paragraphsMethod · 0.95

_split_to_chunksMethod · 0.95

text_filterFunction · 0.85

encode_urlsFunction · 0.85

decode_urlsFunction · 0.85

no test coverage detected