MCPcopy
hub / github.com/mindverse/Second-Me / split_text

Method split_text

lpm_kernel/utils.py:370–405  ·  view source on GitHub ↗
(self, text: str)

Source from the content-addressed store, hash-verified

368 self._disallowed_special = disallowed_special
369
def split_text(self, text: str) -> List[str]:
    """Split ``text`` into a flat list of chunks, paragraph by paragraph.

    The text is first passed through ``text_filter``, then URLs are
    swapped for placeholder strings via ``encode_urls`` so that URL
    punctuation does not interfere with sentence splitting. The encoded
    text is divided into paragraphs, each paragraph is divided into
    chunks, and finally the placeholders in every chunk are restored
    with ``decode_urls``.

    Args:
        text: The raw text to split.

    Returns:
        The chunks, in paragraph order, with URLs restored.
    """
    # Clean up abnormal whitespace first (e.g. 3+ consecutive "\n" -> "\n\n").
    cleaned = text_filter(text)

    # Hide URLs behind placeholders; keep the mapping to undo this later.
    encoded, string2url_dict = encode_urls(cleaned)
    url_strings = list(string2url_dict.keys())

    # Paragraph splitting uses half of the configured chunk size as the
    # minimum paragraph length.
    min_length = self._chunk_size // 2
    paragraphs = self._split_to_paragraphs(
        encoded, min_paragraph_length=min_length
    )

    encoded_chunks = []
    for position, paragraph in enumerate(paragraphs, start=1):
        pieces = self._split_to_chunks(paragraph, url_strings)
        total = len(paragraphs)
        logging.debug(
            "paragraph %s/%s %s characters: %s",
            position,
            total,
            len(paragraph),
            paragraph,
        )
        logging.debug(
            "paragraph %s/%s split into %s chunks: %s",
            position,
            total,
            len(pieces),
            pieces,
        )
        encoded_chunks.extend(pieces)

    # Put the original URLs back into every chunk.
    return [decode_urls(piece, string2url_dict) for piece in encoded_chunks]
406
407 def _split_to_chunks(self, text: str, url_strings: List[str] = []) -> List[str]:
408 sentences = self._split_to_sentences(text, url_strings)

Callers 1

Calls 5

_split_to_paragraphsMethod · 0.95
_split_to_chunksMethod · 0.95
text_filterFunction · 0.85
encode_urlsFunction · 0.85
decode_urlsFunction · 0.85

Tested by

no test coverage detected