MCPcopy
hub / github.com/mindverse/Second-Me / split_text

Method split_text

lpm_kernel/utils.py:142–164  ·  view source on GitHub ↗

Split incoming text and return chunks.

(self, text: str)

Source from the content-addressed store, hash-verified

140 self._disallowed_special = disallowed_special
141
def split_text(self, text: str) -> List[str]:
    """Split incoming text into overlapping, token-bounded chunks.

    The text is passed through ``text_filter``, encoded with the instance
    tokenizer, and cut into windows of at most ``self._chunk_size`` tokens.
    Consecutive windows start ``self._chunk_size - self._chunk_overlap``
    tokens apart. Each window is decoded, stripped, and passed through
    ``self._cut_meaningless_head_tail``; windows that end up empty are
    dropped.

    Args:
        text: The raw text to split.

    Returns:
        The non-empty chunk strings, in the order their windows occur.

    Raises:
        ValueError: If there is at least one token to split and
            ``self._chunk_size - self._chunk_overlap`` is not positive,
            because the window could then never advance.
    """
    # Filter content with a large number of whitespace characters in the input text to increase the proportion of effective content within chunks
    text = text_filter(text)
    splits = []
    input_ids = self._tokenizer.encode(
        text,
        allowed_special=self._allowed_special,
        disallowed_special=self._disallowed_special,
    )
    total = len(input_ids)

    # A non-positive stride would keep start_idx below total forever, so
    # fail loudly instead of looping without end. Empty input never enters
    # the loop, so it keeps returning an empty list.
    stride = self._chunk_size - self._chunk_overlap
    if total and stride <= 0:
        raise ValueError(
            "chunk_overlap (%s) must be smaller than chunk_size (%s)"
            % (self._chunk_overlap, self._chunk_size)
        )

    # NOTE(review): when the overlap is non-zero, the last window can lie
    # entirely inside the previous one; kept as-is to preserve output.
    start_idx = 0
    while start_idx < total:
        cur_idx = min(start_idx + self._chunk_size, total)
        chunk_ids = input_ids[start_idx:cur_idx]
        s = self._tokenizer.decode(chunk_ids).strip()
        if s:
            s = self._cut_meaningless_head_tail(s)
            # Trimming may leave nothing behind; skip such chunks.
            if s:
                splits.append(s)
        start_idx += stride
    logging.debug("finished split_text(): %s splits", len(splits))
    return splits
165
166 def _cut_meaningless_head_tail(self, text: str) -> str:
167 # Only split when there are multiple newlines, as parsing of PDF/Word often contains false newlines

Callers 2

_insighter_docMethod · 0.95
splitMethod · 0.45

Calls 2

text_filterFunction · 0.85

Tested by

no test coverage detected