Method split_text

lpm_kernel/utils.py:142–164 · view source on GitHub ↗

Split incoming text and return chunks.

(self, text: str)

Source from the content-addressed store, hash-verified

140	self._disallowed_special = disallowed_special
141
def split_text(self, text: str) -> List[str]:
    """Split incoming text into overlapping, token-bounded chunks.

    The text is passed through ``text_filter``, encoded with
    ``self._tokenizer`` and cut into windows of at most
    ``self._chunk_size`` tokens. Consecutive windows start
    ``self._chunk_size - self._chunk_overlap`` tokens apart. Every window
    is decoded, stripped and trimmed with ``_cut_meaningless_head_tail``;
    windows that end up empty are dropped.

    Args:
        text: Raw input text.

    Returns:
        The non-empty chunk strings, in the order they occur in ``text``.

    Raises:
        ValueError: If the input needs more than one window but
            ``self._chunk_overlap`` is not smaller than
            ``self._chunk_size`` (the window could never advance).
    """
    # Filter content with a large number of whitespace characters in the
    # input text to increase the proportion of effective content within
    # chunks.
    text = text_filter(text)
    splits = []
    input_ids = self._tokenizer.encode(
        text,
        allowed_special=self._allowed_special,
        disallowed_special=self._disallowed_special,
    )

    total = len(input_ids)
    stride = self._chunk_size - self._chunk_overlap
    start_idx = 0
    while start_idx < total:
        cur_idx = min(start_idx + self._chunk_size, total)
        chunk_ids = input_ids[start_idx:cur_idx]
        s = self._tokenizer.decode(chunk_ids).strip()
        if s:
            s = self._cut_meaningless_head_tail(s)
            if s:
                splits.append(s)
        if cur_idx == total:
            # This window already reaches the last token. Any further
            # window would hold only overlap tokens that the chunk just
            # processed already covers, so stop instead of emitting
            # redundant tail chunks.
            break
        if stride <= 0:
            # A non-positive step would keep start_idx in place (or move it
            # backwards) and loop forever; fail loudly instead of hanging.
            raise ValueError(
                "chunk_overlap must be smaller than chunk_size "
                f"(chunk_size={self._chunk_size}, "
                f"chunk_overlap={self._chunk_overlap})"
            )
        start_idx += stride
    logging.debug("finished split_text(): %s splits", len(splits))
    return splits
165
166	def _cut_meaningless_head_tail(self, text: str) -> str:
167	# Only split when there are multiple newlines, as parsing of PDF/Word often contains false newlines

_insighter_docMethod · 0.95

splitMethod · 0.45

_cut_meaningless_head_tailMethod · 0.95

text_filterFunction · 0.85

no test coverage detected