Method _merge

crawl4ai/extraction_strategy.py:143–187 · view source on GitHub ↗

(self, documents, chunk_token_threshold, overlap)

Source from the content-addressed store, hash-verified

141	return blocks
142
143	def _merge(self, documents, chunk_token_threshold, overlap):
144	chunks = []
145	sections = []
146	total_tokens = 0
147
148	# Calculate the total tokens across all documents
149	for document in documents:
150	total_tokens += len(document.split(' ')) * self.word_token_rate
151
152	# Calculate the number of sections needed
153	num_sections = math.floor(total_tokens / chunk_token_threshold)
154	if num_sections < 1:
155	num_sections = 1 # Ensure there is at least one section
156	adjusted_chunk_threshold = total_tokens / num_sections
157
158	total_token_so_far = 0
159	current_chunk = []
160
161	for document in documents:
162	tokens = document.split(' ')
163	token_count = len(tokens) * self.word_token_rate
164
165	if total_token_so_far + token_count <= adjusted_chunk_threshold:
166	current_chunk.extend(tokens)
167	total_token_so_far += token_count
168	else:
169	# Ensure to handle the last section properly
170	if len(sections) == num_sections - 1:
171	current_chunk.extend(tokens)
172	continue
173
174	# Add overlap if specified
175	if overlap > 0 and current_chunk:
176	overlap_tokens = current_chunk[-overlap:]
177	current_chunk.extend(overlap_tokens)
178
179	sections.append(' '.join(current_chunk))
180	current_chunk = tokens
181	total_token_so_far = token_count
182
183	# Add the last chunk
184	if current_chunk:
185	sections.append(' '.join(current_chunk))
186
187	return sections
188
189
190	def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:

runMethod · 0.95

no outgoing calls

no test coverage detected