MCPcopy
hub / github.com/unclecode/crawl4ai / _merge

Method _merge

crawl4ai/extraction_strategy.py:143–187  ·  view source on GitHub ↗
(self, documents, chunk_token_threshold, overlap)

Source from the content-addressed store, hash-verified

141 return blocks
142
def _merge(self, documents, chunk_token_threshold, overlap):
    """Merge ``documents`` into space-joined sections of similar token size.

    A document's token count is estimated as
    ``len(document.split(' ')) * self.word_token_rate``. The number of
    sections is ``floor(total_tokens / chunk_token_threshold)`` (at least
    one) and the per-section budget is the total spread evenly over that
    many sections, so sections come out balanced rather than leaving a
    small remainder at the end. Documents are never split: each one lands
    whole in exactly one section, and once the final section has been
    reached every remaining document is appended to it.

    Args:
        documents: Strings to merge, in the order they should appear.
        chunk_token_threshold: Target token budget per section. A
            non-positive value puts everything into a single section.
        overlap: Number of trailing words of a finished section to repeat
            at the start of the following section; ``0`` disables overlap.
            Overlap words are not charged against the section budget.

    Returns:
        A list of section strings; empty when ``documents`` is empty.
    """
    rate = self.word_token_rate
    # Split each document once and reuse the words in both passes.
    tokenized = [document.split(' ') for document in documents]

    # Plain left-to-right accumulation so this total matches the running
    # totals computed in the second pass.
    total_tokens = 0
    for tokens in tokenized:
        total_tokens += len(tokens) * rate

    # Number of sections needed; a non-positive threshold cannot be divided
    # by meaningfully, so fall back to one section instead of raising.
    if chunk_token_threshold > 0:
        num_sections = max(1, math.floor(total_tokens / chunk_token_threshold))
    else:
        num_sections = 1
    adjusted_chunk_threshold = total_tokens / num_sections

    sections = []
    current_chunk = []
    tokens_in_chunk = 0

    for tokens in tokenized:
        token_count = len(tokens) * rate

        if tokens_in_chunk + token_count <= adjusted_chunk_threshold:
            current_chunk.extend(tokens)
            tokens_in_chunk += token_count
            continue

        # The last section absorbs everything that is left.
        if len(sections) == num_sections - 1:
            current_chunk.extend(tokens)
            continue

        if not current_chunk:
            # The very first document is already over budget: there is
            # nothing to flush, so do not emit an empty section.
            current_chunk = list(tokens)
            tokens_in_chunk = token_count
            continue

        sections.append(' '.join(current_chunk))

        # Seed the next section with the tail of the one just closed so
        # that neighbouring sections share context.
        carried = current_chunk[-overlap:] if overlap > 0 else []
        current_chunk = carried + tokens
        tokens_in_chunk = token_count

    # Add the last chunk
    if current_chunk:
        sections.append(' '.join(current_chunk))

    return sections
188
189
190 def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:

Callers 1

runMethod · 0.95

Calls

no outgoing calls

Tested by

no test coverage detected