MCPcopy
hub / github.com/unclecode/crawl4ai / run

Method run

crawl4ai/extraction_strategy.py:190–231  ·  view source on GitHub ↗

Process the merged sections for LLMExtractionStrategy: sequentially, with a 500 ms delay after each call, for rate-limited `groq/` providers, and in parallel on a thread pool for all other providers.

(self, url: str, sections: List[str])

Source from the content-addressed store, hash-verified

188
189
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
    """
    Extract content from every section of a page, throttling when required.

    The sections are first merged into chunks by ``self._merge`` using
    ``self.chunk_token_threshold`` and an overlap of
    ``int(self.chunk_token_threshold * self.overlap_rate)``. Each chunk is
    passed through ``sanitize_input_encode`` and handed to ``self.extract``
    together with the URL and the chunk's index.

    * Providers whose name starts with ``"groq/"`` are processed one chunk
      at a time, with a 500 ms pause after each call, to cope with rate
      limiting.
    * All other providers are processed concurrently on a pool of four
      worker threads. Results are gathered in completion order, so they
      are not guaranteed to follow chunk order.

    Args:
        url: Forwarded unchanged as the first argument of ``self.extract``.
        sections: Text sections to merge into chunks and extract from.

    Returns:
        A flat list built by extending with whatever each ``self.extract``
        call returns. In the concurrent path, a chunk whose extraction
        raised contributes a single error record instead, of the form
        ``{"index": <chunk index>, "error": True, "tags": ["error"],
        "content": <exception message>}``.

    Raises:
        Exception: In the sequential (``groq/``) path, anything raised by
            ``self.extract`` propagates to the caller.
    """
    merged_sections = self._merge(
        sections,
        self.chunk_token_threshold,
        overlap=int(self.chunk_token_threshold * self.overlap_rate),
    )
    # Bind the URL once; every chunk is extracted against the same page.
    extract_func = partial(self.extract, url)
    extracted_content = []

    if self.provider.startswith("groq/"):
        # Sequential processing with a delay to stay under the rate limit.
        for ix, section in enumerate(merged_sections):
            extracted_content.extend(
                extract_func(ix, sanitize_input_encode(section))
            )
            time.sleep(0.5)  # 500 ms delay between each processing
        return extracted_content

    # Parallel processing using ThreadPoolExecutor.
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Remember which chunk each future belongs to: as_completed() yields
        # futures in completion order, so the index cannot be recovered from
        # the iteration position.
        future_to_index = {
            executor.submit(extract_func, ix, sanitize_input_encode(section)): ix
            for ix, section in enumerate(merged_sections)
        }

        for future in as_completed(future_to_index):
            try:
                extracted_content.extend(future.result())
            except Exception as e:
                if self.verbose:
                    print(f"Error in thread execution: {e}")
                # Record the failure against the chunk that actually failed
                # rather than a hard-coded index.
                extracted_content.append({
                    "index": future_to_index[future],
                    "error": True,
                    "tags": ["error"],
                    "content": str(e),
                })

    return extracted_content
232
233class CosineStrategy(ExtractionStrategy):
234 def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):

Callers

nothing calls this directly

Calls 2

_mergeMethod · 0.95
sanitize_input_encodeFunction · 0.85

Tested by

no test coverage detected