hub / github.com/unclecode/crawl4ai / run

Method run

crawl4ai/extraction_strategy.py:190–231 · view source on GitHub ↗

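Extract content from the merged sections of a page: sequentially, with a 500 ms delay between calls to ease rate limiting, when the provider starts with "groq/", and in parallel on a thread pool otherwise. Specific to LLMExtractionStrategy.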

(self, url: str, sections: List[str])

Source from the content-addressed store, hash-verified

188
189
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
    """
    Extract content from every merged section of a page.

    The sections are first merged with ``self._merge`` using
    ``self.chunk_token_threshold`` and an overlap of
    ``int(self.chunk_token_threshold * self.overlap_rate)``. Each merged
    section is passed through ``sanitize_input_encode`` and handed to
    ``self.extract(url, index, section)``.

    Providers whose name starts with ``"groq/"`` are processed one section
    at a time with a 500 ms pause after each call (rate limiting). All
    other providers are processed on a pool of four worker threads.

    Args:
        url: URL of the page the sections came from.
        sections: Raw text sections to merge and extract from.

    Returns:
        The concatenated blocks returned by ``self.extract``, in section
        order. In the thread-pool path, a section whose extraction raised
        contributes a single error block (``"error": True``) carrying that
        section's index and the exception message.
    """
    merged_sections = self._merge(
        sections,
        self.chunk_token_threshold,
        overlap=int(self.chunk_token_threshold * self.overlap_rate),
    )
    extract_func = partial(self.extract, url)
    extracted_content = []

    if self.provider.startswith("groq/"):
        # Sequential processing with a delay to stay under the rate limit.
        for ix, section in enumerate(merged_sections):
            extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
            time.sleep(0.5)  # 500 ms delay between each processing
        return extracted_content

    # Parallel processing: submit everything up front so the sections run
    # concurrently, then collect in submission order so the output follows
    # section order instead of whichever thread happens to finish first.
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [
            executor.submit(extract_func, ix, sanitize_input_encode(section))
            for ix, section in enumerate(merged_sections)
        ]

        for ix, future in enumerate(futures):
            try:
                extracted_content.extend(future.result())
            except Exception as e:
                if self.verbose:
                    print(f"Error in thread execution: {e}")
                # Record the failure under the index of the section that
                # actually failed so it can be traced back.
                extracted_content.append({
                    "index": ix,
                    "error": True,
                    "tags": ["error"],
                    "content": str(e)
                })

    return extracted_content
232
233	class CosineStrategy(ExtractionStrategy):
234	def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):

Callers

nothing calls this directly

Calls 2

_mergeMethod · 0.95

sanitize_input_encodeFunction · 0.85

Tested by

no test coverage detected