hub / github.com/unclecode/crawl4ai / crawl_urls

Function crawl_urls

main.py:193–239 · view source on GitHub ↗

(crawl_request: CrawlRequest, request: Request)

Source from the content-addressed store, hash-verified

@app.post("/crawl")
@limiter.limit(get_rate_limit())
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
    """Crawl every URL in the request on worker threads and return the results.

    Args:
        crawl_request: Request payload; supplies the URLs, the extraction and
            chunking strategy names with their keyword arguments, and the
            crawler options forwarded to ``get_crawler().run``.
        request: Incoming HTTP request. Not read in the body; presumably
            required by the ``limiter.limit`` decorator -- TODO confirm.

    Returns:
        ``{"results": [...]}`` holding one ``model_dump()`` per URL, in the
        same order as ``crawl_request.urls``.

    Raises:
        HTTPException: 429 when ``current_requests`` has already reached
            ``MAX_CONCURRENT_REQUESTS``.
        Exception: the first failure (in URL order) raised by a crawl, once
            every crawl of this request has finished.
    """
    logging.debug("[LOG] Crawl request for URL: %s", crawl_request.urls)
    global current_requests
    # Check-and-increment under the lock so the cap cannot be overshot.
    async with lock:
        if current_requests >= MAX_CONCURRENT_REQUESTS:
            raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
        current_requests += 1

    try:
        logging.debug("[LOG] Loading extraction and chunking strategies...")
        # Force verbose on copies so the incoming request model is not mutated.
        extraction_args = {**crawl_request.extraction_strategy_args, "verbose": True}
        chunking_args = {**crawl_request.chunking_strategy_args, "verbose": True}

        extraction_strategy = import_strategy(
            "crawl4ai.extraction_strategy",
            crawl_request.extraction_strategy,
            **extraction_args,
        )
        chunking_strategy = import_strategy(
            "crawl4ai.chunking_strategy",
            crawl_request.chunking_strategy,
            **chunking_args,
        )

        # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
        logging.debug("[LOG] Running the WebCrawler...")
        with ThreadPoolExecutor() as executor:
            # Inside a coroutine the running loop is the one we want.
            loop = asyncio.get_running_loop()
            # run_in_executor only forwards positional arguments, hence the
            # positional call into the crawler's run().
            futures = [
                loop.run_in_executor(
                    executor,
                    get_crawler().run,
                    str(url),
                    crawl_request.word_count_threshold,
                    extraction_strategy,
                    chunking_strategy,
                    crawl_request.bypass_cache,
                    crawl_request.css_selector,
                    crawl_request.screenshot,
                    crawl_request.user_agent,
                    crawl_request.verbose,
                )
                for url in crawl_request.urls
            ]
            # Collect failures instead of raising immediately: leaving the
            # ``with`` block calls shutdown(wait=True), which would block the
            # event loop while the remaining crawls are still running.
            outcomes = await asyncio.gather(*futures, return_exceptions=True)

        # Every worker is done now; surface the first failure, if any.
        for outcome in outcomes:
            if isinstance(outcome, BaseException):
                raise outcome
        results = outcomes

        # if include_raw_html is False, remove the raw HTML content from the results
        if not crawl_request.include_raw_html:
            for result in results:
                result.html = None

        return {"results": [result.model_dump() for result in results]}
    finally:
        # Always release the slot taken above, on success or failure.
        async with lock:
            current_requests -= 1
240
241	@app.get("/strategies/extraction", response_class=JSONResponse)
242	async def get_extraction_strategies():

Callers

nothing calls this directly

Calls 2

import_strategy — Function · 0.85

get_crawler — Function · 0.85

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…