MCPcopy
hub / github.com/unclecode/crawl4ai / crawl_urls

Function crawl_urls

main.py:193–239  ·  view source on GitHub ↗
(crawl_request: CrawlRequest, request: Request)

Source from the content-addressed store, hash-verified

@app.post("/crawl")
@limiter.limit(get_rate_limit())
async def crawl_urls(crawl_request: CrawlRequest, request: Request):
    """Crawl every URL in the request concurrently and return the results.

    Each URL is handed to the synchronous crawler (``get_crawler().run``) on
    a worker thread so the event loop stays free while crawls are running.

    Args:
        crawl_request: Request payload; supplies the URLs, the extraction and
            chunking strategy names with their argument dicts, and the
            per-crawl options forwarded positionally to the crawler.
        request: The incoming HTTP request. Not read in this body; presumably
            required by the rate-limit decorator -- confirm before removing.

    Returns:
        ``{"results": [...]}`` with one ``model_dump()`` dict per URL, in the
        same order as ``crawl_request.urls``.

    Raises:
        HTTPException: 429 when ``MAX_CONCURRENT_REQUESTS`` requests are
            already in flight.
        Exception: Any error raised while loading a strategy or by a crawl is
            propagated. When several crawls fail, the failure of the earliest
            URL in the list is the one raised.
    """
    logging.debug(f"[LOG] Crawl request for URL: {crawl_request.urls}")
    global current_requests
    # Admission control: check and bump the in-flight counter atomically.
    async with lock:
        if current_requests >= MAX_CONCURRENT_REQUESTS:
            raise HTTPException(status_code=429, detail="Too many requests - please try again later.")
        current_requests += 1

    try:
        logging.debug("[LOG] Loading extraction and chunking strategies...")
        # Strategies are always built in verbose mode, whatever the client sent.
        crawl_request.extraction_strategy_args['verbose'] = True
        crawl_request.chunking_strategy_args['verbose'] = True

        extraction_strategy = import_strategy("crawl4ai.extraction_strategy", crawl_request.extraction_strategy, **crawl_request.extraction_strategy_args)
        chunking_strategy = import_strategy("crawl4ai.chunking_strategy", crawl_request.chunking_strategy, **crawl_request.chunking_strategy_args)

        # Use ThreadPoolExecutor to run the synchronous WebCrawler in async manner
        logging.debug("[LOG] Running the WebCrawler...")
        with ThreadPoolExecutor() as executor:
            # We are inside a coroutine, so a loop is guaranteed to be running.
            loop = asyncio.get_running_loop()
            futures = [
                loop.run_in_executor(
                    executor,
                    get_crawler().run,
                    str(url),
                    crawl_request.word_count_threshold,
                    extraction_strategy,
                    chunking_strategy,
                    crawl_request.bypass_cache,
                    crawl_request.css_selector,
                    crawl_request.screenshot,
                    crawl_request.user_agent,
                    crawl_request.verbose
                )
                for url in crawl_request.urls
            ]
            # Collect failures instead of raising on the first one: leaving
            # the ``with`` block joins the worker threads synchronously, so
            # bailing out early would freeze the event loop until every
            # remaining crawl had finished. Awaiting all of them here keeps
            # the loop responsive and makes the executor exit immediate.
            outcomes = await asyncio.gather(*futures, return_exceptions=True)

        # Surface the first failure (in URL order) now that no thread is running.
        for outcome in outcomes:
            if isinstance(outcome, BaseException):
                raise outcome
        results = outcomes

        # if include_raw_html is False, remove the raw HTML content from the results
        if not crawl_request.include_raw_html:
            for result in results:
                result.html = None

        return {"results": [result.model_dump() for result in results]}
    finally:
        # Always release the admission slot, on success and on failure alike.
        async with lock:
            current_requests -= 1
240
241@app.get("/strategies/extraction", response_class=JSONResponse)
242async def get_extraction_strategies():

Callers

nothing calls this directly

Calls 2

import_strategy — Function · 0.85
get_crawler — Function · 0.85

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…