()
| 8 | from crawl4ai import AsyncWebCrawler |
| 9 | |
async def main():
    """Crawl a fixed list of URLs with one ``arun_many`` call and print a summary.

    For every result the crawler returns, prints either a short report
    (URL, page title, markdown word count, total link count, image count)
    or, when ``result.success`` is falsy, the URL and its error message.
    Each report is terminated by a ``---`` separator line.
    """
    # Initialize the AsyncWebCrawler; the context manager handles setup/teardown.
    async with AsyncWebCrawler(verbose=True) as crawler:
        # List of URLs to crawl
        urls = [
            "https://example.com",
            "https://python.org",
            "https://github.com",
            "https://stackoverflow.com",
            "https://news.ycombinator.com",
        ]

        # Set up crawling parameters
        # NOTE(review): presumably the minimum word count for a content block
        # to be kept by the crawler -- confirm against the crawl4ai docs.
        word_count_threshold = 100

        # Run the crawling process for multiple URLs
        results = await crawler.arun_many(
            urls=urls,
            word_count_threshold=word_count_threshold,
            bypass_cache=True,
            verbose=True,
        )

        # Process the results
        for result in results:
            if not result.success:
                print(f"Failed to crawl: {result.url}")
                print(f"Error: {result.error_message}")
                print("---")
                continue

            # These fields may be None/empty even on a successful crawl (e.g. a
            # page with no extractable content); fall back to empty containers
            # so one sparse page cannot abort the report for the remaining URLs.
            metadata = result.metadata or {}
            markdown = result.markdown or ""
            links = result.links or {}
            media = result.media or {}

            link_count = len(links.get("internal", [])) + len(links.get("external", []))

            print(f"Successfully crawled: {result.url}")
            print(f"Title: {metadata.get('title', 'N/A')}")
            print(f"Word count: {len(markdown.split())}")
            print(f"Number of links: {link_count}")
            print(f"Number of images: {len(media.get('images', []))}")
            print("---")
if __name__ == "__main__":
    # Script entry point: start an event loop and run the crawl to completion.
    # Skipped when the module is imported rather than executed directly.
    asyncio.run(main())
no test coverage detected
searching dependent graphs…