| 90 | |
| 91 | |
def crawl(url, max_pages=50, url_ext=()):
    """Crawl breadth-first from ``url`` and collect the links found on the way.

    The start address is first resolved with ``get_redirect_url``. Each page
    taken from the queue is parsed by a fresh ``LinkParser``; every group of
    links it reports is merged into the result, and its ``'url'`` links are
    queued for a later visit. A URL is queued at most once, so pages that
    link to each other are not fetched again.

    Args:
        url: Address to start crawling from.
        max_pages: Budget on the running total of ``'url'`` links reported by
            the pages fetched so far. It is checked before each fetch, so the
            total can overshoot by the links of the last page.
        url_ext: Passed through unchanged to ``LinkParser.get_links``.

    Returns:
        dict with the keys ``'url'``, ``'js'`` and ``'img'``, each mapping to
        the set of links collected under that key.
    """
    from collections import deque

    start_url = get_redirect_url(url)
    queue = deque([start_url])  # FIFO order; popleft() is O(1), list.pop(0) is O(n)
    queued = {start_url}        # every URL ever put on the queue
    pages_count = 0
    urls = {
        'url': set(),
        'js': set(),
        'img': set(),
    }

    while pages_count < max_pages and queue:
        page_url = queue.popleft()
        try:
            links = LinkParser().get_links(page_url, url_ext=url_ext)
            for kind, found in links.items():
                urls[kind].update(found)

            page_links = links['url']
            pages_count += len(page_links)
            # Queue only URLs not seen before; re-fetching a known page
            # wastes a request and burns the budget on duplicate links.
            for link in page_links:
                if link not in queued:
                    queued.add(link)
                    queue.append(link)
        except Exception as ex:
            # Best effort: a page that cannot be fetched or parsed is logged
            # and skipped so the rest of the crawl continues.
            logger.error(ex)

    return urls
| 116 | |
| 117 | |
| 118 | if __name__ == '__main__': |