MCPcopy
hub / github.com/knownsec/pocsuite3 / crawl

Function crawl

pocsuite3/modules/spider/__init__.py:92–115  ·  view source on GitHub ↗
(url, max_pages=50, url_ext=())

Source from the content-addressed store, hash-verified

90
91
def crawl(url, max_pages=50, url_ext=()):
    """Crawl breadth-first from *url* and collect the links that are found.

    The start address is first resolved through ``get_redirect_url``. Each
    queued page is then handed to ``LinkParser.get_links`` and the links it
    reports are merged into the result. A page address is queued at most
    once, so pages that link back to each other are not fetched again.

    Args:
        url: Address to start crawling from.
        max_pages: Crawl budget. The counter advances by the number of
            ``'url'`` links each fetched page reports (not by one per
            fetched page); crawling stops once that running total reaches
            ``max_pages`` or the queue runs dry.
        url_ext: Forwarded unchanged to ``LinkParser.get_links``.

    Returns:
        dict: ``{'url': set, 'js': set, 'img': set}`` with every link
        collected, grouped by category.
    """
    # Local import: the module's import block is not visible from this
    # function, so the dependency is kept self-contained here.
    from collections import deque

    start_url = get_redirect_url(url)
    urls = {
        'url': set(),
        'js': set(),
        'img': set(),
    }
    pending = deque([start_url])
    # Every address ever queued; prevents fetching the same page twice.
    # Assumes addresses are hashable (they are already stored in sets above).
    seen = {start_url}
    pages_count = 0

    while pages_count < max_pages and pending:
        page = pending.popleft()
        try:
            links = LinkParser().get_links(page, url_ext=url_ext)
            for kind, found in links.items():
                urls[kind].update(found)

            page_links = links['url']
            pages_count += len(page_links)
            for link in page_links:
                if link not in seen:
                    seen.add(link)
                    pending.append(link)
        except Exception as ex:
            # Best-effort crawl: a page that fails is logged and skipped so
            # the remaining queue is still processed.
            logger.error(ex)

    return urls
116
117
118if __name__ == '__main__':

Callers 2

test_import_run (Method) · 0.85
__init__.py (File) · 0.85

Calls 4

get_links (Method) · 0.95
get_redirect_url (Function) · 0.85
LinkParser (Class) · 0.85
error (Method) · 0.80

Tested by 1

test_import_run (Method) · 0.68