hub / github.com/eldraco/domain_analyzer / crawl_url

Function crawl_url

crawler.py:434–644 · view source on GitHub ↗

This function crawls the entire domain recursively (an in-depth crawl).

(url_base)

Source from the content-addressed store, hash-verified

432	# CRAWL URL
433	###########
434	def crawl_url(url_base):
435	"""
436	This function crawl the entire domain recursively. Crawl in deep.
437	"""
438	global debug
439	global URL
440	global linkregex
441	global main_domain
442	global accept_domain
443	global host_name
444	global crawled
445	global externals
446	global link_to_files
447	global allfiles
448	global extensions
449	global emails
450	global verbose
451	global url_scheme
452	global link_full_path
453	global sub_domains
454	global error_codes
455
456	#Variables
457	response=""
458	link=""
459	url_to_crawl=""
460	url=""
461	email=""
462	link_full_path=""
463	crawled_url = ""
464	request_web=""
465	opener_web=""
466	response=""
467
468	# Maybe this better have to be in main section. But if this function is used externaly it can cause a problem.
469	extensions_recognized="rss,xsl,xml,msi,vbs,db,asc,js,sql,rar,mdb,jar,mpg,sty,dat,f,c,h,cnf,flv,wma,swf,py,bz2,7z,css,ico,avi,mkv,doc,ppt,pps,xls,docx,pptx,ppsx,xlsx,sxw,sxc,sxi,odt,ods,odg,odp,pdf,wpd,txt,gnumeric,csv,asc,sql,rar,mdb,jar,mp3,sty,jpg,jpeg,png,gif,exe,py,zip,tar,gz,bz,bmp"
470	for i in extensions_recognized.split(','):
471	extensions.append('.'+i.lower())
472	extensions.append('.'+i.upper())
473	######
474	#Program
475	######
476	#URL.append(url_base)
477
478	try:
479	# Here we extract the complete URL to crawl. Commonly it has the form: http://www.xxxxx.com
480	#url_to_crawl = URL[0]
481	url_to_crawl = url_base
482	try:
483	URL.remove(url_to_crawl)
484	except:
485	pass
486
487	print('\n\t\t\t+ Crawling {0}'.format(url_to_crawl), end=' ')
488
489	# We parse the URL to identify domains and paths of the URL
490	url = urllib.parse.urlparse(url_to_crawl)
491

Callers 2

crawl_siteFunction · 0.85

directory_indexingFunction · 0.85

Calls 3

verify_linkFunction · 0.85

formatMethod · 0.80

readMethod · 0.80

Tested by

no test coverage detected