This function crawls the entire domain recursively (deep crawl).
(url_base)
| 432 | # CRAWL URL |
| 433 | ########### |
| 434 | def crawl_url(url_base): |
| 435 | """ |
| 436 | This function crawl the entire domain recursively. Crawl in deep. |
| 437 | """ |
| 438 | global debug |
| 439 | global URL |
| 440 | global linkregex |
| 441 | global main_domain |
| 442 | global accept_domain |
| 443 | global host_name |
| 444 | global crawled |
| 445 | global externals |
| 446 | global link_to_files |
| 447 | global allfiles |
| 448 | global extensions |
| 449 | global emails |
| 450 | global verbose |
| 451 | global url_scheme |
| 452 | global link_full_path |
| 453 | global sub_domains |
| 454 | global error_codes |
| 455 | |
| 456 | #Variables |
| 457 | response="" |
| 458 | link="" |
| 459 | url_to_crawl="" |
| 460 | url="" |
| 461 | email="" |
| 462 | link_full_path="" |
| 463 | crawled_url = "" |
| 464 | request_web="" |
| 465 | opener_web="" |
| 466 | response="" |
| 467 | |
| 468 | # Maybe this better have to be in main section. But if this function is used externaly it can cause a problem. |
| 469 | extensions_recognized="rss,xsl,xml,msi,vbs,db,asc,js,sql,rar,mdb,jar,mpg,sty,dat,f,c,h,cnf,flv,wma,swf,py,bz2,7z,css,ico,avi,mkv,doc,ppt,pps,xls,docx,pptx,ppsx,xlsx,sxw,sxc,sxi,odt,ods,odg,odp,pdf,wpd,txt,gnumeric,csv,asc,sql,rar,mdb,jar,mp3,sty,jpg,jpeg,png,gif,exe,py,zip,tar,gz,bz,bmp" |
| 470 | for i in extensions_recognized.split(','): |
| 471 | extensions.append('.'+i.lower()) |
| 472 | extensions.append('.'+i.upper()) |
| 473 | ###### |
| 474 | #Program |
| 475 | ###### |
| 476 | #URL.append(url_base) |
| 477 | |
| 478 | try: |
| 479 | # Here we extract the complete URL to crawl. Commonly it has the form: http://www.xxxxx.com |
| 480 | #url_to_crawl = URL[0] |
| 481 | url_to_crawl = url_base |
| 482 | try: |
| 483 | URL.remove(url_to_crawl) |
| 484 | except: |
| 485 | pass |
| 486 | |
| 487 | print('\n\t\t\t+ Crawling {0}'.format(url_to_crawl), end=' ') |
| 488 | |
| 489 | # We parse the URL to identify domains and paths of the URL |
| 490 | url = urllib.parse.urlparse(url_to_crawl) |
| 491 |
no test coverage detected