Takes the source's lxml root and the source URL, extracts the domain, and finds all of the top-level URLs, which are assumed to be the category URLs. Example: cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia]
(self, source_url, doc)
| 511 | return self._get_urls(doc, titles) |
| 512 | |
| 513 | def get_category_urls(self, source_url, doc): |
| 514 | """Inputs source lxml root and source url, extracts domain and |
| 515 | finds all of the top level urls, we are assuming that these are |
| 516 | the category urls. |
| 517 | cnn.com --> [cnn.com/latest, world.cnn.com, cnn.com/asia] |
| 518 | """ |
| 519 | page_urls = self.get_urls(doc) |
| 520 | valid_categories = [] |
| 521 | for p_url in page_urls: |
| 522 | scheme = urls.get_scheme(p_url, allow_fragments=False) |
| 523 | domain = urls.get_domain(p_url, allow_fragments=False) |
| 524 | path = urls.get_path(p_url, allow_fragments=False) |
| 525 | |
| 526 | if not domain and not path: |
| 527 | if self.config.verbose: |
| 528 | print('elim category url %s for no domain and path' |
| 529 | % p_url) |
| 530 | continue |
| 531 | if path and path.startswith('#'): |
| 532 | if self.config.verbose: |
| 533 | print('elim category url %s path starts with #' % p_url) |
| 534 | continue |
| 535 | if scheme and (scheme != 'http' and scheme != 'https'): |
| 536 | if self.config.verbose: |
| 537 | print(('elim category url %s for bad scheme, ' |
| 538 | 'not http nor https' % p_url)) |
| 539 | continue |
| 540 | |
| 541 | if domain: |
| 542 | child_tld = tldextract.extract(p_url) |
| 543 | domain_tld = tldextract.extract(source_url) |
| 544 | child_subdomain_parts = child_tld.subdomain.split('.') |
| 545 | subdomain_contains = False |
| 546 | for part in child_subdomain_parts: |
| 547 | if part == domain_tld.domain: |
| 548 | if self.config.verbose: |
| 549 | print(('subdomain contains at %s and %s' % |
| 550 | (str(part), str(domain_tld.domain)))) |
| 551 | subdomain_contains = True |
| 552 | break |
| 553 | |
| 554 | # Ex. microsoft.com is definitely not related to |
| 555 | # espn.com, but espn.go.com is probably related to espn.com |
| 556 | if not subdomain_contains and \ |
| 557 | (child_tld.domain != domain_tld.domain): |
| 558 | if self.config.verbose: |
| 559 | print(('elim category url %s for domain ' |
| 560 | 'mismatch' % p_url)) |
| 561 | continue |
| 562 | elif child_tld.subdomain in ['m', 'i']: |
| 563 | if self.config.verbose: |
| 564 | print(('elim category url %s for mobile ' |
| 565 | 'subdomain' % p_url)) |
| 566 | continue |
| 567 | else: |
| 568 | valid_categories.append(scheme+'://'+domain) |
| 569 | # TODO account for case where category is in form |
| 570 | # http://subdomain.domain.tld/category/ <-- still legal! |
no test coverage detected