| 92 | fragment='').geturl() |
| 93 | |
def get_child_link_list(self, bf: BeautifulSoup):
    """Collect the ``<a>`` tags in ``bf`` whose target lies under the crawl prefix.

    The crawl prefix is ``self.base_fork_url``, or everything before its
    last ``/`` when that URL ends in ``.html`` or ``.htm``.

    Anchors are selected in three steps:

    1. ``href`` must match a regex that accepts values which either do not
       start with one of ``http:``, ``https:``, ``tel:/``, ``#``,
       ``mailto:``, ``javascript:``, or which start with the crawl prefix
       or with ``/``.
    2. An ``href`` that does not already start with ``self.base_url`` gets
       ``self.base_url`` prepended by plain string concatenation.
    3. Only entries whose ``url`` starts with the crawl prefix are kept.

    Args:
        bf: Parsed document to search for anchors.

    Returns:
        A list of ``ChildLink`` objects, each built from the resulting URL
        and the anchor tag it came from.
    """
    crawl_prefix = self.base_fork_url
    if crawl_prefix.endswith(('.html', '.htm')):
        crawl_prefix = crawl_prefix.rsplit('/', 1)[0]

    # The prefix is literal URL text, not a sub-pattern: escape it so that
    # '.', '?', '+', '(' etc. in the URL are matched verbatim instead of
    # being interpreted as regex operators (or raising re.error).
    href_pattern = re.compile(
        "^((?!(http:|https:|tel:/|#|mailto:|javascript:))|"
        + re.escape(crawl_prefix)
        + "|/).*"
    )

    candidates = []
    for link in bf.find_all(name='a', href=href_pattern):
        href = link.get('href')
        # NOTE(review): relative hrefs without a leading '/' are glued
        # directly onto base_url (e.g. base_url + "page.html"); confirm
        # whether they should be resolved against the current page instead.
        url = href if href.startswith(self.base_url) else self.base_url + href
        candidates.append(ChildLink(url, link))

    # Filter on the ChildLink's own ``url`` attribute, as the original did.
    return [row for row in candidates if row.url.startswith(crawl_prefix)]
| 105 | |
| 106 | def get_content_html(self, bf: BeautifulSoup): |
| 107 | if self.selector_list is None or len(self.selector_list) == 0: |