MCPcopy
hub / github.com/1Panel-dev/MaxKB / get_child_link_list

Method get_child_link_list

apps/common/utils/fork.py:94–104  ·  view source on GitHub ↗
(self, bf: BeautifulSoup)

Source from the content-addressed store, hash-verified

92 fragment='').geturl()
93
def get_child_link_list(self, bf: BeautifulSoup):
    """Collect the anchors in ``bf`` whose target lies under the crawl prefix.

    The crawl prefix is ``self.base_fork_url``, or everything before its
    last ``/`` when that URL ends in ``.html`` / ``.htm``.

    :param bf: parsed document searched for ``<a>`` tags with an ``href``.
    :return: list of ``ChildLink`` objects whose ``url`` starts with the
        crawl prefix.
    """
    # Compute the crawl prefix: parent directory when base_fork_url is an HTML file
    crawl_prefix = self.base_fork_url
    if crawl_prefix.endswith(('.html', '.htm')):
        crawl_prefix = crawl_prefix.rsplit('/', 1)[0]
    # Accept hrefs that carry none of the listed scheme/fragment prefixes,
    # or that start with the crawl prefix, or with '/'.
    # The prefix is a plain URL, so it is escaped: otherwise characters such
    # as '.', '?', '+' or '(' would be read as regex syntax and could
    # over-match or make re.compile() raise.
    pattern = ("^((?!(http:|https:|tel:/|#|mailto:|javascript:))|"
               + re.escape(crawl_prefix) + "|/).*")
    link_list = bf.find_all(name='a', href=re.compile(pattern))
    result = []
    for link in link_list:
        href = link.get('href')
        # hrefs that do not already start with base_url are prefixed with it.
        # NOTE(review): this is plain concatenation, so an href without a
        # leading '/' is glued straight onto base_url - confirm that is intended.
        url = href if href.startswith(self.base_url) else self.base_url + href
        result.append(ChildLink(url, link))
    # Final guard: keep only links that really fall under the crawl prefix.
    return [row for row in result if row.url.startswith(crawl_prefix)]
105
106 def get_content_html(self, bf: BeautifulSoup):
107 if self.selector_list is None or len(self.selector_list) == 0:

Callers 1

forkMethod · 0.95

Calls 2

ChildLinkClass · 0.85
getMethod · 0.45

Tested by

no test coverage detected