| 33 | |
| 34 | |
| 35 | class LinkTree(Tree): |
def __init__(self, url: str, depth: int, client: httpx.Client) -> None:
    """
    Sets up an empty tree and records the crawl parameters.

    url is used by load() as the ID of the root node, depth is handed to
    _build_tree() as the number of link levels to follow, and client is the
    HTTP client used for every page request.
    """
    super().__init__()
    self._url, self._depth, self._client = url, depth, client
| 41 | |
def load(self) -> None:
    """
    Populates the tree: adds the root node for the starting URL, then builds
    the rest of the tree beneath it down to the configured depth.
    """
    root_url = self._url
    self._append_node(id=root_url, parent_id=None)
    self._build_tree(url=root_url, depth=self._depth)
| 45 | |
| 46 | def _append_node(self, id: str, parent_id: str | None) -> None: |
| 47 | """ |
| 48 | Creates a node for a tree using the given ID which corresponds to a URL. |
| 49 | If the parent_id is None, this will be considered a root node. |
| 50 | """ |
| 51 | resp = self._client.get(id) |
| 52 | soup = BeautifulSoup(resp.text, 'html.parser') |
| 53 | title = soup.title.text.strip() if soup.title is not None else parse_hostname(id) |
| 54 | try: |
| 55 | [classification, accuracy] = classify(resp.text) |
| 56 | numbers = parse_phone_numbers(soup) |
| 57 | emails = parse_emails(soup) |
| 58 | data = LinkNode(title, id, resp.status_code, classification, accuracy, numbers, emails) |
| 59 | self.create_node(title, identifier=id, parent=parent_id, data=data) |
| 60 | except exceptions.DuplicatedNodeIdError: |
| 61 | logging.debug(f"found a duplicate URL {id}") |
| 62 | |
| 63 | def _build_tree(self, url: str, depth: int) -> None: |
| 64 | """ |
| 65 | Builds a tree from the root to the given depth. |
| 66 | """ |
| 67 | if depth > 0: |
| 68 | depth -= 1 |
| 69 | resp = self._client.get(url) |
| 70 | children = parse_links(resp.text) |
| 71 | for child in children: |
| 72 | self._append_node(id=child, parent_id=url) |
| 73 | self._build_tree(url=child, depth=depth) |
| 74 | |
| 75 | def _get_tree_file_name(self) -> str: |
| 76 | root_id = self.root |
| 77 | root_node = self.get_node(root_id) |
| 78 | if root_node is None: |
| 79 | raise Exception('no root node can be found.') |
| 80 | |
| 81 | return os.path.join(project_root_directory, f'{root_node.tag} - Depth {self._depth}') |
| 82 | |
def save(self) -> None:
    """
    Writes the tree to a '.txt' file whose base path is provided by
    _get_tree_file_name().
    """
    target = f'{self._get_tree_file_name()}.txt'
    self.save2file(target)
| 89 | |
| 90 | def saveJSON(self) -> None: |
| 91 | """ |
| 92 | Saves the tree to the current working directory under the given file name in JSON. |