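Extracts link and image URLs from the given text. Args: text (str): The text to extract URLs from. source (str): The location the text came from, used as the base for resolving relative URLs. Returns: Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.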
(self, text: str, source: str)
| 129 | return state |
| 130 | |
def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
    """
    Extracts link and image URLs from the given text.

    Args:
        text (str): The text to extract URLs from.
        source (str): The location the text came from. It is the base used to
            resolve relative URLs. When it does not start with "http", only
            URLs that themselves start with "http" are kept.

    Returns:
        Tuple[List[str], List[str]]: A tuple containing the extracted link URLs
        and image URLs. Both lists are empty when ``self.parse_urls`` is falsy.
    """
    if not self.parse_urls:
        return [], []

    # Group fragments that are dropped when reassembling a relative URL.
    skipped_fragments = ("", "[", "]", "(", ")", "{", "}")
    # str.endswith accepts a tuple, so one call checks every extension.
    image_extensions = tuple(default_filters.filter_dict["img_exts"])

    # A set de-duplicates URLs that appear several times in the text.
    found = set()

    # Each findall() match is a sequence of captured pieces; joining them
    # rebuilds the URL (empty pieces contribute nothing).
    for groups in ParseNode.url_pattern.findall(text):
        found.add("".join(groups))

    for groups in ParseNode.relative_url_pattern.findall(text):
        relative = "".join(
            piece for piece in groups if piece not in skipped_fragments
        )
        found.add(urljoin(source, relative))

    cleaned = self._clean_urls(list(found))
    if source.startswith("http"):
        resolved = [urljoin(source, candidate) for candidate in cleaned]
    else:
        # Without an http(s) base, only absolute http(s) URLs are kept.
        resolved = [
            candidate for candidate in cleaned if candidate.startswith("http")
        ]

    # Single pass partition: avoids the quadratic "url not in images" scan
    # while keeping the same membership and relative order in both lists.
    links: List[str] = []
    images: List[str] = []
    for candidate in resolved:
        if candidate.endswith(image_extensions):
            images.append(candidate)
        else:
            links.append(candidate)

    return links, images
| 178 | |
| 179 | def _clean_urls(self, urls: List[str]) -> List[str]: |
| 180 | """ |