Cleans the URLs extracted from the text. Args: urls (List[str]): The list of URLs to clean. Returns: List[str]: The cleaned URLs.
(self, urls: List[str])
| 177 | return links, images |
| 178 | |
def _clean_urls(self, urls: List[str]) -> List[str]:
    """
    Cleans the URLs extracted from the text.

    A URL rejected by ``ParseNode._is_valid_url`` has every leading run
    of text that ends in one of the two-character bracket/parenthesis
    sequences ``](``, ``[(``, ``[)``, ``])``, ``)[``, ``(]`` or ``)]``
    removed, so only the text after the last such sequence remains.
    Trailing ``)``, ``.`` and ``-`` characters are then stripped, and
    URLs that end up empty are dropped.

    Args:
        urls (List[str]): The list of URLs to clean.

    Returns:
        List[str]: The cleaned, non-empty URLs in their original order.
    """
    # Compiled once per call instead of being re-specified per URL.
    # The patterns are applied one after another, in this exact order;
    # they are deliberately NOT merged into a single alternation because
    # overlapping sequences such as "(](" would then be cut differently.
    # (The original applied the ")[" pattern twice in a row; the second
    # pass could never match, so it is listed only once here.)
    prefix_patterns = [
        re.compile(pattern)
        for pattern in (
            r".*?\]\(",
            r".*?\[\(",
            r".*?\[\)",
            r".*?\]\)",
            r".*?\)\[",
            r".*?\(\]",
            r".*?\)\]",
        )
    ]

    cleaned_urls = []
    for url in urls:
        if not ParseNode._is_valid_url(url):
            for prefix_pattern in prefix_patterns:
                url = prefix_pattern.sub("", url)
        # NOTE(review): the trailing strip and the append are applied to
        # every URL, valid or not -- confirm this matches the intended
        # nesting of the original code.
        url = url.rstrip(").-")
        if url:
            cleaned_urls.append(url)

    return cleaned_urls
| 205 | |
| 206 | @staticmethod |
| 207 | def _is_valid_url(url: str) -> bool: |
no test coverage detected