Check if a URL is a valid link (page, not related to icon or metadata).
(self, url:str)
| 434 | return base_url |
| 435 | |
def is_link_valid(self, url:str) -> bool:
    """Check if a URL is a valid link (page, not related to icon or metadata).

    Args:
        url: Candidate URL to inspect.

    Returns:
        False when the URL is longer than 72 characters, cannot be parsed,
        lacks a scheme or network location, has a path ending in
        ``/<digits>``, or has a path ending in a known image/metadata file
        extension. True otherwise.
    """
    max_url_length = 72
    blocked_extensions = (
        # Image assets.
        '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp',
        # Icons, feeds and other machine-readable metadata.
        '.ico', '.xml', '.json', '.rss', '.atom',
    )

    if len(url) > max_url_length:
        self.logger.warning(f"URL too long: {url}")
        return False

    try:
        parsed_url = urlparse(url)
    except ValueError:
        # urlparse raises on malformed bracketed hosts such as "http://[bad".
        self.logger.warning(f"Invalid URL: {url}")
        return False
    if not parsed_url.scheme or not parsed_url.netloc:
        self.logger.warning(f"Invalid URL: {url}")
        return False

    path = parsed_url.path
    # Reject paths whose last segment is purely numeric.
    if re.search(r'/\d+$', path):
        return False

    # Compare against the path only: testing the whole URL lets a query
    # string or fragment ("logo.png?v=2", "feed.xml#top") hide the extension.
    return not path.lower().endswith(blocked_extensions)
| 453 | |
| 454 | def get_navigable(self) -> List[str]: |
| 455 | """Get all navigable links on the current page.""" |