Validate a URL with exponential backoff retry logic. Returns (is_valid, status_code, license_info, last_modified).
(
url: str, max_retries: int = 5
)
| 660 | |
| 661 | |
| 662 | def validate_url( |
| 663 | url: str, max_retries: int = 5 |
| 664 | ) -> tuple[bool, int | str | None, str | None, str | None]: |
| 665 | """ |
| 666 | Validate a URL with exponential backoff retry logic. |
| 667 | Returns (is_valid, status_code, license_info, last_modified). |
| 668 | """ |
| 669 | if not url or url.strip() == "": |
| 670 | return True, None, None, None # Empty URLs are considered valid |
| 671 | |
| 672 | # Convert GitHub URLs to API endpoints |
| 673 | api_url, is_github, owner, repo = parse_github_url(url) |
| 674 | |
| 675 | for attempt in range(max_retries): |
| 676 | try: |
| 677 | if is_github: |
| 678 | status, headers, data = github_request_json_paced(api_url) |
| 679 | else: |
| 680 | response = requests.head(url, headers=HEADERS, timeout=10, allow_redirects=True) |
| 681 | status = response.status_code |
| 682 | headers = dict(response.headers) |
| 683 | data = None |
| 684 | |
| 685 | if is_github and VERBOSE: |
| 686 | print(f"[github] url={url} api={api_url} status={status}") |
| 687 | print(f"[github-body] {data}") |
| 688 | |
| 689 | # Check if we hit GitHub rate limit |
| 690 | if status == 403 and is_github and "X-RateLimit-Remaining" in headers: |
| 691 | remaining = _header_int(headers, "X-RateLimit-Remaining") |
| 692 | if remaining == 0: |
| 693 | reset_time = _header_int(headers, "X-RateLimit-Reset") |
| 694 | sleep_time = max(reset_time - int(time.time()), 0) + 1 |
| 695 | print(f"GitHub rate limit hit. Sleeping for {sleep_time} seconds...") |
| 696 | time.sleep(sleep_time) |
| 697 | continue |
| 698 | |
| 699 | # Success cases |
| 700 | if status < 400: |
| 701 | license_info = None |
| 702 | last_modified = None |
| 703 | if is_github and status == 200: |
| 704 | # Extract owner/repo/path from original URL |
| 705 | # Try to match file URL first |
| 706 | file_match = re.match( |
| 707 | r"https://github\.com/([^/]+)/([^/]+)/blob/[^/]+/(.+)", url |
| 708 | ) |
| 709 | if file_match: |
| 710 | owner, repo, path = file_match.groups() |
| 711 | license_info = get_github_license(owner, repo) |
| 712 | last_modified = get_github_last_modified(owner, repo, path) |
| 713 | else: |
| 714 | # Try repository URL |
| 715 | repo_match = re.match(r"https://github\.com/([^/]+)/([^/]+)", url) |
| 716 | if repo_match: |
| 717 | owner, repo = repo_match.groups() |
| 718 | license_info = get_github_license(owner, repo) |
| 719 | last_modified = get_github_last_modified(owner, repo) |
no test coverage detected