Parse a GitHub URL and return the corresponding GitHub API endpoint if it is a GitHub repository content URL. Returns an (api_url, is_github, owner, repo) tuple.
(url: str)
| 68 | |
| 69 | |
| 70 | def parse_github_url(url: str) -> tuple[str, bool, str | None, str | None]: |
| 71 | """ |
| 72 | Parse GitHub URL and return API endpoint if it's a GitHub repository content URL. |
| 73 | Returns (api_url, is_github, owner, repo) tuple. |
| 74 | """ |
| 75 | # Match GitHub blob or tree URLs - capture everything after /blob/ or /tree/ as one group |
| 76 | github_pattern = r"https://github\.com/([^/]+)/([^/]+)/(blob|tree)/(.+)" |
| 77 | match = re.match(github_pattern, url) |
| 78 | |
| 79 | if match: |
| 80 | owner, repo, _, branch_and_path = match.groups() # _ is blob_or_tree, which we don't need |
| 81 | repo = _normalize_repo_name(repo) |
| 82 | |
| 83 | # Split on the first occurrence of a path starting with . or containing a file extension |
| 84 | # Common patterns: .github/, .claude/, src/, file.ext |
| 85 | parts = branch_and_path.split("/") |
| 86 | |
| 87 | # Find where the file path likely starts |
| 88 | branch_parts = [] |
| 89 | path_parts: list[str] = [] |
| 90 | found_path_start = False |
| 91 | |
| 92 | for i, part in enumerate(parts): |
| 93 | if not found_path_start: |
| 94 | # Check if this looks like the start of a file path |
| 95 | if ( |
| 96 | part.startswith(".") # Hidden directories like .github, .claude |
| 97 | or "." in part # Files with extensions |
| 98 | or part in ["src", "lib", "bin", "scripts", "docs", "test", "tests"] |
| 99 | ): # Common directories |
| 100 | found_path_start = True |
| 101 | path_parts = parts[i:] |
| 102 | else: |
| 103 | branch_parts.append(part) |
| 104 | |
| 105 | # If we didn't find an obvious path start, treat the last part as the path |
| 106 | if not path_parts and parts: |
| 107 | branch_parts = parts[:-1] if len(parts) > 1 else parts |
| 108 | path_parts = parts[-1:] if len(parts) > 1 else [] |
| 109 | |
| 110 | branch = "/".join(branch_parts) if branch_parts else "main" |
| 111 | path = "/".join(path_parts) |
| 112 | |
| 113 | # URL-encode the branch name to handle slashes |
| 114 | encoded_branch = quote(branch, safe="") |
| 115 | api_url = ( |
| 116 | f"https://api.github.com/repos/{owner}/{repo}/contents/{path}?ref={encoded_branch}" |
| 117 | ) |
| 118 | return api_url, True, owner, repo |
| 119 | |
| 120 | # Check if it's a repository root URL |
| 121 | github_repo_pattern = r"https://github\.com/([^/]+)/([^/]+)(?:/.*)?$" |
| 122 | match = re.match(github_repo_pattern, url) |
| 123 | if match: |
| 124 | owner, repo = match.groups() |
| 125 | repo = _normalize_repo_name(repo) |
| 126 | api_url = f"https://api.github.com/repos/{owner}/{repo}" |
| 127 | return api_url, True, owner, repo |