| 112 | } |
| 113 | |
| 114 | async function crawlPage(pageUrl: URL, referrer: string) { |
| 115 | const pathname = pageUrl.pathname; |
| 116 | if (visitedPages.has(pathname)) return; |
| 117 | visitedPages.add(pathname); |
| 118 | |
| 119 | let res: Response; |
| 120 | try { |
| 121 | res = await fetch(pageUrl, { |
| 122 | headers: { |
| 123 | accept: |
| 124 | "text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8", |
| 125 | }, |
| 126 | }); |
| 127 | } catch { |
| 128 | failedLinks.push({ url: pageUrl.href, status: 0, referrer }); |
| 129 | return; |
| 130 | } |
| 131 | |
| 132 | if (res.status >= 400) { |
| 133 | failedLinks.push({ url: pageUrl.href, status: res.status, referrer }); |
| 134 | await res.body?.cancel(); |
| 135 | return; |
| 136 | } |
| 137 | |
| 138 | if (!res.headers.get("content-type")?.includes("text/html")) { |
| 139 | await res.body?.cancel(); |
| 140 | return; |
| 141 | } |
| 142 | |
| 143 | const text = await res.text(); |
| 144 | Deno.stdout.writeSync(new TextEncoder().encode(".")); |
| 145 | const doc = new DOMParser().parseFromString(text, "text/html"); |
| 146 | |
| 147 | const linkChecks: Array<Promise<void>> = []; |
| 148 | const internalPages: Array<{ url: URL; referrer: string }> = []; |
| 149 | |
| 150 | for (const link of doc.querySelectorAll("a")) { |
| 151 | const href = link.getAttribute("href")?.trim(); |
| 152 | if (!href) continue; |
| 153 | if (EXCLUDED_PREFIXES.some((p) => href.startsWith(p))) continue; |
| 154 | if (href.startsWith("#")) continue; |
| 155 | |
| 156 | let nextUrl: URL; |
| 157 | try { |
| 158 | nextUrl = new URL(href, pageUrl); |
| 159 | } catch { |
| 160 | continue; |
| 161 | } |
| 162 | |
| 163 | // Strip fragment |
| 164 | nextUrl.hash = ""; |
| 165 | const urlStr = nextUrl.href; |
| 166 | |
| 167 | if (nextUrl.origin === rootUrl.origin) { |
| 168 | // Internal link -- crawl the page if it's a docs page |
| 169 | if ( |
| 170 | !visitedPages.has(nextUrl.pathname) && |
| 171 | nextUrl.pathname.startsWith("/docs") |