* Normalize URL to prevent crawling the same page * @param {string} urlString * @returns {string}
(urlString: string)
| 418 | * @returns {string} |
| 419 | */ |
function normalizeURL(urlString: string): string {
  // Reduces a URL to a scheme-less "host[:port]/path[?query]" key so that
  // http/https variants and trailing-slash variants of the same page compare
  // equal. The fragment (#...) is dropped because it is never included below.
  // `new URL` throws a TypeError on unparseable input; that is deliberately
  // left to propagate to the caller.
  const parsed = new URL(urlString)

  // Strip a single trailing slash from the path only. Doing this after the
  // query string is appended would miss "/path/?q=1" and would wrongly
  // truncate query values that happen to end in "/" (e.g. "?next=/").
  const path = parsed.pathname.endsWith('/')
    ? parsed.pathname.slice(0, -1)
    : parsed.pathname

  // `host` is the hostname followed by ":port" when the URL carries a port,
  // which is what the key needs in front of the path.
  return parsed.host + path + parsed.search
}
| 430 | |
| 431 | /** |
| 432 | * Recursive crawl using normalizeURL and getURLsFromHTML |