* Recursive crawl using normalizeURL and getURLsFromHTML * @param {string} baseURL * @param {string} currentURL * @param {string[]} pages * @param {number} limit * @returns {Promise<string[]>}
(baseURL: string, currentURL: string, pages: string[], limit: number)
| 437 | * @returns {Promise<string[]>} |
| 438 | */ |
async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
  // Recursively visits same-host links starting at `currentURL`. Every newly
  // seen page is appended to `pages` in normalized form; the array is mutated
  // in place and the same reference is returned on every path.
  //
  // baseURL    - root of the crawl; supplies the hostname filter and the protocol
  //              used when rebuilding normalized URLs
  // currentURL - page to visit in this call
  // pages      - accumulator of normalized URLs collected so far
  // limit      - maximum number of entries allowed in `pages`; 0 (or any
  //              non-positive value) disables the cap
  //
  // Throws a TypeError when `baseURL` or `currentURL` cannot be parsed by `new URL`.
  // Failures raised while fetching or parsing a page are logged (DEBUG only) and swallowed.
  const debugEnabled = (): boolean => process.env.DEBUG === 'true'
  // `>=` rather than `===`: an accumulator that already exceeds the cap must stop the crawl too.
  const limitReached = (): boolean => limit > 0 && pages.length >= limit

  // Parsed before any other check so malformed input is rejected up front.
  const baseURLObj = new URL(baseURL)
  const currentURLObj = new URL(currentURL)

  if (limitReached()) return pages

  // Never leave the site the crawl started on.
  if (baseURLObj.hostname !== currentURLObj.hostname) return pages

  const normalizedCurrentURL = `${baseURLObj.protocol}//${normalizeURL(currentURL)}`
  if (pages.includes(normalizedCurrentURL)) return pages

  // Recorded before fetching, so a page that fails to load is still listed
  // and is not retried when another page links to it.
  pages.push(normalizedCurrentURL)

  if (debugEnabled()) console.info(`actively crawling ${currentURL}`)
  try {
    const resp = await secureFetch(currentURL)

    if (resp.status > 399) {
      if (debugEnabled()) console.error(`error in fetch with status code: ${resp.status}, on page: ${currentURL}`)
      return pages
    }

    const contentType: string | null = resp.headers.get('content-type')
    // Media types are case-insensitive, so compare in lower case; a missing header counts as non-HTML.
    if (!contentType || !contentType.toLowerCase().includes('text/html')) {
      if (debugEnabled()) console.error(`non html response, content type: ${contentType}, on page: ${currentURL}`)
      return pages
    }

    const htmlBody = await resp.text()
    const nextURLs = getURLsFromHTML(htmlBody, currentURL)
    for (const nextURL of nextURLs) {
      // Once the cap is hit every further call would return immediately; stop early.
      if (limitReached()) break
      // The recursive call returns the same array it was given, so no reassignment is needed.
      await crawl(baseURL, nextURL, pages, limit)
    }
  } catch (err: unknown) {
    // Anything can be thrown, so narrow before reading `.message`.
    const reason = err instanceof Error ? err.message : String(err)
    if (debugEnabled()) console.error(`error in fetch url: ${reason}, on page: ${currentURL}`)
  }
  return pages
}
| 479 | |
| 480 | /** |
| 481 | * Prep URL before passing into recursive crawl function |
no test coverage detected