* Collect the href of every anchor in the htmlBody string, resolved against baseURL * @param {string} htmlBody * @param {string} baseURL * @returns {string[]}
(htmlBody: string, baseURL: string)
| 397 | * @returns {string[]} |
| 398 | */ |
function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
  const dom = new JSDOM(htmlBody)
  // Select only anchors that carry an href attribute: an anchor without one
  // (e.g. a named anchor) is not a link and must not contribute a URL.
  const linkElements = dom.window.document.querySelectorAll('a[href]')
  const urls: string[] = []
  for (const linkElement of linkElements) {
    // Read the raw attribute rather than the resolved `.href` property.
    // The JSDOM document above is created without a `url` option, so the
    // property would be resolved against the document's own default URL
    // instead of baseURL (this matters for fragment-only links like "#top").
    const rawHref = linkElement.getAttribute('href')
    if (rawHref === null) continue
    try {
      // new URL() resolves relative hrefs against baseURL and normalizes
      // absolute ones; it throws on anything that cannot be parsed.
      urls.push(new URL(rawHref, baseURL).href)
    } catch (err: unknown) {
      // Best effort: an unparsable href is skipped, and only reported when
      // DEBUG is enabled. Narrow first, since a thrown value need not be an Error.
      if (process.env.DEBUG === 'true') {
        const message = err instanceof Error ? err.message : String(err)
        console.error(`error with scraped URL: ${message}`)
      }
    }
  }
  return urls
}
| 414 | |
| 415 | /** |
| 416 | * Normalize URL to prevent crawling the same page |