Function getURLsFromHTML

packages/components/src/utils.ts:399–413 · view source on GitHub ↗

Search for href values in the htmlBody string. @param {string} htmlBody · @param {string} baseURL · @returns {string[]}

(htmlBody: string, baseURL: string)

Source from the content-addressed store, hash-verified

397	* @returns {string[]}
398	*/
/**
 * Collect the absolute URL of every anchor (`<a href>`) found in an HTML string.
 *
 * Each raw `href` attribute is resolved against `baseURL`, so relative paths and
 * fragment-only links land on the crawled site. Anchors without an `href`
 * attribute are ignored, and values that cannot be parsed as a URL are skipped
 * (the parse error is logged when `process.env.DEBUG === 'true'`).
 *
 * @param htmlBody - HTML markup to scan for anchors
 * @param baseURL - URL used to resolve relative links
 * @returns Resolved URLs in document order; duplicates are kept
 */
function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
    const dom = new JSDOM(htmlBody)
    // Only anchors that actually carry an href can contribute a link.
    const linkElements = dom.window.document.querySelectorAll('a[href]')
    const urls: string[] = []
    for (const linkElement of linkElements) {
        // Read the raw attribute rather than the `.href` property: the property is
        // already resolved against the JSDOM document URL (about:blank when no `url`
        // option is passed), which turns "#section" into "about:blank#section".
        const rawHref = linkElement.getAttribute('href')
        if (rawHref === null) continue
        try {
            urls.push(new URL(rawHref, baseURL).href)
        } catch (err) {
            // Best-effort scraping: a malformed href must not abort the whole page.
            if (process.env.DEBUG === 'true') {
                const reason = err instanceof Error ? err.message : String(err)
                console.error(`error with scraped URL: ${reason}`)
            }
        }
    }
    return urls
}
414
415	/**
416	* Normalize URL to prevent crawling the same page

crawlFunction · 0.85

no outgoing calls

no test coverage detected