MCPcopy Index your code
hub / github.com/FlowiseAI/Flowise / getURLsFromHTML

Function getURLsFromHTML

packages/components/src/utils.ts:399–413  ·  view source on GitHub ↗

Searches the htmlBody string for anchor href values and resolves each one against baseURL. @param {string} htmlBody @param {string} baseURL @returns {string[]}

(htmlBody: string, baseURL: string)

Source from the content-addressed store, hash-verified

397 * @returns {string[]}
398 */
function getURLsFromHTML(htmlBody: string, baseURL: string): string[] {
    // Collects the target of every <a href="..."> found in htmlBody and returns each one
    // as an absolute URL string resolved against baseURL. Targets that cannot be parsed
    // as a URL are skipped (and logged when DEBUG === 'true').
    const dom = new JSDOM(htmlBody)
    // Only anchors that actually carry an href: a bare <a name="..."> has nothing to
    // resolve and would otherwise be reported as baseURL itself.
    const linkElements = dom.window.document.querySelectorAll('a[href]')
    const urls: string[] = []
    for (const linkElement of linkElements) {
        // Read the raw attribute instead of the `.href` property. The property is
        // pre-resolved by the DOM against the document's own URL (about:blank here, since
        // no url is passed to JSDOM), which turns "#section" into "about:blank#section"
        // before baseURL ever gets a chance to apply.
        const rawHref = linkElement.getAttribute('href')
        if (rawHref === null) continue
        try {
            urls.push(new URL(rawHref, baseURL).href)
        } catch (err) {
            if (process.env.DEBUG === 'true') {
                // The thrown value is not guaranteed to be an Error; narrow before reading .message.
                const reason = err instanceof Error ? err.message : String(err)
                console.error(`error with scraped URL: ${reason}`)
            }
        }
    }
    return urls
}
414
415/**
416 * Normalize URL to prevent crawling the same page

Callers 1

crawlFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected