MCPcopy Index your code
hub / github.com/FlowiseAI/Flowise / crawl

Function crawl

packages/components/src/utils.ts:439–478  ·  view source on GitHub ↗

* Recursive crawl using normalizeURL and getURLsFromHTML * @param {string} baseURL * @param {string} currentURL * @param {string[]} pages * @param {number} limit * @returns {Promise<string[]>}

(baseURL: string, currentURL: string, pages: string[], limit: number)

Source from the content-addressed store, hash-verified

437 * @returns {Promise<string[]>}
438 */
async function crawl(baseURL: string, currentURL: string, pages: string[], limit: number): Promise<string[]> {
    // Depth-first crawl. The normalized form of currentURL is appended to `pages`, the page is
    // fetched, and every link returned by getURLsFromHTML is crawled recursively.
    // `pages` is mutated in place and is also the value returned on every path.
    const debug = process.env.DEBUG === 'true'

    // A limit of 0 (or any non-positive value) means "no cap". `>=` rather than `===` so that a
    // `pages` array that already holds more entries than the limit still stops the crawl.
    const limitReached = (): boolean => limit > 0 && pages.length >= limit

    // Both constructors throw on a malformed URL; that error propagates to the caller.
    const baseURLObj = new URL(baseURL)
    const currentURLObj = new URL(currentURL)

    if (limitReached()) return pages

    // Never leave the host of the starting URL.
    if (baseURLObj.hostname !== currentURLObj.hostname) return pages

    // Pages are recorded under the base URL's protocol so the same page is not stored twice.
    const normalizeCurrentURL = baseURLObj.protocol + '//' + normalizeURL(currentURL)
    if (pages.includes(normalizeCurrentURL)) {
        return pages
    }

    // Recorded before fetching: a page that later fails to load or is not HTML stays in the list.
    pages.push(normalizeCurrentURL)

    if (debug) console.info(`actively crawling ${currentURL}`)
    try {
        const resp = await secureFetch(currentURL)

        if (resp.status > 399) {
            if (debug) console.error(`error in fetch with status code: ${resp.status}, on page: ${currentURL}`)
            return pages
        }

        // Only HTML responses are parsed for links. Media types are case-insensitive, so the
        // comparison is done on a lower-cased copy; the log keeps the header value as received.
        const contentType: string | null = resp.headers.get('content-type')
        if (!contentType || !contentType.toLowerCase().includes('text/html')) {
            if (debug) console.error(`non html response, content type: ${contentType}, on page: ${currentURL}`)
            return pages
        }

        const htmlBody = await resp.text()
        const nextURLs = getURLsFromHTML(htmlBody, currentURL)
        for (const nextURL of nextURLs) {
            // Once the cap is hit every further call would return immediately; stop early instead.
            if (limitReached()) break
            pages = await crawl(baseURL, nextURL, pages, limit)
        }
    } catch (err) {
        // Best-effort crawl: a failure on this page is logged (in debug mode) and the pages
        // collected so far are still returned.
        const message = err instanceof Error ? err.message : String(err)
        if (debug) console.error(`error in fetch url: ${message}, on page: ${currentURL}`)
    }
    return pages
}
479
480/**
481 * Prep URL before passing into recursive crawl function

Callers 1

webCrawl (Function) · 0.85

Calls 4

secureFetch (Function) · 0.90
normalizeURL (Function) · 0.85
getURLsFromHTML (Function) · 0.85
get (Method) · 0.45

Tested by

no test coverage detected