MCPcopy
hub / github.com/apify/crawlee / extractUrlsFromCheerio

Function extractUrlsFromCheerio

packages/utils/src/internals/cheerio.ts:98–122  ·  view source on GitHub ↗
($: CheerioAPI, selector = 'a', baseUrl = '')

Source from the content-addressed store, hash-verified

96 * @return An array of absolute URLs
97 */
export function extractUrlsFromCheerio($: CheerioAPI, selector = 'a', baseUrl = ''): string[] {
    // A `<base href>` in the document takes precedence over the caller-supplied base URL,
    // but only when it can be turned into an absolute URL (it is itself resolved against `baseUrl`).
    const baseHref = $('base').attr('href');
    const resolvedBaseHref = baseHref && tryAbsoluteURL(baseHref, baseUrl);
    const effectiveBaseUrl = resolvedBaseHref || baseUrl;

    // Matches a leading URI scheme ("http:", "mailto:", ...). Pattern adapted from the 'is-absolute-url' package.
    // The `i` flag matters: schemes are case-insensitive (RFC 3986, section 3.1), so "HTTP://host/" is absolute
    // and must not be reported as a relative URL. Built once here instead of once per extracted href.
    const absoluteUrlPattern = /^[a-z][a-z0-9+.-]*:/i;

    return $(selector)
        .map((_i, el) => $(el).attr('href'))
        .get()
        // Skip matched elements that have no (or an empty) href attribute.
        .filter(Boolean)
        .map((href) => {
            // Throw a meaningful error when only a relative URL would be extracted
            // instead of waiting for the Request to fail later.
            if (!effectiveBaseUrl && !absoluteUrlPattern.test(href)) {
                throw new Error(
                    `An extracted URL: ${href} is relative and baseUrl is not set. ` +
                        'Provide a baseUrl to automatically resolve relative URLs.',
                );
            }
            // Without any base URL the href is already absolute (checked above) and is returned untouched.
            return effectiveBaseUrl ? tryAbsoluteURL(href, effectiveBaseUrl) : href;
        })
        // Drop hrefs for which tryAbsoluteURL produced a falsy result (presumably unresolvable URLs).
        .filter(Boolean) as string[];
}

Callers 2

Calls 3

tryAbsoluteURLFunction · 0.90
mapMethod · 0.80
getMethod · 0.65

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…