MCPcopy
hub / github.com/apify/crawlee / load

Method load

packages/utils/src/internals/robots.ts:65–101  ·  view source on GitHub ↗
(
        url: string,
        proxyUrl?: string,
        options?: { signal?: AbortSignal; timeoutMillis?: number },
    )

Source from the content-addressed store, hash-verified

63 }
64
/**
 * Downloads the robots.txt file located at `url` and wraps the parsed rules
 * in a `RobotsTxtFile`.
 *
 * A 404 response is not treated as a failure: it yields a `RobotsTxtFile`
 * whose `isAllowed()` always returns `true` and whose `getSitemaps()` returns
 * an empty list. Every other error is rethrown unchanged.
 *
 * @param url Address of the robots.txt file to request.
 * @param proxyUrl Optional proxy URL; passed to the HTTP request and to the
 *   `RobotsTxtFile` constructor.
 * @param options Optional request settings: `signal` is forwarded to the
 *   request, and a truthy `timeoutMillis` is used as the request timeout.
 * @returns The `RobotsTxtFile` built from the response body, or the
 *   allow-everything fallback on a 404.
 */
protected static async load(
    url: string,
    proxyUrl?: string,
    options?: { signal?: AbortSignal; timeoutMillis?: number },
): Promise<RobotsTxtFile> {
    // `HTTPError` is resolved from `got-scraping` on first use and then reused.
    if (!HTTPError) {
        const gotScrapingModule = await import('got-scraping');
        HTTPError = gotScrapingModule.HTTPError;
    }

    // A timeout is only configured when a truthy value was supplied, so an
    // absent (or zero) `timeoutMillis` leaves the request timeout untouched.
    const timeoutMillis = options?.timeoutMillis;
    const timeoutSettings = timeoutMillis ? { timeout: { request: timeoutMillis } } : {};

    try {
        const { body } = await gotScraping({
            url,
            proxyUrl,
            method: 'GET',
            responseType: 'text',
            signal: options?.signal,
            ...timeoutSettings,
        });

        const parsedRules = robotsParser(url.toString(), body);
        return new RobotsTxtFile(parsedRules, proxyUrl);
    } catch (error) {
        const robotsFileMissing = error instanceof HTTPError && error.response.statusCode === 404;
        if (!robotsFileMissing) {
            throw error;
        }

        // No robots.txt at this address: nothing is disallowed and no sitemaps are listed.
        const allowEverything = {
            isAllowed: () => true,
            getSitemaps: () => [],
        };
        return new RobotsTxtFile(allowEverything, proxyUrl);
    }
}
102
103 /**
104 * Check if a URL should be crawled by robots.

Callers 2

findMethod · 0.45
parseSitemapsMethod · 0.45

Calls 1

toStringMethod · 0.80

Tested by

no test coverage detected