MCPcopy
hub / github.com/BuilderIO/gpt-crawler / crawl

Function crawl

src/core.ts:51–159  ·  view source on GitHub ↗
(config: Config)

Source from the content-addressed store, hash-verified

49}
50
51export async function crawl(config: Config) {
52 configSchema.parse(config);
53
54 if (process.env.NO_CRAWL !== "true") {
55 // PlaywrightCrawler crawls the web using a headless
56 // browser controlled by the Playwright library.
57 crawler = new PlaywrightCrawler(
58 {
59 // Use the requestHandler to process each of the crawled pages.
60 async requestHandler({ request, page, enqueueLinks, log, pushData }) {
61 const title = await page.title();
62 pageCounter++;
63 log.info(
64 `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
65 );
66
67 // Use custom handling for XPath selector
68 if (config.selector) {
69 if (config.selector.startsWith("/")) {
70 await waitForXPath(
71 page,
72 config.selector,
73 config.waitForSelectorTimeout ?? 1000,
74 );
75 } else {
76 await page.waitForSelector(config.selector, {
77 timeout: config.waitForSelectorTimeout ?? 1000,
78 });
79 }
80 }
81
82 const html = await getPageHtml(page, config.selector);
83
84 // Save results as JSON to ./storage/datasets/default
85 await pushData({ title, url: request.loadedUrl, html });
86
87 if (config.onVisitPage) {
88 await config.onVisitPage({ page, pushData });
89 }
90
91 // Extract links from the current page
92 // and add them to the crawling queue.
93 await enqueueLinks({
94 globs:
95 typeof config.match === "string" ? [config.match] : config.match,
96 exclude:
97 typeof config.exclude === "string"
98 ? [config.exclude]
99 : config.exclude ?? [],
100 });
101 },
102 // Comment this option to scrape the full website.
103 maxRequestsPerCrawl: config.maxPagesToCrawl,
104 // Uncomment this option to see the browser window.
105 // headless: false,
106 preNavigationHooks: [
107 // Abort requests for certain resource types and add cookies
108 async (crawlingContext, _gotoOptions) => {

Callers 3

main.ts — File · 0.85
crawl — Method · 0.85
handler — Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected