MCPcopy
hub / github.com/continuedev/continue / crawl

Method crawl

core/indexing/docs/crawlers/CheerioCrawler.ts:27–86  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

25 ) {}
26
27 async *crawl(): AsyncGenerator<PageData> {
28 let currentPageCount = 0;
29 console.log(
30 `[${
31 (this.constructor as any).name
32 }] Starting crawl from: ${this.startUrl} - Max Depth: ${this.maxDepth}`,
33 );
34
35 const { baseUrl, basePath } = this.splitUrl(this.startUrl);
36
37 let paths: { path: string; depth: number }[] = [
38 { path: basePath, depth: 0 },
39 ];
40
41 let index = 0;
42
43 while (index < paths.length) {
44 const batch = paths.slice(index, index + 50);
45
46 try {
47 const promises = batch.map(({ path, depth }) =>
48 this.getLinksFromUrl(baseUrl, path).then((links) => ({
49 links,
50 path,
51 depth,
52 })),
53 );
54
55 const results = await Promise.all(promises);
56 for (const {
57 links: { html, links: linksArray },
58 path,
59 depth,
60 } of results) {
61 if (html !== "" && depth <= this.maxDepth) {
62 yield { url: this.startUrl.toString(), path, content: html };
63 currentPageCount++;
64 if (currentPageCount >= this.maxRequestsPerCrawl) {
65 console.log("Crawl completed - max requests reached");
66 return;
67 }
68 }
69
70 if (depth < this.maxDepth) {
71 for (let link of linksArray) {
72 if (!paths.some((p) => p.path === link)) {
73 paths.push({ path: link, depth: depth + 1 });
74 }
75 }
76 }
77 }
78 } catch (e) {
79 console.debug("Error while crawling page: ", e);
80 }
81
82 index += batch.length;
83 }
84

Callers

nothing calls this directly

Calls 5

splitUrl (Method) · 0.95
getLinksFromUrl (Method) · 0.95
log (Method) · 0.65
push (Method) · 0.65
debug (Method) · 0.45

Tested by

no test coverage detected