Method crawl

core/indexing/docs/crawlers/CheerioCrawler.ts:27–86 · view source on GitHub ↗

()

Source from the content-addressed store, hash-verified

25	) {}
26
27	async *crawl(): AsyncGenerator<PageData> {
28	let currentPageCount = 0;
29	console.log(
30	`[${
31	(this.constructor as any).name
32	}] Starting crawl from: ${this.startUrl} - Max Depth: ${this.maxDepth}`,
33	);
34
35	const { baseUrl, basePath } = this.splitUrl(this.startUrl);
36
37	let paths: { path: string; depth: number }[] = [
38	{ path: basePath, depth: 0 },
39	];
40
41	let index = 0;
42
43	while (index < paths.length) {
44	const batch = paths.slice(index, index + 50);
45
46	try {
47	const promises = batch.map(({ path, depth }) =>
48	this.getLinksFromUrl(baseUrl, path).then((links) => ({
49	links,
50	path,
51	depth,
52	})),
53	);
54
55	const results = await Promise.all(promises);
56	for (const {
57	links: { html, links: linksArray },
58	path,
59	depth,
60	} of results) {
61	if (html !== "" && depth <= this.maxDepth) {
62	yield { url: this.startUrl.toString(), path, content: html };
63	currentPageCount++;
64	if (currentPageCount >= this.maxRequestsPerCrawl) {
65	console.log("Crawl completed - max requests reached");
66	return;
67	}
68	}
69
70	if (depth < this.maxDepth) {
71	for (let link of linksArray) {
72	if (!paths.some((p) => p.path === link)) {
73	paths.push({ path: link, depth: depth + 1 });
74	}
75	}
76	}
77	}
78	} catch (e) {
79	console.debug("Error while crawling page: ", e);
80	}
81
82	index += batch.length;
83	}
84

nothing calls this directly

splitUrl · Method · 0.95

getLinksFromUrl · Method · 0.95

log · Method · 0.65

push · Method · 0.65

debug · Method · 0.45

no test coverage detected