()
| 25 | ) {} |
| 26 | |
| 27 | async *crawl(): AsyncGenerator<PageData> { |
| 28 | let currentPageCount = 0; |
| 29 | console.log( |
| 30 | `[${ |
| 31 | (this.constructor as any).name |
| 32 | }] Starting crawl from: ${this.startUrl} - Max Depth: ${this.maxDepth}`, |
| 33 | ); |
| 34 | |
| 35 | const { baseUrl, basePath } = this.splitUrl(this.startUrl); |
| 36 | |
| 37 | let paths: { path: string; depth: number }[] = [ |
| 38 | { path: basePath, depth: 0 }, |
| 39 | ]; |
| 40 | |
| 41 | let index = 0; |
| 42 | |
| 43 | while (index < paths.length) { |
| 44 | const batch = paths.slice(index, index + 50); |
| 45 | |
| 46 | try { |
| 47 | const promises = batch.map(({ path, depth }) => |
| 48 | this.getLinksFromUrl(baseUrl, path).then((links) => ({ |
| 49 | links, |
| 50 | path, |
| 51 | depth, |
| 52 | })), |
| 53 | ); |
| 54 | |
| 55 | const results = await Promise.all(promises); |
| 56 | for (const { |
| 57 | links: { html, links: linksArray }, |
| 58 | path, |
| 59 | depth, |
| 60 | } of results) { |
| 61 | if (html !== "" && depth <= this.maxDepth) { |
| 62 | yield { url: this.startUrl.toString(), path, content: html }; |
| 63 | currentPageCount++; |
| 64 | if (currentPageCount >= this.maxRequestsPerCrawl) { |
| 65 | console.log("Crawl completed - max requests reached"); |
| 66 | return; |
| 67 | } |
| 68 | } |
| 69 | |
| 70 | if (depth < this.maxDepth) { |
| 71 | for (let link of linksArray) { |
| 72 | if (!paths.some((p) => p.path === link)) { |
| 73 | paths.push({ path: link, depth: depth + 1 }); |
| 74 | } |
| 75 | } |
| 76 | } |
| 77 | } |
| 78 | } catch (e) { |
| 79 | console.debug("Error while crawling page: ", e); |
| 80 | } |
| 81 | |
| 82 | index += batch.length; |
| 83 | } |
| 84 |
nothing calls this directly
no test coverage detected