| 126 | * The loading of the sitemap is performed in the background so that crawling can start before the sitemap is fully loaded. |
| 127 | */ |
| 128 | export class SitemapRequestList implements IRequestList { |
| 129 | /** |
| 130 | * Set of URLs that were returned by `fetchNextRequest()` and not marked as handled yet. |
| 131 | * @internal |
| 132 | */ |
| 133 | inProgress = new Set<string>(); |
| 134 | |
| 135 | /** Set of URLs for which `reclaimRequest()` was called. */ |
| 136 | private reclaimed = new Set<string>(); |
| 137 | |
| 138 | /** |
| 139 | * Map of returned Request objects that have not been marked as handled yet. |
| 140 | * |
| 141 | * We use this to persist custom user fields on the in-progress (or reclaimed) requests. |
| 142 | */ |
| 143 | private requestData = new Map<string, Request>(); |
| 144 | |
| 145 | /** |
| 146 | * Object for keeping track of the sitemap parsing progress. |
| 147 | */ |
| 148 | private sitemapParsingProgress: SitemapParsingProgress = { |
| 149 | /** |
| 150 | * URL of the sitemap that is currently being parsed. `null` if no sitemap is being parsed. |
| 151 | */ |
| 152 | inProgressSitemapUrl: null, |
| 153 | /** |
| 154 | * Buffer for URLs from the currently parsed sitemap. Used for tracking partially loaded sitemaps across migrations. |
| 155 | */ |
| 156 | inProgressEntries: new Set<string>(), |
| 157 | /** |
| 158 | * Set of sitemap URLs that have not been parsed yet. If the set is empty and `inProgressSitemapUrl` is `null`, the sitemap loading is finished. |
| 159 | */ |
| 160 | pendingSitemapUrls: new Set<string>(), |
| 161 | }; |
| 162 | |
| 163 | /** |
| 164 | * Object stream of URLs parsed from the sitemaps. |
| 165 | * Using `highWaterMark`, this can manage the speed of the sitemap loading. |
| 166 | * |
| 167 | * Fetch the next URL to be processed using `fetchNextRequest()`. |
| 168 | */ |
| 169 | private urlQueueStream: Transform; |
| 170 | |
| 171 | /** |
| 172 | * Indicates whether the request list sitemap loading was aborted. |
| 173 | * |
| 174 | * If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs. |
| 175 | * The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded. |
| 176 | * |
| 177 | * If the loading is aborted and all the requests are handled, `isFinished()` will return `true`. |
| 178 | */ |
| 179 | private abortLoading = false; |
| 180 | |
| 181 | /** Number of URLs that were marked as handled */ |
| 182 | private handledUrlCount = 0; |
| 183 | |
| 184 | private persistStateKey?: string; |
| 185 |
nothing calls this directly
no outgoing calls
no test coverage detected
searching dependent graphs…