MCPcopy
hub / github.com/apify/crawlee / SitemapRequestList

Class SitemapRequestList

packages/core/src/storages/sitemap_request_list.ts:128–624  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

126 * The loading of the sitemap is performed in the background so that crawling can start before the sitemap is fully loaded.
127 */
128 export class SitemapRequestList implements IRequestList {
129 /**
130 * Set of URLs that were returned by `fetchNextRequest()` and not marked as handled yet.
131 * @internal
132 */
133 inProgress = new Set<string>();
134
135 /** Set of URLs for which `reclaimRequest()` was called. */
136 private reclaimed = new Set<string>();
137
138 /**
139 * Map of returned Request objects that have not been marked as handled yet.
140 *
141 * We use this to persist custom user fields on the in-progress (or reclaimed) requests.
142 */
143 private requestData = new Map<string, Request>();
144
145 /**
146 * Object for keeping track of the sitemap parsing progress.
147 */
148 private sitemapParsingProgress: SitemapParsingProgress = {
149 /**
150 * URL of the sitemap that is currently being parsed. `null` if no sitemap is being parsed.
151 */
152 inProgressSitemapUrl: null,
153 /**
154 * Buffer for URLs from the currently parsed sitemap. Used for tracking partially loaded sitemaps across migrations.
155 */
156 inProgressEntries: new Set<string>(),
157 /**
158 * Set of sitemap URLs that have not been parsed yet. If the set is empty and `inProgressSitemapUrl` is `null`, the sitemap loading is finished.
159 */
160 pendingSitemapUrls: new Set<string>(),
161 };
162
163 /**
164 * Object stream of URLs parsed from the sitemaps.
165 * Using `highWaterMark`, this can manage the speed of the sitemap loading.
166 *
167 * Fetch the next URL to be processed using `fetchNextRequest()`.
168 */
169 private urlQueueStream: Transform;
170
171 /**
172 * Indicates whether the request list sitemap loading was aborted.
173 *
174 * If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs.
175 * The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded.
176 *
177 * If the loading is aborted and all the requests are handled, `isFinished()` will return `true`.
178 */
179 private abortLoading = false;
180
181 /** Number of URLs that were marked as handled */
182 private handledUrlCount = 0;
183
184 private persistStateKey?: string;
185

Callers

nothing calls this directly

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…