MCPcopy
hub / github.com/unclecode/crawl4ai / fetch_pages

Method fetch_pages

crawl4ai/web_crawler.back.py:204–240  ·  view source on GitHub ↗
(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    )

Source from the content-addressed store, hash-verified

202 )
203
def fetch_pages(
    self,
    url_models: List[UrlModel],
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    use_cached_html: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    # NOTE: this default instance is created once, when the function is
    # defined, and is therefore shared by every call that omits the argument.
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> List[CrawlResult]:
    """Fetch several pages concurrently by calling ``self.fetch_page`` per URL model.

    Every entry of ``url_models`` is submitted to a ``ThreadPoolExecutor``;
    all remaining arguments are forwarded unchanged to each
    ``self.fetch_page`` call.

    Args:
        url_models: The URL models to fetch, one ``fetch_page`` call each.
        provider: Forwarded to ``fetch_page``.
        api_token: Forwarded to ``fetch_page``.
        extract_blocks_flag: Forwarded to ``fetch_page``.
        word_count_threshold: Forwarded to ``fetch_page``.
        use_cached_html: Forwarded to ``fetch_page``.
        css_selector: Forwarded to ``fetch_page``.
        screenshot: Forwarded to ``fetch_page``.
        extraction_strategy: Forwarded to ``fetch_page``; a falsy value is
            replaced by ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded to ``fetch_page``.
        **kwargs: Extra keyword arguments forwarded to every
            ``fetch_page`` call.

    Returns:
        The ``fetch_page`` results, in the same order as ``url_models``.
        An empty ``url_models`` yields an empty list.

    Raises:
        Any exception raised by ``fetch_page`` propagates to the caller
        while the results are being collected.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()

    def fetch_page_wrapper(url_model):
        # The shared arguments are bound here instead of being passed to
        # ``executor.map`` as parallel iterables.  The previous form,
        # ``*[kwargs] * len(url_models)``, handed the kwargs *dict* to
        # ``map`` as extra iterables: with no kwargs those iterables were
        # empty, so ``map`` produced no calls at all, and with kwargs the
        # dict keys leaked in as stray positional arguments.
        #
        # NOTE(review): positional order (css_selector, screenshot,
        # use_cached_html) is kept exactly as before and differs from this
        # method's own parameter order -- confirm it matches the
        # ``fetch_page`` signature.
        return self.fetch_page(
            url_model,
            provider,
            api_token,
            extract_blocks_flag,
            word_count_threshold,
            css_selector,
            screenshot,
            use_cached_html,
            extraction_strategy,
            chunking_strategy,
            **kwargs,
        )

    with ThreadPoolExecutor() as executor:
        # ``Executor.map`` yields results in input order; ``list`` drains it
        # before the pool is shut down.
        return list(executor.map(fetch_page_wrapper, url_models))
241
242 def run(
243 self,

Callers

nothing calls this directly

Calls 2

RegexChunking (class) · 0.85

Tested by

no test coverage detected