MCPcopy
hub / github.com/unclecode/crawl4ai / fetch_pages

Method fetch_pages

crawl4ai/web_crawler.py:67–103  ·  view source on GitHub ↗
(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    )

Source from the content-addressed store, hash-verified

65 pass
66
def fetch_pages(
    self,
    url_models: List[UrlModel],
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    use_cached_html: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    # NOTE: this default instance is created once, when the function is
    # defined, and is therefore shared by every call that omits the argument.
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> List[CrawlResult]:
    """Fetch several pages concurrently by calling ``self.fetch_page`` per URL.

    Every entry of ``url_models`` is submitted to a ``ThreadPoolExecutor``;
    all entries share the same option values.

    Args:
        url_models: The URL models to fetch, one ``fetch_page`` call each.
        provider: Forwarded unchanged to ``fetch_page``.
        api_token: Forwarded unchanged to ``fetch_page``.
        extract_blocks_flag: Forwarded unchanged to ``fetch_page``.
        word_count_threshold: Forwarded unchanged to ``fetch_page``.
        use_cached_html: Forwarded unchanged to ``fetch_page``.
        css_selector: Forwarded unchanged to ``fetch_page``.
        screenshot: Forwarded unchanged to ``fetch_page``.
        extraction_strategy: Strategy forwarded to ``fetch_page``; a falsy
            value is replaced by ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded unchanged to ``fetch_page``.
        **kwargs: Extra keyword arguments forwarded to every ``fetch_page``
            call as keyword arguments.

    Returns:
        One result per entry of ``url_models``, in the same order as the
        input (``Executor.map`` preserves input order).

    Raises:
        Any exception raised by a ``fetch_page`` call is re-raised here when
        its result is collected.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()

    def fetch_page_wrapper(url_model):
        # The shared options are bound through the closure, so only the URL
        # models need to be mapped over. ``kwargs`` must be forwarded as
        # keyword arguments: handing the dict to ``executor.map`` as an extra
        # iterable would iterate its keys and, when it is empty, make ``map``
        # stop immediately and yield no results at all.
        #
        # NOTE(review): the positional order used here (css_selector,
        # screenshot, use_cached_html) differs from this method's own
        # parameter order; it is presumably fetch_page's order -- confirm
        # against fetch_page's signature.
        return self.fetch_page(
            url_model,
            provider,
            api_token,
            extract_blocks_flag,
            word_count_threshold,
            css_selector,
            screenshot,
            use_cached_html,
            extraction_strategy,
            chunking_strategy,
            **kwargs,
        )

    with ThreadPoolExecutor() as executor:
        results = list(executor.map(fetch_page_wrapper, url_models))

    return results
104
105 def run(
106 self,

Callers

nothing calls this directly

Calls 2

RegexChunking · Class · 0.85

Tested by

no test coverage detected