hub / github.com/unclecode/crawl4ai / fetch_pages

Method fetch_pages

crawl4ai/web_crawler.back.py:204–240 · view source on GitHub ↗

(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    )

Source from the content-addressed store, hash-verified

202	)
203
def fetch_pages(
    self,
    url_models: List[UrlModel],
    provider: str = DEFAULT_PROVIDER,
    api_token: str = None,
    extract_blocks_flag: bool = True,
    word_count_threshold=MIN_WORD_THRESHOLD,
    use_cached_html: bool = False,
    css_selector: str = None,
    screenshot: bool = False,
    extraction_strategy: ExtractionStrategy = None,
    # NOTE: this default instance is created once, when the method is defined,
    # and is therefore shared by every call that does not pass its own.
    chunking_strategy: ChunkingStrategy = RegexChunking(),
    **kwargs,
) -> List[CrawlResult]:
    """Fetch every entry of ``url_models`` concurrently via ``self.fetch_page``.

    Each URL model is handed to ``self.fetch_page`` on a thread-pool worker,
    together with the same shared settings for all of them.

    Args:
        url_models: The URL models to fetch; one ``fetch_page`` call is made
            per element.
        provider: Forwarded unchanged to ``fetch_page``.
        api_token: Forwarded unchanged to ``fetch_page``.
        extract_blocks_flag: Forwarded unchanged to ``fetch_page``.
        word_count_threshold: Forwarded unchanged to ``fetch_page``.
        use_cached_html: Forwarded unchanged to ``fetch_page``.
        css_selector: Forwarded unchanged to ``fetch_page``.
        screenshot: Forwarded unchanged to ``fetch_page``.
        extraction_strategy: Forwarded to ``fetch_page``; a falsy value is
            replaced by a fresh ``NoExtractionStrategy()``.
        chunking_strategy: Forwarded unchanged to ``fetch_page``.
        **kwargs: Extra keyword arguments, forwarded to every ``fetch_page``
            call as keyword arguments.

    Returns:
        The values returned by ``self.fetch_page``, in the same order as
        ``url_models``. An empty ``url_models`` yields an empty list.

    Raises:
        Whatever ``self.fetch_page`` raises: the first failing call (in input
        order) propagates when its result is collected.
    """
    extraction_strategy = extraction_strategy or NoExtractionStrategy()

    def fetch_page_wrapper(url_model):
        # The shared settings are captured by closure instead of being
        # replicated into one list per argument for executor.map.
        # NOTE(review): the positional order below (css_selector, screenshot,
        # use_cached_html) is kept exactly as the original call site had it;
        # it differs from this method's own parameter order, so confirm it
        # matches the signature of fetch_page.
        return self.fetch_page(
            url_model,
            provider,
            api_token,
            extract_blocks_flag,
            word_count_threshold,
            css_selector,
            screenshot,
            use_cached_html,
            extraction_strategy,
            chunking_strategy,
            # Unpacked so the extras arrive as keyword arguments rather than
            # as a single trailing positional dict.
            **kwargs,
        )

    with ThreadPoolExecutor() as executor:
        # Executor.map yields results in input order, regardless of which
        # worker finishes first.
        return list(executor.map(fetch_page_wrapper, url_models))
241
242	def run(
243	self,

Callers

nothing calls this directly

Calls 2

RegexChunking (class) · 0.85

NoExtractionStrategy (class) · 0.85

Tested by

no test coverage detected