hub / github.com/unclecode/crawl4ai / fetch_pages

Method fetch_pages

crawl4ai/web_crawler.py:67–103 · view source on GitHub ↗

(
        self,
        url_models: List[UrlModel],
        provider: str = DEFAULT_PROVIDER,
        api_token: str = None,
        extract_blocks_flag: bool = True,
        word_count_threshold=MIN_WORD_THRESHOLD,
        use_cached_html: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        **kwargs,
    )

Source from the content-addressed store, hash-verified

65	pass
66
67	def fetch_pages(
68	self,
69	url_models: List[UrlModel],
70	provider: str = DEFAULT_PROVIDER,
71	api_token: str = None,
72	extract_blocks_flag: bool = True,
73	word_count_threshold=MIN_WORD_THRESHOLD,
74	use_cached_html: bool = False,
75	css_selector: str = None,
76	screenshot: bool = False,
77	extraction_strategy: ExtractionStrategy = None,
78	chunking_strategy: ChunkingStrategy = RegexChunking(),
79	**kwargs,
80	) -> List[CrawlResult]:
81	extraction_strategy = extraction_strategy or NoExtractionStrategy()
82	def fetch_page_wrapper(url_model, args, *kwargs):
83	return self.fetch_page(url_model, args, *kwargs)
84
85	with ThreadPoolExecutor() as executor:
86	results = list(
87	executor.map(
88	fetch_page_wrapper,
89	url_models,
90	[provider] * len(url_models),
91	[api_token] * len(url_models),
92	[extract_blocks_flag] * len(url_models),
93	[word_count_threshold] * len(url_models),
94	[css_selector] * len(url_models),
95	[screenshot] * len(url_models),
96	[use_cached_html] * len(url_models),
97	[extraction_strategy] * len(url_models),
98	[chunking_strategy] * len(url_models),
99	[kwargs] len(url_models),
100	)
101	)
102
103	return results
104
105	def run(
106	self,

Callers

nothing calls this directly

Calls 2

RegexChunkingClass · 0.85

NoExtractionStrategyClass · 0.85

Tested by

no test coverage detected