hub / github.com/unclecode/crawl4ai / aprocess_html

Method aprocess_html

crawl4ai/async_webcrawler.py:172–274 · view source on GitHub ↗

(
        self,
        url: str,
        html: str,
        extracted_content: str,
        word_count_threshold: int,
        extraction_strategy: ExtractionStrategy,
        chunking_strategy: ChunkingStrategy,
        css_selector: str,
        screenshot: str,
        verbose: bool,
        is_cached: bool,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

170
171
172	async def aprocess_html(
173	self,
174	url: str,
175	html: str,
176	extracted_content: str,
177	word_count_threshold: int,
178	extraction_strategy: ExtractionStrategy,
179	chunking_strategy: ChunkingStrategy,
180	css_selector: str,
181	screenshot: str,
182	verbose: bool,
183	is_cached: bool,
184	**kwargs,
185	) -> CrawlResult:
186	t = time.time()
187	# Extract content from HTML
188	try:
189	t1 = time.time()
190	scrapping_strategy = WebScrappingStrategy()
191	# result = await scrapping_strategy.ascrap(
192	result = scrapping_strategy.scrap(
193	url,
194	html,
195	word_count_threshold=word_count_threshold,
196	css_selector=css_selector,
197	only_text=kwargs.get("only_text", False),
198	image_description_min_word_threshold=kwargs.get(
199	"image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
200	),
201	**kwargs,
202	)
203	if verbose:
204	print(
205	f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
206	)
207
208	if result is None:
209	raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
210	except InvalidCSSSelectorError as e:
211	raise ValueError(str(e))
212	except Exception as e:
213	raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
214
215	cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
216	markdown = sanitize_input_encode(result.get("markdown", ""))
217	fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
218	fit_html = sanitize_input_encode(result.get("fit_html", ""))
219	media = result.get("media", [])
220	links = result.get("links", [])
221	metadata = result.get("metadata", {})
222
223	if extracted_content is None and extraction_strategy and chunking_strategy:
224	if verbose:
225	print(
226	f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
227	)
228
229	# Check if extraction strategy is type of JsonCssExtractionStrategy

Callers 1

arun · Method · 0.95

Calls 8

scrap · Method · 0.95

WebScrappingStrategy · Class · 0.85

sanitize_input_encode · Function · 0.85

CrawlResult · Class · 0.85

format_html · Function · 0.85

acache_url · Method · 0.80

run · Method · 0.45

chunk · Method · 0.45

Tested by

no test coverage detected