MCPcopy
hub / github.com/unclecode/crawl4ai / aprocess_html

Method aprocess_html

crawl4ai/async_webcrawler.py:172–274  ·  view source on GitHub ↗
(
        self,
        url: str,
        html: str,
        extracted_content: str,
        word_count_threshold: int,
        extraction_strategy: ExtractionStrategy,
        chunking_strategy: ChunkingStrategy,
        css_selector: str,
        screenshot: str,
        verbose: bool,
        is_cached: bool,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

170
171
172 async def aprocess_html(
173 self,
174 url: str,
175 html: str,
176 extracted_content: str,
177 word_count_threshold: int,
178 extraction_strategy: ExtractionStrategy,
179 chunking_strategy: ChunkingStrategy,
180 css_selector: str,
181 screenshot: str,
182 verbose: bool,
183 is_cached: bool,
184 **kwargs,
185 ) -> CrawlResult:
186 t = time.time()
187 # Extract content from HTML
188 try:
189 t1 = time.time()
190 scrapping_strategy = WebScrappingStrategy()
191 # result = await scrapping_strategy.ascrap(
192 result = scrapping_strategy.scrap(
193 url,
194 html,
195 word_count_threshold=word_count_threshold,
196 css_selector=css_selector,
197 only_text=kwargs.get("only_text", False),
198 image_description_min_word_threshold=kwargs.get(
199 "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD
200 ),
201 **kwargs,
202 )
203 if verbose:
204 print(
205 f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds"
206 )
207
208 if result is None:
209 raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")
210 except InvalidCSSSelectorError as e:
211 raise ValueError(str(e))
212 except Exception as e:
213 raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}")
214
215 cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
216 markdown = sanitize_input_encode(result.get("markdown", ""))
217 fit_markdown = sanitize_input_encode(result.get("fit_markdown", ""))
218 fit_html = sanitize_input_encode(result.get("fit_html", ""))
219 media = result.get("media", [])
220 links = result.get("links", [])
221 metadata = result.get("metadata", {})
222
223 if extracted_content is None and extraction_strategy and chunking_strategy:
224 if verbose:
225 print(
226 f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}"
227 )
228
229 # Check if extraction strategy is type of JsonCssExtractionStrategy

Callers 1

arun · Method · 0.95

Calls 8

scrap · Method · 0.95
sanitize_input_encode · Function · 0.85
CrawlResult · Class · 0.85
format_html · Function · 0.85
acache_url · Method · 0.80
run · Method · 0.45
chunk · Method · 0.45

Tested by

no test coverage detected