(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: str,
verbose: bool,
is_cached: bool,
**kwargs,
)
| 170 | |
| 171 | |
| 172 | async def aprocess_html( |
| 173 | self, |
| 174 | url: str, |
| 175 | html: str, |
| 176 | extracted_content: str, |
| 177 | word_count_threshold: int, |
| 178 | extraction_strategy: ExtractionStrategy, |
| 179 | chunking_strategy: ChunkingStrategy, |
| 180 | css_selector: str, |
| 181 | screenshot: str, |
| 182 | verbose: bool, |
| 183 | is_cached: bool, |
| 184 | **kwargs, |
| 185 | ) -> CrawlResult: |
| 186 | t = time.time() |
| 187 | # Extract content from HTML |
| 188 | try: |
| 189 | t1 = time.time() |
| 190 | scrapping_strategy = WebScrappingStrategy() |
| 191 | # result = await scrapping_strategy.ascrap( |
| 192 | result = scrapping_strategy.scrap( |
| 193 | url, |
| 194 | html, |
| 195 | word_count_threshold=word_count_threshold, |
| 196 | css_selector=css_selector, |
| 197 | only_text=kwargs.get("only_text", False), |
| 198 | image_description_min_word_threshold=kwargs.get( |
| 199 | "image_description_min_word_threshold", IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD |
| 200 | ), |
| 201 | **kwargs, |
| 202 | ) |
| 203 | if verbose: |
| 204 | print( |
| 205 | f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds" |
| 206 | ) |
| 207 | |
| 208 | if result is None: |
| 209 | raise ValueError(f"Process HTML, Failed to extract content from the website: {url}") |
| 210 | except InvalidCSSSelectorError as e: |
| 211 | raise ValueError(str(e)) |
| 212 | except Exception as e: |
| 213 | raise ValueError(f"Process HTML, Failed to extract content from the website: {url}, error: {str(e)}") |
| 214 | |
| 215 | cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) |
| 216 | markdown = sanitize_input_encode(result.get("markdown", "")) |
| 217 | fit_markdown = sanitize_input_encode(result.get("fit_markdown", "")) |
| 218 | fit_html = sanitize_input_encode(result.get("fit_html", "")) |
| 219 | media = result.get("media", []) |
| 220 | links = result.get("links", []) |
| 221 | metadata = result.get("metadata", {}) |
| 222 | |
| 223 | if extracted_content is None and extraction_strategy and chunking_strategy: |
| 224 | if verbose: |
| 225 | print( |
| 226 | f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {self.__class__.__name__}" |
| 227 | ) |
| 228 | |
| 229 | # Check if extraction strategy is type of JsonCssExtractionStrategy |
no test coverage detected