(
self,
url: str,
html: str,
extracted_content: str,
word_count_threshold: int,
extraction_strategy: ExtractionStrategy,
chunking_strategy: ChunkingStrategy,
css_selector: str,
screenshot: bool,
verbose: bool,
is_cached: bool,
**kwargs,
)
| 164 | return CrawlResult(url=url, html="", success=False, error_message=e.msg) |
| 165 | |
| 166 | def process_html( |
| 167 | self, |
| 168 | url: str, |
| 169 | html: str, |
| 170 | extracted_content: str, |
| 171 | word_count_threshold: int, |
| 172 | extraction_strategy: ExtractionStrategy, |
| 173 | chunking_strategy: ChunkingStrategy, |
| 174 | css_selector: str, |
| 175 | screenshot: bool, |
| 176 | verbose: bool, |
| 177 | is_cached: bool, |
| 178 | **kwargs, |
| 179 | ) -> CrawlResult: |
| 180 | t = time.time() |
| 181 | # Extract content from HTML |
| 182 | try: |
| 183 | t1 = time.time() |
| 184 | result = get_content_of_website_optimized(url, html, word_count_threshold, css_selector=css_selector, only_text=kwargs.get("only_text", False)) |
| 185 | if verbose: |
| 186 | print(f"[LOG] 🚀 Content extracted for {url}, success: True, time taken: {time.time() - t1:.2f} seconds") |
| 187 | |
| 188 | if result is None: |
| 189 | raise ValueError(f"Failed to extract content from the website: {url}") |
| 190 | except InvalidCSSSelectorError as e: |
| 191 | raise ValueError(str(e)) |
| 192 | |
| 193 | cleaned_html = sanitize_input_encode(result.get("cleaned_html", "")) |
| 194 | markdown = sanitize_input_encode(result.get("markdown", "")) |
| 195 | media = result.get("media", []) |
| 196 | links = result.get("links", []) |
| 197 | metadata = result.get("metadata", {}) |
| 198 | |
| 199 | if extracted_content is None: |
| 200 | if verbose: |
| 201 | print(f"[LOG] 🔥 Extracting semantic blocks for {url}, Strategy: {extraction_strategy.name}") |
| 202 | |
| 203 | sections = chunking_strategy.chunk(markdown) |
| 204 | extracted_content = extraction_strategy.run(url, sections) |
| 205 | extracted_content = json.dumps(extracted_content, indent=4, default=str, ensure_ascii=False) |
| 206 | |
| 207 | if verbose: |
| 208 | print(f"[LOG] 🚀 Extraction done for {url}, time taken: {time.time() - t:.2f} seconds.") |
| 209 | |
| 210 | screenshot = None if not screenshot else screenshot |
| 211 | |
| 212 | if not is_cached: |
| 213 | cache_url( |
| 214 | url, |
| 215 | html, |
| 216 | cleaned_html, |
| 217 | markdown, |
| 218 | extracted_content, |
| 219 | True, |
| 220 | json.dumps(media), |
| 221 | json.dumps(links), |
| 222 | json.dumps(metadata), |
| 223 | screenshot=screenshot, |
no test coverage detected