(
self,
url: str,
word_count_threshold=MIN_WORD_THRESHOLD,
extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
bypass_cache: bool = False,
css_selector: str = None,
screenshot: bool = False,
user_agent: str = None,
verbose=True,
**kwargs,
)
| 78 | pass |
| 79 | |
| 80 | def run_old( |
| 81 | self, |
| 82 | url: str, |
| 83 | word_count_threshold=MIN_WORD_THRESHOLD, |
| 84 | extraction_strategy: ExtractionStrategy = None, |
| 85 | chunking_strategy: ChunkingStrategy = RegexChunking(), |
| 86 | bypass_cache: bool = False, |
| 87 | css_selector: str = None, |
| 88 | screenshot: bool = False, |
| 89 | user_agent: str = None, |
| 90 | verbose=True, |
| 91 | **kwargs, |
| 92 | ) -> CrawlResult: |
| 93 | if user_agent: |
| 94 | self.crawler_strategy.update_user_agent(user_agent) |
| 95 | extraction_strategy = extraction_strategy or NoExtractionStrategy() |
| 96 | extraction_strategy.verbose = verbose |
| 97 | # Check that extraction_strategy is an instance of ExtractionStrategy; if not, raise an error |
| 98 | if not isinstance(extraction_strategy, ExtractionStrategy): |
| 99 | raise ValueError("Unsupported extraction strategy") |
| 100 | if not isinstance(chunking_strategy, ChunkingStrategy): |
| 101 | raise ValueError("Unsupported chunking strategy") |
| 102 | |
| 103 | # Make sure word_count_threshold is not less than MIN_WORD_THRESHOLD |
| 104 | if word_count_threshold < MIN_WORD_THRESHOLD: |
| 105 | word_count_threshold = MIN_WORD_THRESHOLD |
| 106 | |
| 107 | # Check cache first |
| 108 | if not bypass_cache and not self.always_by_pass_cache: |
| 109 | cached = get_cached_url(url) |
| 110 | if cached: |
| 111 | return CrawlResult( |
| 112 | **{ |
| 113 | "url": cached[0], |
| 114 | "html": cached[1], |
| 115 | "cleaned_html": cached[2], |
| 116 | "markdown": cached[3], |
| 117 | "extracted_content": cached[4], |
| 118 | "success": cached[5], |
| 119 | "media": json.loads(cached[6] or "{}"), |
| 120 | "links": json.loads(cached[7] or "{}"), |
| 121 | "metadata": json.loads(cached[8] or "{}"), # "metadata": "{} |
| 122 | "screenshot": cached[9], |
| 123 | "error_message": "", |
| 124 | } |
| 125 | ) |
| 126 | |
| 127 | # Initialize WebDriver for crawling |
| 128 | t = time.time() |
| 129 | if kwargs.get("js", None): |
| 130 | self.crawler_strategy.js_code = kwargs.get("js") |
| 131 | html = self.crawler_strategy.crawl(url) |
| 132 | base64_image = None |
| 133 | if screenshot: |
| 134 | base64_image = self.crawler_strategy.take_screenshot() |
| 135 | success = True |
| 136 | error_message = "" |
| 137 | # Extract content from HTML |
nothing calls this directly
no test coverage detected