MCPcopy
hub / github.com/unclecode/crawl4ai / run

Method run

crawl4ai/web_crawler.py:105–164  ·  view source on GitHub ↗
(
            self,
            url: str,
            word_count_threshold=MIN_WORD_THRESHOLD,
            extraction_strategy: ExtractionStrategy = None,
            chunking_strategy: ChunkingStrategy = RegexChunking(),
            bypass_cache: bool = False,
            css_selector: str = None,
            screenshot: bool = False,
            user_agent: str = None,
            verbose=True,
            **kwargs,
        )

Source from the content-addressed store, hash-verified

103 return results
104
105 def run(
106 self,
107 url: str,
108 word_count_threshold=MIN_WORD_THRESHOLD,
109 extraction_strategy: ExtractionStrategy = None,
110 chunking_strategy: ChunkingStrategy = RegexChunking(),
111 bypass_cache: bool = False,
112 css_selector: str = None,
113 screenshot: bool = False,
114 user_agent: str = None,
115 verbose=True,
116 **kwargs,
117 ) -> CrawlResult:
118 try:
119 extraction_strategy = extraction_strategy or NoExtractionStrategy()
120 extraction_strategy.verbose = verbose
121 if not isinstance(extraction_strategy, ExtractionStrategy):
122 raise ValueError("Unsupported extraction strategy")
123 if not isinstance(chunking_strategy, ChunkingStrategy):
124 raise ValueError("Unsupported chunking strategy")
125
126 word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)
127
128 cached = None
129 screenshot_data = None
130 extracted_content = None
131 if not bypass_cache and not self.always_by_pass_cache:
132 cached = get_cached_url(url)
133
134 if kwargs.get("warmup", True) and not self.ready:
135 return None
136
137 if cached:
138 html = sanitize_input_encode(cached[1])
139 extracted_content = sanitize_input_encode(cached[4])
140 if screenshot:
141 screenshot_data = cached[9]
142 if not screenshot_data:
143 cached = None
144
145 if not cached or not html:
146 if user_agent:
147 self.crawler_strategy.update_user_agent(user_agent)
148 t1 = time.time()
149 html = sanitize_input_encode(self.crawler_strategy.crawl(url, **kwargs))
150 t2 = time.time()
151 if verbose:
152 print(f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds")
153 if screenshot:
154 screenshot_data = self.crawler_strategy.take_screenshot()
155
156
157 crawl_result = self.process_html(url, html, extracted_content, word_count_threshold, extraction_strategy, chunking_strategy, css_selector, screenshot_data, verbose, bool(cached), **kwargs)
158 crawl_result.success = bool(html)
159 return crawl_result
160 except Exception as e:
161 if not hasattr(e, "msg"):
162 e.msg = str(e)

Callers 7

warmup · Method · 0.95
fetch_page · Method · 0.95
using_crawler_hooks · Function · 0.95
load_spacy_model · Function · 0.45
aprocess_html · Method · 0.45
process_html · Method · 0.45

Calls 9

process_html · Method · 0.95
RegexChunking · Class · 0.85
get_cached_url · Function · 0.85
sanitize_input_encode · Function · 0.85
CrawlResult · Class · 0.85
update_user_agent · Method · 0.45
crawl · Method · 0.45
take_screenshot · Method · 0.45

Tested by

no test coverage detected