hub / github.com/unclecode/crawl4ai / run_old

Method run_old

crawl4ai/web_crawler.back.py:80–202 · view source on GitHub ↗

(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

78	pass
79
80	def run_old(
81	self,
82	url: str,
83	word_count_threshold=MIN_WORD_THRESHOLD,
84	extraction_strategy: ExtractionStrategy = None,
85	chunking_strategy: ChunkingStrategy = RegexChunking(),
86	bypass_cache: bool = False,
87	css_selector: str = None,
88	screenshot: bool = False,
89	user_agent: str = None,
90	verbose=True,
91	**kwargs,
92	) -> CrawlResult:
93	if user_agent:
94	self.crawler_strategy.update_user_agent(user_agent)
95	extraction_strategy = extraction_strategy or NoExtractionStrategy()
96	extraction_strategy.verbose = verbose
97	# Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
98	if not isinstance(extraction_strategy, ExtractionStrategy):
99	raise ValueError("Unsupported extraction strategy")
100	if not isinstance(chunking_strategy, ChunkingStrategy):
101	raise ValueError("Unsupported chunking strategy")
102
103	# make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
104	if word_count_threshold < MIN_WORD_THRESHOLD:
105	word_count_threshold = MIN_WORD_THRESHOLD
106
107	# Check cache first
108	if not bypass_cache and not self.always_by_pass_cache:
109	cached = get_cached_url(url)
110	if cached:
111	return CrawlResult(
112	**{
113	"url": cached[0],
114	"html": cached[1],
115	"cleaned_html": cached[2],
116	"markdown": cached[3],
117	"extracted_content": cached[4],
118	"success": cached[5],
119	"media": json.loads(cached[6] or "{}"),
120	"links": json.loads(cached[7] or "{}"),
121	"metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
122	"screenshot": cached[9],
123	"error_message": "",
124	}
125	)
126
127	# Initialize WebDriver for crawling
128	t = time.time()
129	if kwargs.get("js", None):
130	self.crawler_strategy.js_code = kwargs.get("js")
131	html = self.crawler_strategy.crawl(url)
132	base64_image = None
133	if screenshot:
134	base64_image = self.crawler_strategy.take_screenshot()
135	success = True
136	error_message = ""
137	# Extract content from HTML

Callers

nothing calls this directly

Calls 13

RegexChunking (Class) · 0.85

NoExtractionStrategy (Class) · 0.85

get_cached_url (Function) · 0.85

CrawlResult (Class) · 0.85

get_content_of_website (Function) · 0.85

extract_metadata (Function) · 0.85

beautify_html (Function) · 0.85

cache_url (Function) · 0.85

update_user_agent (Method) · 0.45

crawl (Method) · 0.45

take_screenshot (Method) · 0.45

chunk (Method) · 0.45

Tested by

no test coverage detected