MCPcopy
hub / github.com/unclecode/crawl4ai / run_old

Method run_old

crawl4ai/web_crawler.back.py:80–202  ·  view source on GitHub ↗
(
        self,
        url: str,
        word_count_threshold=MIN_WORD_THRESHOLD,
        extraction_strategy: ExtractionStrategy = None,
        chunking_strategy: ChunkingStrategy = RegexChunking(),
        bypass_cache: bool = False,
        css_selector: str = None,
        screenshot: bool = False,
        user_agent: str = None,
        verbose=True,
        **kwargs,
    )

Source from the content-addressed store, hash-verified

78 pass
79
80 def run_old(
81 self,
82 url: str,
83 word_count_threshold=MIN_WORD_THRESHOLD,
84 extraction_strategy: ExtractionStrategy = None,
85 chunking_strategy: ChunkingStrategy = RegexChunking(),
86 bypass_cache: bool = False,
87 css_selector: str = None,
88 screenshot: bool = False,
89 user_agent: str = None,
90 verbose=True,
91 **kwargs,
92 ) -> CrawlResult:
93 if user_agent:
94 self.crawler_strategy.update_user_agent(user_agent)
95 extraction_strategy = extraction_strategy or NoExtractionStrategy()
96 extraction_strategy.verbose = verbose
97 # Check if extraction strategy is an instance of ExtractionStrategy if not raise an error
98 if not isinstance(extraction_strategy, ExtractionStrategy):
99 raise ValueError("Unsupported extraction strategy")
100 if not isinstance(chunking_strategy, ChunkingStrategy):
101 raise ValueError("Unsupported chunking strategy")
102
103 # make sure word_count_threshold is not lesser than MIN_WORD_THRESHOLD
104 if word_count_threshold < MIN_WORD_THRESHOLD:
105 word_count_threshold = MIN_WORD_THRESHOLD
106
107 # Check cache first
108 if not bypass_cache and not self.always_by_pass_cache:
109 cached = get_cached_url(url)
110 if cached:
111 return CrawlResult(
112 **{
113 "url": cached[0],
114 "html": cached[1],
115 "cleaned_html": cached[2],
116 "markdown": cached[3],
117 "extracted_content": cached[4],
118 "success": cached[5],
119 "media": json.loads(cached[6] or "{}"),
120 "links": json.loads(cached[7] or "{}"),
121 "metadata": json.loads(cached[8] or "{}"), # "metadata": "{}
122 "screenshot": cached[9],
123 "error_message": "",
124 }
125 )
126
127 # Initialize WebDriver for crawling
128 t = time.time()
129 if kwargs.get("js", None):
130 self.crawler_strategy.js_code = kwargs.get("js")
131 html = self.crawler_strategy.crawl(url)
132 base64_image = None
133 if screenshot:
134 base64_image = self.crawler_strategy.take_screenshot()
135 success = True
136 error_message = ""
137 # Extract content from HTML

Callers

nothing calls this directly

Calls 13

RegexChunking · Class · 0.85
get_cached_url · Function · 0.85
CrawlResult · Class · 0.85
get_content_of_website · Function · 0.85
extract_metadata · Function · 0.85
beautify_html · Function · 0.85
cache_url · Function · 0.85
update_user_agent · Method · 0.45
crawl · Method · 0.45
take_screenshot · Method · 0.45
chunk · Method · 0.45

Tested by

no test coverage detected