MCPcopy
hub / github.com/unclecode/crawl4ai / get_content_of_website

Function get_content_of_website

crawl4ai/utils.py:269–490  ·  view source on GitHub ↗
(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs)

Source from the content-addressed store, hash-verified

267 # return soup
268
269 def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs):
270 try:
271 if not html:
272 return None
273 # Parse HTML content with BeautifulSoup
274 soup = BeautifulSoup(html, 'html.parser')
275
276 # Get the content within the <body> tag
277 body = soup.body
278
279 # If css_selector is provided, extract content based on the selector
280 if css_selector:
281 selected_elements = body.select(css_selector)
282 if not selected_elements:
283 raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}")
284 div_tag = soup.new_tag('div')
285 for el in selected_elements:
286 div_tag.append(el)
287 body = div_tag
288
289 links = {
290 'internal': [],
291 'external': []
292 }
293
294 # Extract all internal and external links
295 for a in body.find_all('a', href=True):
296 href = a['href']
297 url_base = url.split('/')[2]
298 if href.startswith('http') and url_base not in href:
299 links['external'].append({
300 'href': href,
301 'text': a.get_text()
302 })
303 else:
304 links['internal'].append(
305 {
306 'href': href,
307 'text': a.get_text()
308 }
309 )
310
311 # Remove script, style, and other tags that don't carry useful content from body
312 for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']):
313 tag.decompose()
314
315 # Remove all attributes from remaining tags in body, except for img tags
316 for tag in body.find_all():
317 if tag.name != 'img':
318 tag.attrs = {}
319
320 # Extract all img tags into [{src: '', alt: ''}]
321 media = {
322 'images': [],
323 'videos': [],
324 'audios': []
325 }
326 for img in body.find_all('img'):

Callers 2

run_old · Method · 0.85
process_html · Method · 0.85

Calls 12

replace_inline_tags · Function · 0.85
remove_small_text_tags · Function · 0.85
remove_empty_tags · Function · 0.85
flatten_nested_elements · Function · 0.85
sanitize_html · Function · 0.85
CustomHTML2Text · Class · 0.85
extract_metadata · Function · 0.85
handle · Method · 0.80
extract · Method · 0.45

Tested by

no test coverage detected

Used in the wild — real call sites across dependent graphs

searching dependent graphs…