(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs)
| 267 | # return soup |
| 268 | |
| 269 | def get_content_of_website(url, html, word_count_threshold = MIN_WORD_THRESHOLD, css_selector = None, **kwargs): |
| 270 | try: |
| 271 | if not html: |
| 272 | return None |
| 273 | # Parse HTML content with BeautifulSoup |
| 274 | soup = BeautifulSoup(html, 'html.parser') |
| 275 | |
| 276 | # Get the content within the <body> tag |
| 277 | body = soup.body |
| 278 | |
| 279 | # If css_selector is provided, extract content based on the selector |
| 280 | if css_selector: |
| 281 | selected_elements = body.select(css_selector) |
| 282 | if not selected_elements: |
| 283 | raise InvalidCSSSelectorError(f"Invalid CSS selector , No elements found for CSS selector: {css_selector}") |
| 284 | div_tag = soup.new_tag('div') |
| 285 | for el in selected_elements: |
| 286 | div_tag.append(el) |
| 287 | body = div_tag |
| 288 | |
| 289 | links = { |
| 290 | 'internal': [], |
| 291 | 'external': [] |
| 292 | } |
| 293 | |
| 294 | # Extract all internal and external links |
| 295 | for a in body.find_all('a', href=True): |
| 296 | href = a['href'] |
| 297 | url_base = url.split('/')[2] |
| 298 | if href.startswith('http') and url_base not in href: |
| 299 | links['external'].append({ |
| 300 | 'href': href, |
| 301 | 'text': a.get_text() |
| 302 | }) |
| 303 | else: |
| 304 | links['internal'].append( |
| 305 | { |
| 306 | 'href': href, |
| 307 | 'text': a.get_text() |
| 308 | } |
| 309 | ) |
| 310 | |
| 311 | # Remove script, style, and other tags that don't carry useful content from body |
| 312 | for tag in body.find_all(['script', 'style', 'link', 'meta', 'noscript']): |
| 313 | tag.decompose() |
| 314 | |
| 315 | # Remove all attributes from remaining tags in body, except for img tags |
| 316 | for tag in body.find_all(): |
| 317 | if tag.name != 'img': |
| 318 | tag.attrs = {} |
| 319 | |
| 320 | # Extract all img tags into [{src: '', alt: ''}] |
| 321 | media = { |
| 322 | 'images': [], |
| 323 | 'videos': [], |
| 324 | 'audios': [] |
| 325 | } |
| 326 | for img in body.find_all('img'): |
no test coverage detected
searching dependent graphs…