Get page text as formatted Markdown
(self)
| 387 | return (word_count >= 5 and (has_punctuation or is_long_enough)) |
| 388 | |
def get_text(self) -> str | None:
    """Get page text as formatted Markdown.

    Parses the driver's current page source, removes non-content tags
    (script, style, noscript, meta, link), converts the remaining HTML to
    Markdown and keeps only the lines accepted by ``self.is_sentence``.
    Markdown image syntax in the result is rewritten to ``[IMAGE: alt]``.

    Returns:
        The kept lines separated by blank lines and wrapped between
        "[Start of page]" and "[End of page]" markers, truncated to the
        first 32768 characters (on long pages the end marker is therefore
        cut off), or None if any step raises.
    """
    try:
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        for element in soup(['script', 'style', 'noscript', 'meta', 'link']):
            element.decompose()
        markdown_converter = markdownify.MarkdownConverter(
            heading_style="ATX",
            strip=['a'],
            autolinks=False,
            bullets='•',
            strong_em_symbol='*',
            default_title=False,
        )
        # html.parser does not synthesise a <body>. For a body-less document
        # soup.body is None, and str(None) would feed the literal text "None"
        # to the converter and silently drop the page content, so fall back
        # to the whole parsed document.
        root = soup.body if soup.body is not None else soup
        markdown_text = markdown_converter.convert(str(root))
        # Keep sentence-like lines only, collapsing internal whitespace runs.
        lines = [
            ' '.join(stripped.split())
            for stripped in (line.strip() for line in markdown_text.splitlines())
            if stripped and self.is_sentence(stripped)
        ]
        result = "[Start of page]\n\n" + "\n\n".join(lines) + "\n\n[End of page]"
        result = re.sub(r'!\[(.*?)\]\(.*?\)', r'[IMAGE: \1]', result)
        self.logger.info(f"Extracted text: {result[:100]}...")
        # Logged length is the full length, before the truncation below.
        self.logger.info(f"Extracted text length: {len(result)}")
        return result[:32768]
    except Exception as e:
        # Best-effort boundary: report the failure and signal it with None.
        self.logger.error(f"Error getting text: {str(e)}")
        return None
| 418 | |
| 419 | def clean_url(self, url:str) -> str: |
| 420 | """Clean URL to keep only the part needed for navigation to the page""" |
no test coverage detected