MCPcopy
hub / github.com/Fosowl/agenticSeek / get_text

Method get_text

sources/browser.py:389–417  ·  view source on GitHub ↗

Get page text as formatted Markdown

(self)

Source from the content-addressed store, hash-verified

387 return (word_count >= 5 and (has_punctuation or is_long_enough))
388
def get_text(self) -> str | None:
    """Get page text as formatted Markdown.

    Parses ``self.driver.page_source`` with BeautifulSoup, removes tags that
    carry no readable content, converts the remaining markup to Markdown and
    keeps only the lines accepted by ``self.is_sentence``.

    Returns:
        The extracted text wrapped in ``[Start of page]`` / ``[End of page]``
        markers and truncated to 32768 characters, or ``None`` if any step
        raised an exception (the error is logged, not re-raised).
    """
    try:
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        # Remove elements that never contribute visible page text.
        for element in soup(['script', 'style', 'noscript', 'meta', 'link']):
            element.decompose()
        markdown_converter = markdownify.MarkdownConverter(
            heading_style="ATX",
            strip=['a'],
            autolinks=False,
            bullets='•',
            strong_em_symbol='*',
            default_title=False,
        )
        # soup.body is None when the document has no <body> (e.g. a raw
        # XML/SVG document). str(None) would hand the literal text "None"
        # to the converter, so fall back to the whole parsed document.
        root = soup.body if soup.body is not None else soup
        markdown_text = markdown_converter.convert(str(root))
        # Keep only sentence-like lines, with internal whitespace collapsed
        # to single spaces.
        lines = [
            ' '.join(stripped.split())
            for line in markdown_text.splitlines()
            if (stripped := line.strip()) and self.is_sentence(stripped)
        ]
        result = "[Start of page]\n\n" + "\n\n".join(lines) + "\n\n[End of page]"
        # Rewrite Markdown image syntax ![alt](url) as "[IMAGE: alt]".
        result = re.sub(r'!\[(.*?)\]\(.*?\)', r'[IMAGE: \1]', result)
        self.logger.info(f"Extracted text: {result[:100]}...")
        self.logger.info(f"Extracted text length: {len(result)}")
        # Hard cap on the returned size; on long pages this cut can remove
        # the trailing "[End of page]" marker.
        return result[:32768]
    except Exception as e:
        # Best-effort extraction: report the failure and signal it with None.
        self.logger.error(f"Error getting text: {str(e)}")
        return None
418
419 def clean_url(self, url:str) -> str:
420 """Clean URL to keep only the part needed for navigation to the page"""

Callers 1

get_page_text · Method · 0.80

Calls 3

is_sentence · Method · 0.95
info · Method · 0.80
error · Method · 0.80

Tested by

no test coverage detected