Method get_text

sources/browser.py:389–417 · view source on GitHub ↗

Get page text as formatted Markdown

(self)

Source from the content-addressed store, hash-verified

387	return (word_count >= 5 and (has_punctuation or is_long_enough))
388
def get_text(self) -> str | None:
    """Get page text as formatted Markdown.

    Parses the driver's current page source, removes non-content tags,
    converts the remaining HTML to Markdown, and keeps only the lines that
    ``self.is_sentence`` accepts. Markdown image markup is rewritten to an
    ``[IMAGE: alt]`` placeholder.

    Returns:
        The extracted text wrapped in ``[Start of page]`` / ``[End of page]``
        markers and cut to at most 32768 characters (so the end marker may
        be cut off on long pages), or ``None`` if any step raises.
    """
    try:
        soup = BeautifulSoup(self.driver.page_source, 'html.parser')
        # Drop tags that carry no readable content.
        for element in soup(['script', 'style', 'noscript', 'meta', 'link']):
            element.decompose()
        markdown_converter = markdownify.MarkdownConverter(
            heading_style="ATX",
            strip=['a'],
            autolinks=False,
            bullets='•',
            strong_em_symbol='*',
            default_title=False,
        )
        # 'html.parser' does not synthesize a <body>; without this fallback
        # str(None) would feed the literal text "None" to the converter.
        root = soup.body if soup.body is not None else soup
        markdown_text = markdown_converter.convert(str(root))
        lines = []
        for line in markdown_text.splitlines():
            stripped = line.strip()
            if stripped and self.is_sentence(stripped):
                # Collapse internal runs of whitespace to single spaces.
                lines.append(' '.join(stripped.split()))
        result = "[Start of page]\n\n" + "\n\n".join(lines) + "\n\n[End of page]"
        # Non-greedy '.*?' so alt text and URL of any length match; the
        # previous '.?' only matched images whose alt/URL were <= 1 char.
        result = re.sub(r'!\[(.*?)\]\(.*?\)', r'[IMAGE: \1]', result)
        self.logger.info(f"Extracted text: {result[:100]}...")
        self.logger.info(f"Extracted text length: {len(result)}")
        return result[:32768]
    except Exception as e:
        # Best-effort extraction: log the failure and signal it with None.
        self.logger.error(f"Error getting text: {e}")
        return None
418
419	def clean_url(self, url:str) -> str:
420	"""Clean URL to keep only the part needed for navigation to the page"""

get_page_textMethod · 0.80

is_sentenceMethod · 0.95

infoMethod · 0.80

errorMethod · 0.80

no test coverage detected