hub / github.com/unclecode/crawl4ai / TopicExtractionStrategy

Class TopicExtractionStrategy

crawl4ai/extraction_strategy.py:513–579 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

511	return self.extract(url, self.DEL.join(sections), **kwargs)
512
513	class TopicExtractionStrategy(ExtractionStrategy):
def __init__(self, num_keywords: int = 3, **kwargs):
    """
    Set up the strategy: store the keyword count and build a TextTiling tokenizer.

    :param num_keywords: How many keywords to report for each topic segment.
    :param kwargs: Accepted for signature compatibility; not forwarded to the
        base initializer and not otherwise read here.
    """
    # Imported lazily so nltk is only required when this strategy is instantiated.
    from nltk.tokenize import TextTilingTokenizer

    super().__init__()
    self.tokenizer = TextTilingTokenizer()
    self.num_keywords = num_keywords
524
def extract_keywords(self, text: str) -> List[str]:
    """
    Return the most frequent tokens of a text segment.

    Tokens are taken exactly as produced by ``nltk.word_tokenize``; no
    filtering or normalisation is applied in this method.

    :param text: The text segment to analyse.
    :return: At most ``self.num_keywords`` tokens, in the order given by
        ``FreqDist.most_common``.
    """
    import nltk

    # Count every token, then keep only the token half of each (token, count) pair.
    token_counts = nltk.FreqDist(nltk.word_tokenize(text))
    top_entries = token_counts.most_common(self.num_keywords)
    return [token for token, _count in top_entries]
539
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Split the content into segments and attach frequency-based keywords to each.

    Segmentation is a plain split of ``html`` on ``self.DEL``; the TextTiling
    tokenizer is not invoked by this method.

    :param url: The URL of the webpage (not used by this method).
    :param html: The content to segment; segments are separated by ``self.DEL``.
    :param q: Extra positional arguments, accepted and ignored.
    :param kwargs: Extra keyword arguments, accepted and ignored.
    :return: A list of dictionaries, one per segment, with the keys
        ``index`` (position of the segment), ``content`` (the segment text)
        and ``keywords`` (result of ``self.extract_keywords`` for the segment).
    """
    # Variadic *q / **kwargs allow calls such as
    # ``self.extract(url, text, **kwargs)``; a required positional ``q``
    # would make that call raise TypeError.
    return [
        {
            "index": i,
            "content": segment,
            "keywords": self.extract_keywords(segment),
        }
        for i, segment in enumerate(html.split(self.DEL))
    ]
565
566	def run(self, url: str, sections: List[str], q, *kwargs) -> List[Dict[str, Any]]:
567	"""
568	Process sections using topic segmentation and keyword extraction.
569
570	:param url: The URL of the webpage.

Callers 1

test_run_different_strategies (Method) · 0.90

Calls

no outgoing calls

Tested by 1

test_run_different_strategies (Method) · 0.72

Used in the wild: real call sites across dependent graphs

searching dependent graphs…