MCPcopy
hub / github.com/unclecode/crawl4ai / TopicExtractionStrategy

Class TopicExtractionStrategy

crawl4ai/extraction_strategy.py:513–579  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

511 return self.extract(url, self.DEL.join(sections), **kwargs)
512
513class TopicExtractionStrategy(ExtractionStrategy):
def __init__(self, num_keywords: int = 3, **kwargs):
    """
    Set up the strategy: remember the keyword count and build a TextTiling tokenizer.

    :param num_keywords: Number of keywords to report for each topic segment.
    :param kwargs: Accepted for call compatibility; not used by this constructor.
    """
    from nltk import TextTilingTokenizer

    super().__init__()
    self.num_keywords = num_keywords
    self.tokenizer = TextTilingTokenizer()
524
def extract_keywords(self, text: str) -> List[str]:
    """
    Pick the most frequent tokens of a text segment as its keywords.

    :param text: The text segment to analyse.
    :return: The ``self.num_keywords`` most common tokens, most frequent first.
    """
    from nltk import FreqDist, word_tokenize

    # Count every token the NLTK word tokenizer produces for this segment.
    token_counts = FreqDist(word_tokenize(text))
    top_entries = token_counts.most_common(self.num_keywords)
    return [token for token, _count in top_entries]
539
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Split the content on the section delimiter and attach keywords to each piece.

    :param url: The URL of the webpage (not used by this method).
    :param html: The content to process; it is split on ``self.DEL``.
    :param q: Extra positional arguments; ignored.
    :param kwargs: Extra keyword arguments; ignored.
    :return: One dictionary per segment holding its "index", "content" and "keywords".
    """
    # Segments are whatever lies between occurrences of the delimiter.
    pieces = html.split(self.DEL)

    return [
        {
            "index": position,
            "content": piece,
            "keywords": self.extract_keywords(piece),
        }
        for position, piece in enumerate(pieces)
    ]
565
566 def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]:
567 """
568 Process sections using topic segmentation and keyword extraction.
569
570 :param url: The URL of the webpage.

Callers 1

Calls

no outgoing calls

Tested by 1

Used in the wild: real call sites across dependent graphs

searching dependent graphs…