| 511 | return self.extract(url, self.DEL.join(sections), **kwargs) |
| 512 | |
| 513 | class TopicExtractionStrategy(ExtractionStrategy): |
def __init__(self, num_keywords: int = 3, **kwargs):
    """
    Set up the topic extraction strategy.

    :param num_keywords: Number of keywords reported for each topic segment.
    :param kwargs: Accepted by the signature; not used by this constructor.
    """
    # nltk is imported at call time rather than at module level.
    from nltk import TextTilingTokenizer

    super().__init__()
    self.num_keywords = num_keywords
    self.tokenizer = TextTilingTokenizer()
| 524 | |
def extract_keywords(self, text: str) -> List[str]:
    """
    Return the most frequent tokens of a text segment.

    Tokens are produced by ``nltk.word_tokenize`` and ranked by raw count.
    No filtering is applied here, so every token the tokenizer emits is
    counted as-is.

    :param text: The text segment to analyse.
    :return: Up to ``self.num_keywords`` tokens, most frequent first.
    """
    # nltk is imported at call time rather than at module level.
    from nltk import FreqDist, word_tokenize

    token_counts = FreqDist(word_tokenize(text))
    top_entries = token_counts.most_common(self.num_keywords)
    # Keep only the token from each (token, count) pair.
    return [token for token, _count in top_entries]
| 539 | |
def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
    """
    Split content on the section delimiter and attach keywords to each piece.

    :param url: The URL of the webpage (not used in this method).
    :param html: The content to split; pieces are separated by ``self.DEL``.
    :param q: Extra positional arguments; ignored.
    :param kwargs: Extra keyword arguments; ignored.
    :return: One dictionary per piece with the keys ``index``, ``content``
        and ``keywords``.
    """
    # Segmentation is a plain split on the delimiter; self.tokenizer is not
    # consulted in this method.
    pieces = html.split(self.DEL)

    return [
        {
            "index": position,
            "content": piece,
            "keywords": self.extract_keywords(piece),
        }
        for position, piece in enumerate(pieces)
    ]
| 565 | |
| 566 | def run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]: |
| 567 | """ |
| 568 | Process sections using topic segmentation and keyword extraction. |
| 569 | |
| 570 | :param url: The URL of the webpage. |
no outgoing calls
searching dependent graphs…