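Extract clusters from HTML content using hierarchical clustering: the input string is split into text chunks on `self.DEL`, pre-filtered with the semantic filter, clustered, and the clusters are then filtered by word count. :param url: The URL of the webpage. :param html: The HTML content of the webpage, given as a single string whose chunks are separated by `self.DEL`. :return: A list of dictionaries representing the clusters, each with `index`, `tags`, and `content` keys; an empty list when no text chunks survive the pre-filter.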
(self, url: str, html: str, *q, **kwargs)
| 433 | return filtered_clusters |
| 434 | |
| 435 | def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]: |
| 436 | """ |
| 437 | Extract clusters from HTML content using hierarchical clustering. |
| 438 | |
| 439 | :param url: The URL of the webpage. |
| 440 | :param html: The HTML content of the webpage. |
| 441 | :return: A list of dictionaries representing the clusters. |
| 442 | """ |
| 443 | # Assume `html` is a list of text chunks for this strategy |
| 444 | t = time.time() |
| 445 | text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed |
| 446 | |
| 447 | # Pre-filter documents using embeddings and semantic_filter |
| 448 | text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter) |
| 449 | |
| 450 | if not text_chunks: |
| 451 | return [] |
| 452 | |
| 453 | # Perform clustering |
| 454 | labels = self.hierarchical_clustering(text_chunks) |
| 455 | # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds") |
| 456 | |
| 457 | # Organize texts by their cluster labels, retaining order |
| 458 | t = time.time() |
| 459 | clusters = {} |
| 460 | for index, label in enumerate(labels): |
| 461 | clusters.setdefault(label, []).append(text_chunks[index]) |
| 462 | |
| 463 | # Filter clusters by word count |
| 464 | filtered_clusters = self.filter_clusters_by_word_count(clusters) |
| 465 | |
| 466 | # Convert filtered clusters to a sorted list of dictionaries |
| 467 | cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)] |
| 468 | |
| 469 | if self.verbose: |
| 470 | print(f"[LOG] 🚀 Assign tags using {self.device}") |
| 471 | |
| 472 | if self.device.type in ["gpu", "cuda", "mps", "cpu"]: |
| 473 | labels = self.nlp([cluster['content'] for cluster in cluster_list]) |
| 474 | |
| 475 | for cluster, label in zip(cluster_list, labels): |
| 476 | cluster['tags'] = label |
| 477 | # elif self.device.type == "cpu": |
| 478 | # # Process the text with the loaded model |
| 479 | # texts = [cluster['content'] for cluster in cluster_list] |
| 480 | # # Batch process texts |
| 481 | # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"]) |
| 482 | |
| 483 | # for doc, cluster in zip(docs, cluster_list): |
| 484 | # tok_k = self.top_k |
| 485 | # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] |
| 486 | # cluster['tags'] = [cat for cat, _ in top_categories] |
| 487 | |
| 488 | # for cluster in cluster_list: |
| 489 | # doc = self.nlp(cluster['content']) |
| 490 | # tok_k = self.top_k |
| 491 | # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k] |
| 492 | # cluster['tags'] = [cat for cat, _ in top_categories] |
no test coverage detected