MCPcopy
hub / github.com/unclecode/crawl4ai / extract

Method extract

crawl4ai/extraction_strategy.py:435–497  ·  view source on GitHub ↗

Extract clusters from HTML content using hierarchical clustering. Parameters: `url` is the URL of the webpage; `html` is the HTML content of the webpage. Returns: a list of dictionaries representing the clusters.

(self, url: str, html: str, *q, **kwargs)

Source from the content-addressed store, hash-verified

433 return filtered_clusters
434
435 def extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]:
436 """
437 Extract clusters from HTML content using hierarchical clustering.
438
439 :param url: The URL of the webpage.
440 :param html: The HTML content of the webpage.
441 :return: A list of dictionaries representing the clusters.
442 """
443 # Assume `html` is a list of text chunks for this strategy
444 t = time.time()
445 text_chunks = html.split(self.DEL) # Split by lines or paragraphs as needed
446
447 # Pre-filter documents using embeddings and semantic_filter
448 text_chunks = self.filter_documents_embeddings(text_chunks, self.semantic_filter)
449
450 if not text_chunks:
451 return []
452
453 # Perform clustering
454 labels = self.hierarchical_clustering(text_chunks)
455 # print(f"[LOG] 🚀 Clustering done in {time.time() - t:.2f} seconds")
456
457 # Organize texts by their cluster labels, retaining order
458 t = time.time()
459 clusters = {}
460 for index, label in enumerate(labels):
461 clusters.setdefault(label, []).append(text_chunks[index])
462
463 # Filter clusters by word count
464 filtered_clusters = self.filter_clusters_by_word_count(clusters)
465
466 # Convert filtered clusters to a sorted list of dictionaries
467 cluster_list = [{"index": int(idx), "tags" : [], "content": " ".join(filtered_clusters[idx])} for idx in sorted(filtered_clusters)]
468
469 if self.verbose:
470 print(f"[LOG] 🚀 Assign tags using {self.device}")
471
472 if self.device.type in ["gpu", "cuda", "mps", "cpu"]:
473 labels = self.nlp([cluster['content'] for cluster in cluster_list])
474
475 for cluster, label in zip(cluster_list, labels):
476 cluster['tags'] = label
477 # elif self.device.type == "cpu":
478 # # Process the text with the loaded model
479 # texts = [cluster['content'] for cluster in cluster_list]
480 # # Batch process texts
481 # docs = self.nlp.pipe(texts, disable=["tagger", "parser", "ner", "lemmatizer"])
482
483 # for doc, cluster in zip(docs, cluster_list):
484 # tok_k = self.top_k
485 # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
486 # cluster['tags'] = [cat for cat, _ in top_categories]
487
488 # for cluster in cluster_list:
489 # doc = self.nlp(cluster['content'])
490 # tok_k = self.top_k
491 # top_categories = sorted(doc.cats.items(), key=lambda x: x[1], reverse=True)[:tok_k]
492 # cluster['tags'] = [cat for cat, _ in top_categories]

Callers 4

run — Method · 0.95
get_content_of_website — Function · 0.45
process_element — Function · 0.45
process_element — Method · 0.45

Tested by

no test coverage detected