MCPcopy
hub / github.com/unclecode/crawl4ai / CosineStrategy

Class CosineStrategy

crawl4ai/extraction_strategy.py:233–511  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

231 return extracted_content
232
233class CosineStrategy(ExtractionStrategy):
234 def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs):
235 """
236 Initialize the strategy with clustering parameters.
237
238 Args:
239 semantic_filter (str): A keyword filter for document filtering.
240 word_count_threshold (int): Minimum number of words per cluster.
241 max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters.
242 linkage_method (str): The linkage method for hierarchical clustering.
243 top_k (int): Number of top categories to extract.
244 """
245 super().__init__()
246
247 import numpy as np
248
249 self.semantic_filter = semantic_filter
250 self.word_count_threshold = word_count_threshold
251 self.max_dist = max_dist
252 self.linkage_method = linkage_method
253 self.top_k = top_k
254 self.sim_threshold = sim_threshold
255 self.timer = time.time()
256 self.verbose = kwargs.get("verbose", False)
257
258 self.buffer_embeddings = np.array([])
259 self.get_embedding_method = "direct"
260
261 self.device = get_device()
262 # import torch
263 # self.device = torch.device('cpu')
264
265 self.default_batch_size = calculate_batch_size(self.device)
266
267 if self.verbose:
268 print(f"[LOG] Loading Extraction Model for {self.device.type} device.")
269
270 # if False and self.device.type == "cpu":
271 # self.model = load_onnx_all_MiniLM_l6_v2()
272 # self.tokenizer = self.model.tokenizer
273 # self.get_embedding_method = "direct"
274 # else:
275
276 self.tokenizer, self.model = load_HF_embedding_model(model_name)
277 self.model.to(self.device)
278 self.model.eval()
279
280 self.get_embedding_method = "batch"
281
282 self.buffer_embeddings = np.array([])
283
284 # if model_name == "bert-base-uncased":
285 # self.tokenizer, self.model = load_bert_base_uncased()
286 # self.model.eval() # Ensure the model is in evaluation mode
287 # self.get_embedding_method = "batch"
288 # elif model_name == "BAAI/bge-small-en-v1.5":
289 # self.tokenizer, self.model = load_bge_small_en_v1_5()
290 # self.model.eval() # Ensure the model is in evaluation mode

Callers 3

add_extraction_strategy · Function · 0.85

Calls

no outgoing calls

Tested by 2

Used in the wild: real call sites across dependent graphs

searching dependent graphs…