| 231 | return extracted_content |
| 232 | |
| 233 | class CosineStrategy(ExtractionStrategy): |
| 234 | def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): |
| 235 | """ |
| 236 | Initialize the strategy with clustering parameters. |
| 237 | |
| 238 | Args: |
| 239 | semantic_filter (str): A keyword filter for document filtering. |
| 240 | word_count_threshold (int): Minimum number of words per cluster. |
| 241 | max_dist (float): The maximum cophenetic distance on the dendrogram to form clusters. |
| 242 | linkage_method (str): The linkage method for hierarchical clustering. |
| 243 | top_k (int): Number of top categories to extract. |
| 244 | """ |
| 245 | super().__init__() |
| 246 | |
| 247 | import numpy as np |
| 248 | |
| 249 | self.semantic_filter = semantic_filter |
| 250 | self.word_count_threshold = word_count_threshold |
| 251 | self.max_dist = max_dist |
| 252 | self.linkage_method = linkage_method |
| 253 | self.top_k = top_k |
| 254 | self.sim_threshold = sim_threshold |
| 255 | self.timer = time.time() |
| 256 | self.verbose = kwargs.get("verbose", False) |
| 257 | |
| 258 | self.buffer_embeddings = np.array([]) |
| 259 | self.get_embedding_method = "direct" |
| 260 | |
| 261 | self.device = get_device() |
| 262 | # import torch |
| 263 | # self.device = torch.device('cpu') |
| 264 | |
| 265 | self.default_batch_size = calculate_batch_size(self.device) |
| 266 | |
| 267 | if self.verbose: |
| 268 | print(f"[LOG] Loading Extraction Model for {self.device.type} device.") |
| 269 | |
| 270 | # if False and self.device.type == "cpu": |
| 271 | # self.model = load_onnx_all_MiniLM_l6_v2() |
| 272 | # self.tokenizer = self.model.tokenizer |
| 273 | # self.get_embedding_method = "direct" |
| 274 | # else: |
| 275 | |
| 276 | self.tokenizer, self.model = load_HF_embedding_model(model_name) |
| 277 | self.model.to(self.device) |
| 278 | self.model.eval() |
| 279 | |
| 280 | self.get_embedding_method = "batch" |
| 281 | |
| 282 | self.buffer_embeddings = np.array([]) |
| 283 | |
| 284 | # if model_name == "bert-base-uncased": |
| 285 | # self.tokenizer, self.model = load_bert_base_uncased() |
| 286 | # self.model.eval() # Ensure the model is in evaluation mode |
| 287 | # self.get_embedding_method = "batch" |
| 288 | # elif model_name == "BAAI/bge-small-en-v1.5": |
| 289 | # self.tokenizer, self.model = load_bge_small_en_v1_5() |
| 290 | # self.model.eval() # Ensure the model is in evaluation mode |
no outgoing calls
searching dependent graphs…