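Merge the sections into chunks and extract from each chunk: sequentially with a 500 ms delay for `groq/` providers (to work around rate limiting), and in parallel on a thread pool otherwise. Specific to LLMExtractionStrategy.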
(self, url: str, sections: List[str])
| 188 | |
| 189 | |
def run(self, url: str, sections: List[str]) -> List[Dict[str, Any]]:
    """
    Merge ``sections`` into chunks and run ``self.extract`` on every chunk.

    The sections are combined by ``self._merge`` using
    ``self.chunk_token_threshold`` and an overlap of
    ``int(self.chunk_token_threshold * self.overlap_rate)``. Every merged
    chunk is passed through ``sanitize_input_encode`` before extraction.

    When ``self.provider`` starts with ``"groq/"`` the chunks are processed
    one at a time with a 500 ms pause after each (rate limiting). For any
    other provider the chunks are submitted to a pool of four worker threads.

    Args:
        url: Forwarded as the first argument of ``self.extract``.
        sections: Text sections to merge and extract from.

    Returns:
        A flat list built from the results of ``self.extract``. In the
        threaded branch results are appended in completion order, and a chunk
        whose extraction raised contributes one error record
        (``"error": True``) carrying the index of the failing chunk instead
        of aborting the whole run.
    """
    overlap = int(self.chunk_token_threshold * self.overlap_rate)
    merged_sections = self._merge(
        sections, self.chunk_token_threshold,
        overlap=overlap
    )
    extract_func = partial(self.extract, url)
    extracted_content = []

    if self.provider.startswith("groq/"):
        # Sequential processing with a delay between requests.
        for ix, section in enumerate(merged_sections):
            extracted_content.extend(extract_func(ix, sanitize_input_encode(section)))
            time.sleep(0.5)  # 500 ms delay between each processing
        return extracted_content

    # Parallel processing using ThreadPoolExecutor.
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Remember which chunk every future belongs to so a failure can be
        # reported against the right index (as_completed yields out of order).
        future_to_index = {
            executor.submit(extract_func, ix, sanitize_input_encode(section)): ix
            for ix, section in enumerate(merged_sections)
        }

        for future in as_completed(future_to_index):
            try:
                extracted_content.extend(future.result())
            except Exception as e:
                # Best-effort: one failing chunk must not discard the others.
                if self.verbose:
                    print(f"Error in thread execution: {e}")
                # Add error information to extracted_content
                extracted_content.append({
                    "index": future_to_index[future],
                    "error": True,
                    "tags": ["error"],
                    "content": str(e)
                })

    return extracted_content
| 232 | |
| 233 | class CosineStrategy(ExtractionStrategy): |
| 234 | def __init__(self, semantic_filter = None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name = 'sentence-transformers/all-MiniLM-L6-v2', sim_threshold = 0.3, **kwargs): |
nothing calls this directly
no test coverage detected