| 56 | return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)] |
| 57 | |
| 58 | class LLMExtractionStrategy(ExtractionStrategy): |
| 59 | def __init__(self, |
| 60 | provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None, |
| 61 | instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs): |
| 62 | """ |
| 63 | Initialize the strategy with clustering parameters. |
| 64 | |
| 65 | :param provider: The provider to use for extraction. |
| 66 | :param api_token: The API token for the provider. |
| 67 | :param instruction: The instruction to use for the LLM model. |
| 68 | """ |
| 69 | super().__init__() |
| 70 | self.provider = provider |
| 71 | self.api_token = api_token or PROVIDER_MODELS.get(provider, "no-token") or os.getenv("OPENAI_API_KEY") |
| 72 | self.instruction = instruction |
| 73 | self.extract_type = extraction_type |
| 74 | self.schema = schema |
| 75 | if schema: |
| 76 | self.extract_type = "schema" |
| 77 | |
| 78 | self.chunk_token_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD) |
| 79 | self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) |
| 80 | self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) |
| 81 | self.apply_chunking = kwargs.get("apply_chunking", True) |
| 82 | self.base_url = kwargs.get("base_url", None) |
| 83 | self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) |
| 84 | self.extra_args = kwargs.get("extra_args", {}) |
| 85 | if not self.apply_chunking: |
| 86 | self.chunk_token_threshold = 1e9 |
| 87 | |
| 88 | self.verbose = kwargs.get("verbose", False) |
| 89 | |
| 90 | if not self.api_token: |
| 91 | raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.") |
| 92 | |
| 93 | |
| 94 | def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]: |
| 95 | # print("[LOG] Extracting blocks from URL:", url) |
| 96 | print(f"[LOG] Call LLM for {url} - block index: {ix}") |
| 97 | variable_values = { |
| 98 | "URL": url, |
| 99 | "HTML": escape_json_string(sanitize_html(html)), |
| 100 | } |
| 101 | |
| 102 | prompt_with_variables = PROMPT_EXTRACT_BLOCKS |
| 103 | if self.instruction: |
| 104 | variable_values["REQUEST"] = self.instruction |
| 105 | prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION |
| 106 | |
| 107 | if self.extract_type == "schema" and self.schema: |
| 108 | variable_values["SCHEMA"] = json.dumps(self.schema, indent=2) |
| 109 | prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION |
| 110 | |
| 111 | for variable in variable_values: |
| 112 | prompt_with_variables = prompt_with_variables.replace( |
| 113 | "{" + variable + "}", variable_values[variable] |
| 114 | ) |
| 115 |
no outgoing calls
searching dependent graphs…