MCPcopy
hub / github.com/unclecode/crawl4ai / LLMExtractionStrategy

Class LLMExtractionStrategy

crawl4ai/extraction_strategy.py:58–231  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

56 return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
57
58class LLMExtractionStrategy(ExtractionStrategy):
def __init__(self,
             provider: str = DEFAULT_PROVIDER, api_token: Optional[str] = None,
             instruction:str = None, schema:Dict = None, extraction_type = "block", **kwargs):
    """
    Set up an LLM-backed extraction strategy.

    :param provider: Provider identifier; also used as the lookup key into
        PROVIDER_MODELS when no explicit token is supplied.
    :param api_token: API token for the provider. When falsy, the token
        falls back to the PROVIDER_MODELS entry for ``provider`` (with
        "no-token" as the default for unlisted providers) and, if that is
        falsy too, to the OPENAI_API_KEY environment variable.
    :param instruction: Optional instruction text stored for the LLM prompt.
    :param schema: Optional schema; when truthy the extraction type is
        forced to "schema" regardless of ``extraction_type``.
    :param extraction_type: Extraction mode stored as ``extract_type``.
    :param kwargs: Optional settings read by key: ``chunk_token_threshold``,
        ``overlap_rate``, ``word_token_rate``, ``apply_chunking``,
        ``base_url``, ``api_base``, ``extra_args`` and ``verbose``.
    :raises ValueError: If no API token could be resolved.
    """
    super().__init__()

    # Resolve the token in priority order: explicit argument, provider
    # table, then environment. Each fallback is consulted only when the
    # previous candidate is falsy.
    resolved_token = api_token
    if not resolved_token:
        resolved_token = PROVIDER_MODELS.get(provider, "no-token")
    if not resolved_token:
        resolved_token = os.getenv("OPENAI_API_KEY")

    self.provider = provider
    self.api_token = resolved_token
    self.instruction = instruction
    self.schema = schema
    # A supplied schema overrides whatever extraction type was requested.
    self.extract_type = "schema" if schema else extraction_type

    # Chunking configuration; disabling chunking is implemented by raising
    # the threshold to a value no document is expected to reach.
    chunking_enabled = kwargs.get("apply_chunking", True)
    requested_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
    self.chunk_token_threshold = requested_threshold if chunking_enabled else 1e9
    self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
    self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
    self.apply_chunking = chunking_enabled

    # Endpoint configuration; ``api_base`` defaults to ``base_url`` when
    # the former key is absent from kwargs.
    endpoint = kwargs.get("base_url", None)
    self.base_url = endpoint
    self.api_base = kwargs.get("api_base", endpoint)
    self.extra_args = kwargs.get("extra_args", {})

    self.verbose = kwargs.get("verbose", False)

    if self.api_token:
        return
    raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
92
93
94 def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
95 # print("[LOG] Extracting blocks from URL:", url)
96 print(f"[LOG] Call LLM for {url} - block index: {ix}")
97 variable_values = {
98 "URL": url,
99 "HTML": escape_json_string(sanitize_html(html)),
100 }
101
102 prompt_with_variables = PROMPT_EXTRACT_BLOCKS
103 if self.instruction:
104 variable_values["REQUEST"] = self.instruction
105 prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
106
107 if self.extract_type == "schema" and self.schema:
108 variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
109 prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
110
111 for variable in variable_values:
112 prompt_with_variables = prompt_with_variables.replace(
113 "{" + variable + "}", variable_values[variable]
114 )
115

Calls

no outgoing calls

Tested by 2

Used in the wild: real call sites across dependent graphs

searching dependent graphs…