hub / github.com/unclecode/crawl4ai / LLMExtractionStrategy

Class LLMExtractionStrategy

crawl4ai/extraction_strategy.py:58–231 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

56	return [{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]
57
58	class LLMExtractionStrategy(ExtractionStrategy):
def __init__(
    self,
    provider: str = DEFAULT_PROVIDER,
    api_token: Optional[str] = None,
    instruction: str = None,
    schema: Dict = None,
    extraction_type="block",
    **kwargs
):
    """
    Configure an LLM-backed extraction strategy.

    :param provider: Provider identifier; also the key used to look up a
        fallback token in PROVIDER_MODELS.
    :param api_token: Explicit API token. When falsy, the token falls back
        to the PROVIDER_MODELS entry for ``provider`` ("no-token" if the
        provider is not listed) and, if that is falsy too, to the
        OPENAI_API_KEY environment variable.
    :param instruction: Optional instruction text stored for the LLM prompt.
    :param schema: Optional schema; a truthy value switches the extraction
        type to "schema" regardless of ``extraction_type``.
    :param extraction_type: Extraction mode used when no schema is supplied.
    :param kwargs: Optional settings read by key: ``chunk_token_threshold``,
        ``overlap_rate``, ``word_token_rate``, ``apply_chunking``,
        ``base_url``, ``api_base``, ``extra_args`` and ``verbose``.
    :raises ValueError: If no API token could be resolved.
    """
    super().__init__()

    # Resolve the token: explicit argument first, then the provider table,
    # then the environment.
    # NOTE(review): an unlisted provider yields "no-token", which is truthy,
    # so the environment lookup and the ValueError below are skipped for it
    # -- presumably intended for providers that need no key; confirm.
    resolved_token = api_token
    if not resolved_token:
        resolved_token = PROVIDER_MODELS.get(provider, "no-token")
    if not resolved_token:
        resolved_token = os.getenv("OPENAI_API_KEY")

    self.provider = provider
    self.api_token = resolved_token
    self.instruction = instruction
    # A supplied schema takes precedence over the requested extraction type.
    self.extract_type = "schema" if schema else extraction_type
    self.schema = schema

    # Chunking configuration.
    chunk_threshold = kwargs.get("chunk_token_threshold", CHUNK_TOKEN_THRESHOLD)
    self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
    self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
    self.apply_chunking = kwargs.get("apply_chunking", True)
    # With chunking disabled the threshold is replaced by a very large value.
    self.chunk_token_threshold = chunk_threshold if self.apply_chunking else 1e9

    # Endpoint configuration: ``api_base`` defaults to ``base_url`` when it
    # is not given explicitly.
    endpoint = kwargs.get("base_url", None)
    self.base_url = endpoint
    self.api_base = kwargs.get("api_base", endpoint)
    self.extra_args = kwargs.get("extra_args", {})

    self.verbose = kwargs.get("verbose", False)

    if not self.api_token:
        raise ValueError("API token must be provided for LLMExtractionStrategy. Update the config.py or set OPENAI_API_KEY environment variable.")
92
93
94	def extract(self, url: str, ix:int, html: str) -> List[Dict[str, Any]]:
95	# print("[LOG] Extracting blocks from URL:", url)
96	print(f"[LOG] Call LLM for {url} - block index: {ix}")
97	variable_values = {
98	"URL": url,
99	"HTML": escape_json_string(sanitize_html(html)),
100	}
101
102	prompt_with_variables = PROMPT_EXTRACT_BLOCKS
103	if self.instruction:
104	variable_values["REQUEST"] = self.instruction
105	prompt_with_variables = PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION
106
107	if self.extract_type == "schema" and self.schema:
108	variable_values["SCHEMA"] = json.dumps(self.schema, indent=2)
109	prompt_with_variables = PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION
110
111	for variable in variable_values:
112	prompt_with_variables = prompt_with_variables.replace(
113	"{" + variable + "}", variable_values[variable]
114	)
115

Callers 7

test_run_different_strategiesMethod · 0.90

test_llm_extraction_strategyFunction · 0.90

extract_structured_data_using_llmFunction · 0.90

generate_knowledge_graphFunction · 0.90

summarize_page.pyFile · 0.85

add_llm_extraction_strategyFunction · 0.85

llm_extraction_openai_pricing.pyFile · 0.85

Calls

no outgoing calls

Tested by 2

test_run_different_strategiesMethod · 0.72

test_llm_extraction_strategyFunction · 0.72

Used in the wild — real call sites across dependent graphs

searching dependent graphs…