hub / github.com/unclecode/crawl4ai / extract_structured_data_using_llm

Function extract_structured_data_using_llm

docs/examples/quickstart_async.py:110–137 · view source on GitHub ↗

(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None)

Source from the content-addressed store, hash-verified

108	)
109
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
    """Crawl the OpenAI pricing page and print LLM-extracted model fees.

    The page is fetched with ``AsyncWebCrawler`` and handed to an
    ``LLMExtractionStrategy`` configured with the ``OpenAIModelFee`` schema.
    Whatever the crawl result exposes as ``extracted_content`` is printed.

    Args:
        provider: Provider identifier forwarded to ``LLMExtractionStrategy``.
            Any identifier starting with ``"ollama"`` (for example
            ``"ollama"`` or ``"ollama/llama3"``) is allowed to run without
            an API token.
        api_token: Token forwarded to ``LLMExtractionStrategy``. When it is
            ``None`` and the provider is not an Ollama one, the example
            prints a notice and returns without crawling.
        extra_headers: Optional headers; when non-empty they are forwarded
            to the strategy as ``extra_args["extra_headers"]``.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Provider identifiers may carry a model suffix ("ollama/<model>"), so a
    # plain equality test against "ollama" would wrongly demand a token.
    token_optional = provider.startswith("ollama")
    if api_token is None and not token_optional:
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # Only pass extra_headers through when the caller actually supplied some.
    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    instruction = (
        "From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n"
        "Do not miss any models in the entire content. One extracted model JSON format should look like this:\n"
        '{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.'
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction=instruction,
                extra_args=extra_args,
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)
138
139	async def extract_structured_data_using_css_extractor():
140	print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

mainFunction · 0.85

AsyncWebCrawlerClass · 0.90

LLMExtractionStrategyClass · 0.90

arunMethod · 0.80

no test coverage detected

searching dependent graphs…