MCPcopy
hub / github.com/unclecode/crawl4ai / extract_structured_data_using_llm

Function extract_structured_data_using_llm

docs/examples/quickstart_async.py:110–137  ·  view source on GitHub ↗
(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None)

Source from the content-addressed store, hash-verified

108 )
109
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
    """Crawl the OpenAI pricing page and print LLM-extracted model fees.

    Prints a banner naming ``provider``, then runs ``AsyncWebCrawler`` against
    ``https://openai.com/api/pricing/`` with an ``LLMExtractionStrategy``
    driven by the ``OpenAIModelFee`` schema, and prints the resulting
    ``extracted_content``.

    Args:
        provider: Provider identifier handed to ``LLMExtractionStrategy``.
        api_token: Token for the provider. When it is ``None`` and the
            provider is not ``"ollama"``, the example prints a notice and
            returns without crawling.
        extra_headers: Optional headers; when truthy they are forwarded to
            the strategy as ``extra_args["extra_headers"]``.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Every provider except "ollama" needs a token; bail out before crawling.
    if api_token is None and provider != "ollama":
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # Only include the key when headers were actually supplied.
    extra_args = {}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=LLMExtractionStrategy(
                provider=provider,
                api_token=api_token,
                schema=OpenAIModelFee.schema(),
                extraction_type="schema",
                instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
                Do not miss any models in the entire content. One extracted model JSON format should look like this:
                {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
                extra_args=extra_args,
            ),
            bypass_cache=True,
        )
        print(result.extracted_content)
138
139async def extract_structured_data_using_css_extractor():
140 print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

Callers 1

main · Function · 0.85

Calls 3

AsyncWebCrawler · Class · 0.90
arun · Method · 0.80

Tested by

no test coverage detected

Used in the wild real call sites across dependent graphs

searching dependent graphs…