(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None)
| 108 | ) |
| 109 | |
async def extract_structured_data_using_llm(provider: str, api_token: str = None, extra_headers: Dict[str, str] = None):
    """Crawl the OpenAI API pricing page and print LLM-extracted model fees.

    The page is fetched with ``AsyncWebCrawler`` and handed to an
    ``LLMExtractionStrategy`` configured with the ``OpenAIModelFee`` schema;
    whatever ends up in ``result.extracted_content`` is printed to stdout.

    Args:
        provider: Provider identifier passed straight to the extraction
            strategy. The value ``"ollama"`` is the only one allowed to run
            without an API token.
        api_token: Token passed to the extraction strategy. When it is
            ``None`` and the provider is not ``"ollama"``, a notice is
            printed and the example is skipped.
        extra_headers: Optional headers. When truthy they are forwarded to
            the strategy as ``extra_args["extra_headers"]``; otherwise
            ``extra_args`` is an empty dict.

    Returns:
        None. All output goes to stdout.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    # Every provider except "ollama" needs a token; bail out early without one.
    token_missing = api_token is None
    if provider != "ollama" and token_missing:
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # Only forward headers when the caller actually supplied some.
    extra_args = {"extra_headers": extra_headers} if extra_headers else {}

    async with AsyncWebCrawler(verbose=True) as crawler:
        # NOTE(review): the continuation lines of the prompt are kept
        # flush-left so the text matches the source listing exactly; confirm
        # against the checked-in file if leading whitespace matters.
        llm_strategy = LLMExtractionStrategy(
            provider=provider,
            api_token=api_token,
            schema=OpenAIModelFee.schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content. One extracted model JSON format should look like this:
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
            extra_args=extra_args
        )
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=llm_strategy,
            bypass_cache=True,
        )
        print(result.extracted_content)
| 138 | |
| 139 | async def extract_structured_data_using_css_extractor(): |
| 140 | print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") |
no test coverage detected
searching dependent graphs…