(crawler)
| 119 | print_result(result) |
| 120 | |
def add_llm_extraction_strategy(crawler):
    """Demonstrate LLM-driven extraction on one page, three ways.

    Crawls ``https://www.nbcnews.com/business`` three times through
    ``crawler.run`` with an ``LLMExtractionStrategy`` on the
    ``openai/gpt-4o`` provider: first with no instruction, then with a
    financial-news instruction, and finally with a technology instruction.
    A banner is printed with ``cprint`` around each run and every result is
    handed to ``print_result``.

    The API token for each strategy is read from the ``OPENAI_API_KEY``
    environment variable at the time the strategy is built.

    Args:
        crawler: Object exposing ``run(url=..., extraction_strategy=...)``.
    """
    target_url = "https://www.nbcnews.com/business"
    llm_provider = "openai/gpt-4o"

    # Run 1: no instruction keyword is passed at all, so the strategy falls
    # back to whatever its own default is.
    cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
    cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
    uninstructed_strategy = LLMExtractionStrategy(
        provider=llm_provider,
        api_token=os.getenv('OPENAI_API_KEY'),
    )
    uninstructed_result = crawler.run(
        url=target_url,
        extraction_strategy=uninstructed_strategy,
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
    print_result(uninstructed_result)

    # Run 2: same page, narrowed to financial news by an instruction.
    cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
    cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
    finance_strategy = LLMExtractionStrategy(
        provider=llm_provider,
        api_token=os.getenv('OPENAI_API_KEY'),
        instruction="I am interested in only financial news",
    )
    finance_result = crawler.run(
        url=target_url,
        extraction_strategy=finance_strategy,
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
    print_result(finance_result)

    # Run 3: same page again, this time steered toward technology content.
    # No introductory banner is printed for this run; only the result line.
    tech_strategy = LLMExtractionStrategy(
        provider=llm_provider,
        api_token=os.getenv('OPENAI_API_KEY'),
        instruction="Extract only content related to technology",
    )
    tech_result = crawler.run(
        url=target_url,
        extraction_strategy=tech_strategy,
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
    print_result(tech_result)
| 156 | |
| 157 | def targeted_extraction(crawler): |
| 158 | # Using a CSS selector to extract only H2 tags |
no test coverage detected
searching dependent graphs…