(crawler)
| 97 | print_result(result) |
| 98 | |
def add_extraction_strategy(crawler):
    """Run two CosineStrategy extraction demos against the same page.

    The first run passes a CosineStrategy with explicit tuning keyword
    arguments; the second passes one configured only with a
    ``semantic_filter``. After each run a log line is printed via ``cprint``
    and the crawl result is handed to ``print_result``.

    Args:
        crawler: Object exposing ``run(url=..., extraction_strategy=...)``.
    """
    target_url = "https://www.nbcnews.com/business"

    # Demo 1: CosineStrategy with explicit tuning parameters.
    cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
    tuned_strategy = CosineStrategy(
        word_count_threshold=10,
        max_dist=0.2,
        linkage_method="ward",
        top_k=3,
        sim_threshold=0.3,
        verbose=True,
    )
    tuned_result = crawler.run(url=target_url, extraction_strategy=tuned_strategy)
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(tuned_result)

    # Demo 2: CosineStrategy driven only by a semantic filter.
    cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
    filtered_strategy = CosineStrategy(semantic_filter="inflation rent prices")
    filtered_result = crawler.run(url=target_url, extraction_strategy=filtered_strategy)
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
    print_result(filtered_result)
| 120 | |
| 121 | def add_llm_extraction_strategy(crawler): |
| 122 | # Adding an LLM extraction strategy without instructions |
no test coverage detected
searching dependent graphs…