(self)
| 22 | self.assertTrue(result.success, "Failed to crawl and extract using default strategies") |
| 23 | |
def test_run_different_strategies(self):
    """Crawl one page under two chunking/extraction strategy pairings.

    Both crawls pass ``bypass_cache=True`` and each result is required to
    report ``success``.
    """
    target = "https://www.nbcnews.com/business"

    # Pairing 1: FixedLengthWordChunking together with LLMExtractionStrategy.
    fixed_chunker = FixedLengthWordChunking(chunk_size=100)
    llm_extractor = LLMExtractionStrategy(
        provider="openai/gpt-3.5-turbo",
        api_token=os.getenv("OPENAI_API_KEY"),
    )
    outcome = self.crawler.run(
        url=target,
        word_count_threshold=5,
        chunking_strategy=fixed_chunker,
        extraction_strategy=llm_extractor,
        bypass_cache=True,
    )
    self.assertTrue(
        outcome.success,
        "Failed to crawl and extract with FixedLengthWordChunking and LLMExtractionStrategy",
    )

    # Pairing 2: SlidingWindowChunking together with TopicExtractionStrategy.
    sliding_chunker = SlidingWindowChunking(window_size=100, step=50)
    topic_extractor = TopicExtractionStrategy(num_keywords=5)
    outcome = self.crawler.run(
        url=target,
        word_count_threshold=5,
        chunking_strategy=sliding_chunker,
        extraction_strategy=topic_extractor,
        bypass_cache=True,
    )
    self.assertTrue(
        outcome.success,
        "Failed to crawl and extract with SlidingWindowChunking and TopicExtractionStrategy",
    )
| 44 | |
| 45 | def test_invalid_url(self): |
| 46 | with self.assertRaises(Exception) as context: |
nothing calls this directly
no test coverage detected