手动触发一次临时爬取任务(可选持久化) Args: platforms: 指定平台列表,为空则爬取所有平台 save_to_local: 是否保存到本地 output 目录,默认 False include_url: 是否包含URL链接,默认False(节省token) Returns: 爬取结果字典,包含新闻数据和保存路径(如果保存)
(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False)
| 199 | return result |
| 200 | |
| 201 | def trigger_crawl(self, platforms: Optional[List[str]] = None, save_to_local: bool = False, include_url: bool = False) -> Dict: |
| 202 | """ |
| 203 | 手动触发一次临时爬取任务(可选持久化) |
| 204 | |
| 205 | Args: |
| 206 | platforms: 指定平台列表,为空则爬取所有平台 |
| 207 | save_to_local: 是否保存到本地 output 目录,默认 False |
| 208 | include_url: 是否包含URL链接,默认False(节省token) |
| 209 | |
| 210 | Returns: |
| 211 | 爬取结果字典,包含新闻数据和保存路径(如果保存) |
| 212 | """ |
| 213 | try: |
| 214 | from trendradar.crawler.fetcher import DataFetcher |
| 215 | from trendradar.storage.local import LocalStorageBackend |
| 216 | from trendradar.storage.base import convert_crawl_results_to_news_data |
| 217 | from trendradar.utils.time import get_configured_time, format_date_folder, format_time_filename |
| 218 | from ..services.cache_service import get_cache |
| 219 | |
| 220 | platforms = validate_platforms(platforms) |
| 221 | |
| 222 | # 1. 加载配置 |
| 223 | config_data, all_platforms = self._load_crawl_config() |
| 224 | target_platforms, ids = self._resolve_target_platforms(all_platforms, platforms) |
| 225 | |
| 226 | print(f"开始临时爬取,平台: {[p.get('name', p['id']) for p in target_platforms]}") |
| 227 | |
| 228 | # 2. 执行爬取 |
| 229 | advanced = config_data.get("advanced", {}) |
| 230 | crawler_config = advanced.get("crawler", {}) |
| 231 | platforms_config = config_data.get("platforms", {}) |
| 232 | proxy_url = crawler_config.get("default_proxy") if crawler_config.get("use_proxy") else None |
| 233 | api_url = ( |
| 234 | os.environ.get("PLATFORMS_API_URL", "").strip() |
| 235 | or platforms_config.get("api_url", "") |
| 236 | ) or None |
| 237 | |
| 238 | domain_rules = {} |
| 239 | for p in target_platforms: |
| 240 | ed = p.get("expected_domain", "") |
| 241 | if ed: |
| 242 | domain_rules[p["id"]] = ed |
| 243 | |
| 244 | fetcher = DataFetcher(proxy_url=proxy_url, api_url=api_url) |
| 245 | results, id_to_name, failed_ids = fetcher.crawl_websites( |
| 246 | ids_list=ids, |
| 247 | request_interval=crawler_config.get("request_interval", 100), |
| 248 | domain_rules=domain_rules, |
| 249 | ) |
| 250 | |
| 251 | # 3. 转换与持久化 |
| 252 | timezone = config_data.get("app", {}).get("timezone", "Asia/Shanghai") |
| 253 | current_time = get_configured_time(timezone) |
| 254 | crawl_date = format_date_folder(None, timezone) |
| 255 | crawl_time_str = format_time_filename(timezone) |
| 256 | |
| 257 | news_data = convert_crawl_results_to_news_data( |
| 258 | results=results, id_to_name=id_to_name, |
No direct callers of `trigger_crawl` were detected.
No test coverage was detected for `trigger_crawl`.