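Get a proxy from the proxy pool. :param retry: retry counter; it is incremented on entry, and once it exceeds 3 the call gives up and returns None. :return: the proxies obtained from the selected proxy item's get_proxies(), or None when the retry limit is exceeded.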
(self, retry: int = 0)
| 477 | return |
| 478 | |
| 479 | def get(self, retry: int = 0) -> dict: |
| 480 | """ |
| 481 | 从代理池中获取代理 |
| 482 | :param retry: |
| 483 | :return: |
| 484 | """ |
| 485 | retry += 1 |
| 486 | if retry > 3: |
| 487 | self.no_valid_proxy_times += 1 |
| 488 | return None |
| 489 | if time.time() - self.last_get_ts > 3 * 60: |
| 490 | # 3分钟没有获取过 重置一下 |
| 491 | try: |
| 492 | self.reset_proxy_pool() |
| 493 | except Exception as e: |
| 494 | self.logger.exception(e) |
| 495 | # 记录获取时间 |
| 496 | self.last_get_ts = time.time() |
| 497 | # |
| 498 | self.warn() |
| 499 | proxy_item = self.get_random_proxy() |
| 500 | if proxy_item: |
| 501 | # 不检测 |
| 502 | if not self.check_valid: |
| 503 | # 塞回去 |
| 504 | proxies = proxy_item.get_proxies() |
| 505 | self.put_proxy_item(proxy_item) |
| 506 | return proxies |
| 507 | else: |
| 508 | is_valid = proxy_item.is_valid() |
| 509 | if is_valid: |
| 510 | # 记录update_ts |
| 511 | self.proxy_item_update_ts_dict[ |
| 512 | proxy_item.proxy_id |
| 513 | ] = proxy_item.update_ts |
| 514 | # 塞回去 |
| 515 | proxies = proxy_item.get_proxies() |
| 516 | self.put_proxy_item(proxy_item) |
| 517 | if is_valid == 1: |
| 518 | if proxy_item.use_interval: |
| 519 | proxy_item.use_ts = time.time() |
| 520 | return proxies |
| 521 | else: |
| 522 | # 处理失效代理 |
| 523 | self.proxy_dict.pop(proxy_item.proxy_id, "") |
| 524 | self.invalid_proxy_dict[ |
| 525 | proxy_item.proxy_id |
| 526 | ] = datetime.datetime.now() |
| 527 | else: |
| 528 | try: |
| 529 | self.reset_proxy_pool() |
| 530 | except Exception as e: |
| 531 | self.logger.exception(e) |
| 532 | if self.no_valid_proxy_times >= 5: |
| 533 | # 解决bug: 当爬虫仅剩一个任务时 由于只有一个线程检测代理 而不可用代理又刚好很多(时间越长越多) 可能出现一直获取不到代理的情况 |
| 534 | # 导致爬虫烂尾 |
| 535 | try: |
| 536 | self.reset_proxy_pool() |
no test coverage detected