Common response parser :param response: scrapy response :param pre_extract_method: method used to extract all infos, xpath is the default value :param pre_extract: pre-parsing rule for extracting all infos :param infos_pos: start position for extracting infos
(self, response, pre_extract_method='xpath',
pre_extract='//tr', infos_pos=1, infos_end=None,
detail_rule='td::text', ip_pos=0, port_pos=1,
extract_protocol=True, split_detail=False,
protocols=None)
| 19 | } |
| 20 | |
def parse_common(self, response, pre_extract_method='xpath',
                 pre_extract='//tr', infos_pos=1, infos_end=None,
                 detail_rule='td::text', ip_pos=0, port_pos=1,
                 extract_protocol=True, split_detail=False,
                 protocols=None):
    """
    Common response parser.

    Selects candidate rows from ``response``, pulls an ip and a port out of
    each row and returns one ``ProxyUrlItem`` per (row, protocol) pair.
    Rows that cannot be parsed (no detail cells, too few cells, malformed
    ``ip:port`` text) or that fail ``self.proxy_check`` are skipped instead
    of aborting the whole page.

    :param response: scrapy response
    :param pre_extract_method: selector type used to pick the rows; 'xpath'
        (default) uses ``response.xpath``, any other value uses
        ``response.css``
    :param pre_extract: rule used to select all candidate rows
    :param infos_pos: start index of the rows to keep (xpath branch only)
    :param infos_end: end index of the rows to keep (xpath branch only);
        its value should be smaller than 0
    :param detail_rule: css rule extracting the ip and port cells of a row
    :param ip_pos: index of the ip inside the extracted cells
    :param port_pos: index of the port inside the extracted cells
    :param extract_protocol: if False (and ``protocols`` is empty),
        ``self.default_protocols`` is used
    :param split_detail: if True, the first extracted cell is expected to
        be ``ip:port`` and is split on ':'
    :param protocols: when truthy, used as the protocols for every row and
        takes precedence over ``extract_protocol``
    :return: list of ``ProxyUrlItem``
    """
    if pre_extract_method == 'xpath':
        infos = response.xpath(pre_extract)[infos_pos:infos_end]
    else:
        # NOTE: infos_pos / infos_end are not applied to css results.
        infos = response.css(pre_extract)

    items = []
    for info in infos:
        info_str = info.extract()
        # Drop rows whose markup mentions 'transparent' (or the Chinese '透明').
        if '透明' in info_str or 'transparent' in info_str.lower():
            continue
        proxy_detail = info.css(detail_rule).extract()
        if not proxy_detail:
            continue

        if split_detail:
            parts = proxy_detail[0].split(':')
            # Anything other than exactly "ip:port" cannot be unpacked.
            if len(parts) != 2:
                continue
            ip, port = (part.strip() for part in parts)
        else:
            try:
                ip = proxy_detail[ip_pos].strip()
                port = proxy_detail[port_pos].strip()
            except IndexError:
                # Row has fewer cells than ip_pos / port_pos require.
                continue
        if not self.proxy_check(ip, port):
            continue

        if protocols:
            cur_protocols = protocols
        elif extract_protocol:
            cur_protocols = self.procotol_extractor(info_str)
        else:
            cur_protocols = self.default_protocols

        for protocol in cur_protocols:
            items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))

    return items
| 73 | |
| 74 | def parse_json(self, response, detail_rule, ip_key='ip', port_key='port'): |
| 75 | """ |
no test coverage detected