MCPcopy
hub / github.com/SpiderClub/haipproxy / parse_common

Method parse_common

crawler/spiders/base.py:21–72  ·  view source on GitHub ↗

Common response parser :param response: scrapy response :param pre_extract_method: extracting method for extracting all infos, xpath is the default value :param pre_extract: pre parsing rule for extracting all infos :param infos_pos: pos for extracting infos

(self, response, pre_extract_method='xpath',
                     pre_extract='//tr', infos_pos=1, infos_end=None,
                     detail_rule='td::text', ip_pos=0, port_pos=1,
                     extract_protocol=True, split_detail=False,
                     protocols=None)

Source from the content-addressed store, hash-verified

19 }
20
def parse_common(self, response, pre_extract_method='xpath',
                 pre_extract='//tr', infos_pos=1, infos_end=None,
                 detail_rule='td::text', ip_pos=0, port_pos=1,
                 extract_protocol=True, split_detail=False,
                 protocols=None):
    """
    Common response parser

    Rows are skipped (never raised on) when they mention a transparent
    proxy, yield no detail cells, are too short or malformed to provide
    an ip/port pair, or are rejected by ``self.proxy_check``.

    :param response: scrapy response
    :param pre_extract_method: extracting method for extracting all infos,
        xpath is the default value; any other value uses a css selector
    :param pre_extract: pre parsing rule for extracting all infos
    :param infos_pos: start pos for extracting infos (only applied when
        pre_extract_method is 'xpath')
    :param infos_end: end pos for extracting infos, its value should be
        smaller than 0 (only applied when pre_extract_method is 'xpath')
    :param detail_rule: rule for extracting ip and port block, css selector is used here
    :param ip_pos: ip index
    :param port_pos: port index
    :param extract_protocol: if extract_protocol == False, default protocols will be used
    :param split_detail: if split_detail == True, ':' will be used to split
        ip:port taken from the first extracted detail
    :param protocols: this value will be used for the ip's protocols; when
        given it takes precedence over extract_protocol
    :return: list of ProxyUrlItem, one per accepted proxy and protocol
    """
    if pre_extract_method == 'xpath':
        infos = response.xpath(pre_extract)[infos_pos:infos_end]
    else:
        # NOTE: infos_pos / infos_end are intentionally not applied here,
        # existing css callers rely on receiving every matched node.
        infos = response.css(pre_extract)

    items = []
    for info in infos:
        info_str = info.extract()
        # '透明' is Chinese for "transparent"; such proxies are not collected.
        if '透明' in info_str or 'transparent' in info_str.lower():
            continue
        proxy_detail = info.css(detail_rule).extract()
        if not proxy_detail:
            continue

        # One malformed row (too few cells, or a cell without exactly one
        # ':' when splitting) must not abort parsing of the whole page.
        try:
            if not split_detail:
                ip = proxy_detail[ip_pos].strip()
                port = proxy_detail[port_pos].strip()
            else:
                ip, port = (part.strip()
                            for part in proxy_detail[0].split(':'))
        except (IndexError, ValueError):
            continue

        if not self.proxy_check(ip, port):
            continue

        if protocols:
            cur_protocols = protocols
        elif extract_protocol:
            cur_protocols = self.procotol_extractor(info_str)
        else:
            cur_protocols = self.default_protocols

        for protocol in cur_protocols:
            items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))

    return items
73
74 def parse_json(self, response, detail_rule, ip_key='ip', port_key='port'):
75 """

Callers 4

parseMethod · 0.80
parseMethod · 0.80
parseMethod · 0.80
parseMethod · 0.80

Calls 4

proxy_checkMethod · 0.95
procotol_extractorMethod · 0.95
construct_proxy_urlMethod · 0.95
ProxyUrlItemClass · 0.85

Tested by

no test coverage detected