hub / github.com/SpiderClub/haipproxy / parse_common

Method parse_common

crawler/spiders/base.py:21–72 · view source on GitHub ↗

Common response parser. :param response: scrapy response :param pre_extract_method: method used to extract all infos; xpath is the default value :param pre_extract: pre-parsing rule for extracting all infos :param infos_pos: start position for extracting infos

(self, response, pre_extract_method='xpath',
                     pre_extract='//tr', infos_pos=1, infos_end=None,
                     detail_rule='td::text', ip_pos=0, port_pos=1,
                     extract_protocol=True, split_detail=False,
                     protocols=None)

Source from the content-addressed store, hash-verified

19	}
20
def parse_common(self, response, pre_extract_method='xpath',
                 pre_extract='//tr', infos_pos=1, infos_end=None,
                 detail_rule='td::text', ip_pos=0, port_pos=1,
                 extract_protocol=True, split_detail=False,
                 protocols=None):
    """
    Common response parser.

    Selects candidate rows from the response, drops rows marked as
    transparent proxies, extracts ip and port from each remaining row and
    builds one ProxyUrlItem per protocol for every row that passes
    ``self.proxy_check``. Rows whose cells cannot yield an ip/port pair
    (too few cells, or no single ':' separator when ``split_detail`` is
    set) are skipped instead of aborting the whole page.

    :param response: scrapy response
    :param pre_extract_method: method used to select all rows; 'xpath'
        is the default, any other value selects with css
    :param pre_extract: pre-parsing rule for selecting all rows
    :param infos_pos: start pos of the row slice (applied to the xpath
        selection only)
    :param infos_end: end pos of the row slice, its value should be
        smaller than 0 (applied to the xpath selection only)
    :param detail_rule: rule for extracting ip and port block, css selector is used here
    :param ip_pos: ip index
    :param port_pos: port index
    :param extract_protocol: if extract_protocol == False, default protocols will be used
    :param split_detail: if split_detail == True, ':' will be used to split ip:port
    :param protocols: this value will be used for the ip's protocols
    :return: ip infos
    """
    if pre_extract_method == 'xpath':
        infos = response.xpath(pre_extract)[infos_pos:infos_end]
    else:
        # NOTE: the css selection is intentionally left unsliced, as before;
        # infos_pos / infos_end have no effect on this branch.
        infos = response.css(pre_extract)

    items = []
    for info in infos:
        info_str = info.extract()
        # Transparent proxies are not wanted ('透明' means "transparent").
        if '透明' in info_str or 'transparent' in info_str.lower():
            continue

        proxy_detail = info.css(detail_rule).extract()
        if not proxy_detail:
            continue

        try:
            if split_detail:
                # Expect exactly one "ip:port" pair in the first cell.
                ip, port = proxy_detail[0].split(':')
            else:
                ip, port = proxy_detail[ip_pos], proxy_detail[port_pos]
        except (IndexError, ValueError):
            # Malformed row (header/ad row with too few cells, or a cell
            # that is not "ip:port"): skip it, keep parsing the rest.
            continue

        # Strip in both modes so surrounding whitespace from the markup
        # never reaches the validity check or the constructed url.
        ip, port = ip.strip(), port.strip()
        if not self.proxy_check(ip, port):
            continue

        if protocols:
            cur_protocols = protocols
        elif extract_protocol:
            cur_protocols = self.procotol_extractor(info_str)
        else:
            cur_protocols = self.default_protocols

        for protocol in cur_protocols:
            items.append(ProxyUrlItem(url=self.construct_proxy_url(protocol, ip, port)))

    return items
73
74	def parse_json(self, response, detail_rule, ip_key='ip', port_key='port'):
75	"""

Callers 4

parseMethod · 0.80

Calls 4

proxy_checkMethod · 0.95

procotol_extractorMethod · 0.95

construct_proxy_urlMethod · 0.95

ProxyUrlItemClass · 0.85

Tested by

no test coverage detected