MCPcopy
hub / github.com/DropsDevopsOrg/ECommerceCrawlers / parse

Method parse

TaobaoCrawler/crawler.py:93–120  ·  view source on GitHub ↗
(self)

Source from the content-addressed store, hash-verified

91 return True
92
def parse(self):
    """Extract the auction records embedded in ``self.page`` and pass them on.

    The page is searched for a ``g_page_config = {...};`` assignment whose
    right-hand side is parsed as JSON.  Every entry found under
    ``mods -> itemlist -> data -> auctions`` is flattened into a dict that is
    printed, inserted via ``self.db.insert``, reported through
    ``self.logMessage.put`` and queued on ``self.data_search_shop_Queue``.

    Returns:
        ``False`` when the page holds no usable auction data (no match,
        unparsable JSON, or the auctions list is missing); ``None`` once the
        auctions list has been processed (including when it is empty).
    """
    pattern = re.compile(r'g_page_config = ({.*});')
    m = re.search(pattern, self.page)
    if not m:
        print('Cannot fount data in this page.')
        return False

    try:
        g_page_config = json.loads(m.group(1))
    except ValueError:
        # The greedy match can capture text that is not valid JSON; treat
        # that the same way as a page without the config blob.
        print('Cannot fount data in this page.')
        return False

    # Walk the nested structure defensively: any missing or non-dict level
    # means there is nothing to iterate, rather than an AttributeError.
    auctions = g_page_config
    for key in ("mods", "itemlist", "data", "auctions"):
        auctions = auctions.get(key) if isinstance(auctions, dict) else None
    if not isinstance(auctions, list):
        print('Cannot fount data in this page.')
        return False

    for auction in auctions:
        # The similar-style URL is optional; fall back to '' whenever the
        # i2iTags/samestyle levels are absent or not dicts.
        tags = auction.get('i2iTags')
        same_style = tags.get('samestyle') if isinstance(tags, dict) else None
        if isinstance(same_style, dict):
            simil_url_short = same_style.get('url', '')
        else:
            simil_url_short = ''

        d = {
            'keyword': self.KEYWORD,
            't_link': 'https:' + auction.get('detail_url', '/'),
            'title': auction.get('raw_title'),
            'price': auction.get('view_price'),
            'shop_name': auction.get('nick'),
            # Strip the textual suffixes so only the count remains.
            'sales_num': auction.get('view_sales', '0').replace('人收货', '').replace('人付款', ''),
            'simil_url_short': simil_url_short,
            'flag': 0,
        }
        print(d.get('keyword'), d.get('title'), d.get('simil_url_short'))
        self.db.insert(d)
        # NOTE(review): if logMessage is a queue.Queue, the 2nd and 3rd
        # positional arguments are taken as block/timeout, so only the
        # keyword would be enqueued -- confirm this is intended.
        self.logMessage.put(d.get('keyword'), d.get('title'), d.get('simil_url_short'))
        self.data_search_shop_Queue.put(d)
121
122 def run_cry(self):
123 while True:

Callers 1

get_pageMethod · 0.95

Calls 2

getMethod · 0.80
insertMethod · 0.45

Tested by

no test coverage detected