Regex (slow) or XPath (fast)
(new_page)
| 21 | return mypage_Info |
| 22 | |
def New_Page_Info(new_page):
    """Extract (link text, href) pairs from the table links of an HTML page.

    The page is parsed with ``etree.HTML`` and queried with XPath. An earlier
    regex-based variant was dropped in favour of XPath, which the original
    note describes as the fast option (regex being the slow one).

    Args:
        new_page: HTML markup of the page to scan.

    Returns:
        The result of ``zip(texts, hrefs)``, where ``texts`` are the text
        nodes directly under, and ``hrefs`` the ``href`` attribute values of,
        the ``<a>`` elements matched by ``//tr/td/a``.

    Raises:
        AssertionError: If the number of text nodes differs from the number
            of ``href`` values.
    """
    dom = etree.HTML(new_page)
    new_items = dom.xpath('//tr/td/a/text()')
    new_urls = dom.xpath('//tr/td/a/@href')
    # Raise explicitly instead of using an ``assert`` statement: asserts are
    # removed under ``python -O``, after which zip() would silently truncate
    # to the shorter list and could pair a text with the wrong href.
    # NOTE(review): the two queries are independent, so equal counts alone do
    # not prove each text belongs to the href it is paired with -- confirm
    # against the real page markup.
    if len(new_items) != len(new_urls):
        raise AssertionError(
            "link text/href count mismatch: %d texts vs %d hrefs"
            % (len(new_items), len(new_urls))
        )
    return zip(new_items, new_urls)
| 36 | |
| 37 | def Spider(url): |
| 38 | i = 0 |