MCPcopy
hub / github.com/yhangf/PythonCrawler / save_html

Function save_html

spiderFile/ECUT_pos_html.py:24–48  ·  view source on GitHub ↗
()

Source from the content-addressed store, hash-verified

22 return True
23 else: pass
def save_html():
    """Save the main ``<div>`` of every accepted page as a local HTML file.

    For each URL returned by ``crawl_all_main_url()`` for which
    ``get_title(url)`` is truthy, the page is downloaded once and decoded as
    GBK. If the page contains an image reference of the form
    ``border=0 src="..<path>"``, the relative ``src="..<path>"`` of the first
    match is rewritten to an absolute URL under ``domain_url`` so the image
    still resolves when the saved file is opened locally. The first
    ``<div id="main">`` is then wrapped in a minimal XHTML shell and written,
    UTF-8 encoded, to ``./<last 11 characters of the URL>.html``.

    Returns:
        None. The function's effect is the files it writes.

    Raises:
        IndexError: if a page has no ``<div id="main">``. The page is parsed
            before the output file is opened, so no empty file is left behind.
        UnicodeDecodeError: if a response body is not valid GBK.
    """
    domain_url = 'http://zjc.ecit.edu.cn/jy'
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    # Compiled once here instead of on every loop iteration.
    img_url_reg = re.compile(r'border=0 src="\.\.(.*?)"')
    # NOTE(review): this shell has no opening <body> and no closing </html>;
    # it is reproduced unchanged so the saved files stay identical.
    page_template = (
        '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
        '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
        'U职网提供数据咨询服务 '
        '<html xmlns="http://www.w3.org/1999/xhtml"> '
        '<head> '
        '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> '
        '</head> '
        '{0} '
        '</body>'
    )

    for son_url in crawl_all_main_url():
        if not get_title(son_url):
            continue

        # Single download per page; the original fetched the page a second
        # time in the no-image branch and never used the result.
        text_html = requests.get(son_url).content.decode('gbk')

        child_url = img_url_reg.findall(text_html)
        if child_url:
            # Only the first matched path is rewritten, as before;
            # str.replace updates every occurrence of that exact attribute.
            re_url = 'src="..{0}"'.format(child_url[0])
            end_url = 'src="{0}"'.format(domain_url + child_url[0])
            text_html = text_html.replace(re_url, end_url)

        # Parse before opening the file so a missing <div id="main"> cannot
        # leave a truncated, empty output file on disk.
        soup = bs(text_html, 'lxml')
        text_div = soup.find_all('div', id='main')[0]
        page_bytes = page_template.format(text_div).encode('utf-8')

        with open('./{0}.html'.format(son_url[-11:]), 'wb') as file:
            file.write(page_bytes)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    save_html()

Callers 1

ECUT_pos_html.py — File · 0.85

Calls 2

crawl_all_main_url — Function · 0.85
get_title — Function · 0.85

Tested by

no test coverage detected