生成所有的页面url @param root_url: 首页url type root_url: str @param page: 爬取的页面个数 type page: int
(root_url, page=51)
| 6 | from bs4 import BeautifulSoup |
| 7 | |
def yield_all_page_url(root_url, page=51):
    """Build the list of every listing-page URL to crawl.

    The first entry is ``root_url`` itself; it is followed by
    ``{root_url}index_1.html`` up to ``{root_url}index_{page - 1}.html``,
    so the result holds ``page`` URLs whenever ``page`` is at least 1.

    @param root_url: URL of the home page; it is also used verbatim as the
        prefix of every follow-up page URL.
    type root_url: str
    @param page: number of pages to crawl, the home page included.
    type page: int
    @return: the page URLs in crawl order, home page first.
    """
    # Pagination scheme observed on the target site: index_<n>.html.
    follow_up_pages = (f"{root_url}index_{n}.html" for n in range(1, page))
    # The home page carries no index suffix, so it leads the list unchanged.
    return [root_url, *follow_up_pages]
| 20 | |
| 21 | async def get_info_page_url(url, session): |
| 22 | regex = re.compile("<a href='./(.*?)'\s+title=") |
no outgoing calls
no test coverage detected