| 365 | super().__init__("HTML") |
| 366 | |
def parse(self, result, detail) -> Tuple[str, str, List[str]]:
    """Download the page at ``detail["url"]`` and split it into title, lead line and body.

    The text is taken from the ``<p>`` elements inside ``<div class="law-content">``.
    Non-breaking spaces are replaced by plain spaces, blank paragraphs are dropped,
    and paragraphs that merely repeat the beginning or the end of the page title
    are removed.

    Args:
        result: Not used by this method.
        detail: Mapping that must contain the key ``"url"``.

    Returns:
        ``(title, first_paragraph, remaining_paragraphs)``. ``None`` is returned
        (despite the annotation) when no HTML could be fetched, when the
        ``law-content`` container is missing, or when no usable paragraph is left.
    """
    html_data = self.request.get_html(detail["url"])
    if not html_data:
        return None

    soup = BeautifulSoup(html_data, features="lxml")
    # ``soup.title`` is None for pages without a <title> tag; fall back to an
    # empty title so the heading-based recovery below can still supply one.
    title = soup.title.text if soup.title is not None else ""

    container = soup.find("div", class_="law-content")
    if container is None:
        # Nothing to extract: report it the same way as a failed download.
        return None

    content = []
    for node in container.find_all("p"):
        text = node.text.replace("\xa0", " ").strip()
        # Keep only non-empty paragraphs that do not duplicate the title's head or tail.
        if text and not title.startswith(text) and not title.endswith(text):
            content.append(text)

    # Without a page title, promote a leading paragraph that starts with
    # "中华人民共和国" ("People's Republic of China") to be the title.
    if not title and content and re.match("^中华人民共和国", content[0]):
        title, content = content[0], content[1:]

    if not content:
        # Avoid IndexError on pages whose body is empty after filtering.
        return None
    return title, content[0], content[1:]
| 384 | |
| 385 | |
| 386 | class ContentParser(object): |