MCPcopy Index your code
hub / github.com/LawRefBook/Laws / parse

Method parse

scripts/request.py:367–383  ·  view source on GitHub ↗
(self, result, detail)

Source from the content-addressed store, hash-verified

365 super().__init__("HTML")
366
def parse(self, result, detail) -> Tuple[str, str, List[str]]:
    """Fetch the page at ``detail["url"]`` and split it into a title and paragraphs.

    The HTML is parsed with the ``lxml`` backend. Paragraph text is taken
    from the ``<p>`` tags inside ``<div class="law-content">``; non-breaking
    spaces are replaced by regular spaces, surrounding whitespace is
    stripped, empty paragraphs are dropped, and any paragraph that is a
    prefix or suffix of the page title is dropped as well.

    Args:
        result: Not used by this implementation.
        detail: Mapping that must contain a ``"url"`` key.

    Returns:
        A ``(title, first_paragraph, remaining_paragraphs)`` tuple, or
        ``None`` when the page could not be fetched, has no
        ``law-content`` container, or yields no usable paragraphs.
    """
    html_data = self.request.get_html(detail["url"])
    if not html_data:
        return None

    soup = BeautifulSoup(html_data, features="lxml")

    # ``soup.title`` is None when the document has no <title> tag; fall back
    # to an empty title so the first-paragraph heuristic below can apply.
    title_tag = soup.title
    title = title_tag.text if title_tag is not None else ""

    container = soup.find("div", class_="law-content")
    if container is None:
        # Same "nothing to parse" signal as a failed fetch.
        return None

    paragraphs = []
    for node in container.find_all("p"):
        text = node.text.replace("\xa0", " ").strip()
        if not text:
            continue
        # Skip paragraphs that merely repeat the beginning or end of the title.
        if title.startswith(text) or title.endswith(text):
            continue
        paragraphs.append(text)

    if not paragraphs:
        return None

    # With no page title, promote a first paragraph that starts with
    # "中华人民共和国" (People's Republic of China) to be the title.
    if not title and re.match("^中华人民共和国", paragraphs[0]):
        title, paragraphs = paragraphs[0], paragraphs[1:]
        if not paragraphs:
            return None

    return title, paragraphs[0], paragraphs[1:]
384
385
386class ContentParser(object):

Callers

nothing calls this directly

Calls 1

get_htmlMethod · 0.80

Tested by

no test coverage detected