Method parse

scripts/request.py:367–383 · view source on GitHub ↗

(self, result, detail)

Source from the content-addressed store, hash-verified

365	super().__init__("HTML")
366
def parse(self, result, detail) -> Tuple[str, str, List[str]]:
    """Download an HTML page and split it into a title and body paragraphs.

    The page is fetched with ``self.request.get_html(detail["url"])`` and
    parsed with BeautifulSoup using the ``lxml`` parser. Paragraphs are the
    ``<p>`` elements inside ``<div class="law-content">``; non-breaking
    spaces are replaced by regular spaces, surrounding whitespace is
    stripped, and empty paragraphs are dropped. Paragraphs that merely
    repeat the beginning or the end of the page title are dropped as well.

    Args:
        result: Not used by this method.
        detail: Mapping that must contain a ``"url"`` key.

    Returns:
        A ``(title, first_paragraph, remaining_paragraphs)`` tuple, or
        ``None`` when the page could not be fetched, has no
        ``law-content`` container, or yields no usable paragraphs.
    """
    html_data = self.request.get_html(detail["url"])
    if not html_data:
        return None

    soup = BeautifulSoup(html_data, features="lxml")

    # A page without a <title> tag gets an empty title so that the
    # first-paragraph fallback below can run instead of raising.
    title = soup.title.text if soup.title else ""

    container = soup.find("div", class_="law-content")
    if container is None:
        # Nothing to extract; mirror the "no HTML" outcome.
        return None

    content = []
    for node in container.find_all("p"):
        text = node.text.replace("\xa0", " ").strip()
        if not text:
            continue
        # Skip paragraphs that only repeat the head or tail of the title.
        if title.startswith(text) or title.endswith(text):
            continue
        content.append(text)

    # With no page title, promote a leading paragraph that matches the
    # expected prefix to be the title.
    if content and not title and re.match("^中华人民共和国", content[0]):
        title = content[0]
        content = content[1:]

    if not content:
        return None
    return title, content[0], content[1:]
384
385
386	class ContentParser(object):

nothing calls this directly

get_html Method · 0.80

no test coverage detected