Method post_process

src/docstore/wikipedia.py:78–109 · view source on GitHub ↗

(
        self, response_text: str, entity: str, skip_retry_when_postprocess: bool = False
    )

Source from the content-addressed store, hash-verified

76	return alternative
77
def post_process(
    self, response_text: str, entity: str, skip_retry_when_postprocess: bool = False
) -> str:
    """Turn a raw HTML response for ``entity`` into an observation string.

    Three outcomes are possible, depending on what the HTML contains:

    * Search-result headings (``div.mw-search-result-heading``) are present:
      the lookup missed. The cleaned heading texts are stored on
      ``self.result_titles`` and the first five are reported as suggestions.
    * Some ``<p>``/``<ul>`` text contains "may refer to:": either report a
      plain "Could not find" message (when retrying is disabled by the
      argument or by ``self.skip_retry_when_postprocess``) or retry through
      ``self.search`` with the entity wrapped in square brackets.
    * Otherwise: the page text is stored on ``self.page``, the observation
      comes from ``self._get_page_obs`` and the lookup state is cleared.

    Args:
        response_text: HTML markup to parse.
        entity: The term that was searched for; used in messages and retry.
        skip_retry_when_postprocess: When true, never issue the retry search.

    Returns:
        The observation with every literal backslash-n sequence removed.
    """
    parsed = BeautifulSoup(response_text, features="html.parser")
    headings = parsed.find_all("div", {"class": "mw-search-result-heading"})

    # Search-result headings mean the entity was not matched directly.
    if headings:
        titles = []
        for heading in headings:
            titles.append(clean_str(heading.get_text().strip()))
        self.result_titles = titles
        observation = f"Could not find {entity}. Similar: {self.result_titles[:5]}."
        return observation.replace("\\n", "")

    blocks = parsed.find_all("p") + parsed.find_all("ul")
    texts = [block.get_text().strip() for block in blocks]
    is_disambiguation = any("may refer to:" in text for text in texts)

    if not is_disambiguation:
        # Keep only fragments that split into more than two space-separated
        # pieces, each terminated by a newline.
        kept = []
        for text in texts:
            if len(text.split(" ")) <= 2:
                continue
            cleaned = clean_str(text)
            kept.append(cleaned if text.endswith("\n") else cleaned + "\n")
        self.page = "".join(kept)
        observation = self._get_page_obs(self.page)
        # NOTE(review): the source view lost its indentation; the lookup-state
        # reset is assumed to belong to this page branch only -- confirm.
        self.lookup_keyword = self.lookup_list = self.lookup_cnt = None
    elif skip_retry_when_postprocess or self.skip_retry_when_postprocess:
        observation = "Could not find " + entity + "."
    else:
        observation = self.search("[" + entity + "]", is_retry=True)

    return observation.replace("\\n", "")
110
111	async def apost_process(
112	self, response_text: str, entity: str, skip_retry_when_postprocess: bool = False

searchMethod · 0.95

searchMethod · 0.95

_get_page_obsMethod · 0.95

clean_strFunction · 0.85

no test coverage detected