hub / github.com/LAION-AI/Open-Assistant / StackExchangeBuilder

Class StackExchangeBuilder

scripts/data_augment/data_augment.py:135–271 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

133
134
135	class StackExchangeBuilder(DataAugmenter):
136	def __init__(self, base_url=None, filter_opts=None):
137	self.base_url = (
138	base_url
139	if base_url is not None
140	else "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
141	)
142	self.filter_opts = (
143	filter_opts if filter_opts is not None else ["accepted", "score", "convert_html", "clean_tags"]
144	)
145
def get_all_filenames(self, timeout=60):
    """
    Map each StackExchange archive name to its Posts.xml URL.

    Downloads the listing page at https://archive.org/download/stackexchange,
    looks at the links inside the first <table>, and keeps those whose href
    ends in "7z". Each kept href is substituted into ``self.base_url``.

    Parameters:
        timeout (float or tuple): Passed to ``requests.get`` so a stalled
            connection cannot block forever. Defaults to 60 seconds.

    Returns:
        urls (dict or None): ``{name: url}`` where ``name`` is the part of
            the href before ".stackexchange" with "." and "-" replaced by
            "_". Empty when the page has no table or no matching links.
            None when the listing request did not succeed.
    """
    response = requests.get("https://archive.org/download/stackexchange", timeout=timeout)
    if not response.ok:
        # Kept as None (not an exception) so existing callers see no new error type.
        return None

    soup = bs(response.content, "html.parser")
    table = soup.find("table")
    if table is None:
        # find() yields None when the page layout has no <table>; report
        # "no archives" instead of failing with AttributeError below.
        return {}

    urls = {}
    # href=True limits the search to anchors that actually carry an href.
    for link in table.find_all("a", href=True):
        url = link["href"]
        if not url.endswith("7z"):
            continue
        # Derive the key only for links that are kept.
        name = url.split(".stackexchange")[0].replace(".", "_").replace("-", "_")
        urls[name] = self.base_url.format(url)
    return urls
159
160	def xml_to_df(self, response: str):
161	"""
162	Collect and Manually import XML into Dataframe
163
164	pd.read_xml() errors when XML trees are too large, this is just a hack to
165	download a XML file and parse into a Dataframe. Not Tested on huge XML files
166
167	Parameters:
168	response (Requests.Response): Requests response object with the XML data
169
170	Returns:
171	df (DataFrame): A Dataframe from the XML file
172	"""
173	xml_format_map = {
174	"Id": int,
175	"PostTypeId": int,
176	"CreationDate": str,
177	"Score": int,
178	"ViewCount": int,
179	"Body": str,
180	"AnswerCount": int,
181	"CommentCount": int,
182	"ContentLicense": str,
183	"AcceptedAnswerId": int,
184	"ParentId": int,
185	}
186	soup = bs(response.content, "xml")
187	posts = soup.find_all("row")
188
189	all_posts = [post.attrs for post in posts]
190
191	df = pd.DataFrame(all_posts)
192	df.AnswerCount.fillna(0, inplace=True)

Callers 1

get_augmenter (Function) · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected