MCPcopy Index your code
hub / github.com/LAION-AI/Open-Assistant / StackExchangeBuilder

Class StackExchangeBuilder

scripts/data_augment/data_augment.py:135–271  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

133
134
135class StackExchangeBuilder(DataAugmenter):
def __init__(self, base_url=None, filter_opts=None):
    """
    Configure where StackExchange dumps are fetched from and how posts are filtered.

    Parameters:
    base_url (str): URL template containing a "{0}" placeholder, which
    get_all_filenames() fills with an archive file name. When None, the
    archive.org "view_archive" endpoint pointing at Posts.xml is used.
    filter_opts (list): Names of the filters to apply. When None, all of
    "accepted", "score", "convert_html" and "clean_tags" are enabled.
    """
    if base_url is None:
        base_url = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
    if filter_opts is None:
        # Build a fresh list per instance so instances never share the defaults.
        filter_opts = ["accepted", "score", "convert_html", "clean_tags"]

    self.base_url = base_url
    self.filter_opts = filter_opts
145
def get_all_filenames(self):
    """
    Scrape the archive.org StackExchange listing for downloadable dumps.

    Fetches https://archive.org/download/stackexchange, walks the anchors of
    the first table on the page and keeps every link whose href ends in "7z".

    Returns:
    urls (dict): Maps a dataset name (the part of the href before
    ".stackexchange", with "." and "-" replaced by "_") to self.base_url
    formatted with that href. Empty when the request is not ok or the page
    contains no table, so callers always receive a dict.

    Raises:
    requests.exceptions.RequestException: If the listing cannot be fetched,
    including when the server does not answer within the timeout.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get("https://archive.org/download/stackexchange", timeout=60)
    if not response.ok:
        return {}

    soup = bs(response.content, "html.parser")
    table = soup.find("table")
    if table is None:
        # Unexpected page layout: nothing to list rather than an AttributeError.
        return {}

    urls = {}
    # href=True skips anchors without an href, which would raise KeyError below.
    for link in table.find_all("a", href=True):
        url = link["href"]
        if not url.endswith("7z"):
            continue
        name = url.split(".stackexchange")[0].replace(".", "_").replace("-", "_")
        urls[name] = self.base_url.format(url)
    return urls
159
160 def xml_to_df(self, response: str):
161 """
162 Collect and Manually import XML into Dataframe
163
164 pd.read_xml() errors when XML trees are too large, this is just a hack to
165 download a XML file and parse into a Dataframe. **Not Tested on huge XML files**
166
167 Parameters:
168 response (Requests.Response): Requests response object with the XML data
169
170 Returns:
171 df (DataFrame): A Dataframe from the XML file
172 """
173 xml_format_map = {
174 "Id": int,
175 "PostTypeId": int,
176 "CreationDate": str,
177 "Score": int,
178 "ViewCount": int,
179 "Body": str,
180 "AnswerCount": int,
181 "CommentCount": int,
182 "ContentLicense": str,
183 "AcceptedAnswerId": int,
184 "ParentId": int,
185 }
186 soup = bs(response.content, "xml")
187 posts = soup.find_all("row")
188
189 all_posts = [post.attrs for post in posts]
190
191 df = pd.DataFrame(all_posts)
192 df.AnswerCount.fillna(0, inplace=True)

Callers 1

get_augmenter · Function · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected