| 133 | |
| 134 | |
| 135 | class StackExchangeBuilder(DataAugmenter): |
def __init__(self, base_url=None, filter_opts=None):
    """
    Store the download URL template and the list of filter options.

    Parameters:
        base_url (str): URL template containing a ``{0}`` placeholder that
            ``get_all_filenames`` fills with an archive's href. When omitted,
            the archive.org ``view_archive.php`` URL pointing at ``Posts.xml``
            is used.
        filter_opts (list): Names of the filters to apply. When omitted, all
            four of "accepted", "score", "convert_html" and "clean_tags" are
            selected. (How each option is interpreted is decided by code
            outside this method.)
    """
    # Only ``None`` triggers a default; an explicitly passed empty string or
    # empty list is kept as given.
    if base_url is None:
        base_url = "https://ia600107.us.archive.org/view_archive.php?archive=/27/items/stackexchange/{0}&file=Posts.xml"
    if filter_opts is None:
        # Built fresh on every call so instances never share one list.
        filter_opts = ["accepted", "score", "convert_html", "clean_tags"]

    self.base_url = base_url
    self.filter_opts = filter_opts
| 145 | |
def get_all_filenames(self, timeout=30):
    """
    Collect the ``7z`` archives listed on the archive.org StackExchange page.

    Fetches ``https://archive.org/download/stackexchange``, looks at the
    anchors inside the first ``<table>`` of the page and keeps those whose
    href ends in ``7z``.

    Parameters:
        timeout (float or tuple): Seconds handed to ``requests.get`` so a
            stalled connection cannot block forever. Defaults to 30.

    Returns:
        urls (dict or None): Maps a normalised name to ``self.base_url``
            formatted with the archive's href. The name is the part of the
            href before ``.stackexchange`` with ``.`` and ``-`` replaced by
            ``_`` (e.g. ``3dprinting.meta.stackexchange.com.7z`` becomes
            ``3dprinting_meta``). An empty dict is returned when the page
            contains no table; ``None`` is returned when the listing request
            does not come back with an OK status.
    """
    response = requests.get("https://archive.org/download/stackexchange", timeout=timeout)
    if not response.ok:
        # Kept as ``None`` (rather than raising) so existing callers that
        # relied on the previous implicit ``None`` keep working.
        return None

    soup = bs(response.content, "html.parser")
    table = soup.find("table")
    if table is None:
        # Page layout changed or the listing is empty: nothing to offer.
        return {}

    urls = {}
    # ``href=True`` skips anchors without an href instead of raising KeyError.
    for link in table.find_all("a", href=True):
        url = link["href"]
        if not url.endswith("7z"):
            continue
        name = url.split(".stackexchange")[0].replace(".", "_").replace("-", "_")
        urls[name] = self.base_url.format(url)
    return urls
| 159 | |
| 160 | def xml_to_df(self, response: str): |
| 161 | """ |
| 162 | Collect and Manually import XML into Dataframe |
| 163 | |
| 164 | pd.read_xml() errors when XML trees are too large, this is just a hack to |
| 165 | download a XML file and parse into a Dataframe. **Not Tested on huge XML files** |
| 166 | |
| 167 | Parameters: |
| 168 | response (Requests.Response): Requests response object with the XML data |
| 169 | |
| 170 | Returns: |
| 171 | df (DataFrame): A Dataframe from the XML file |
| 172 | """ |
| 173 | xml_format_map = { |
| 174 | "Id": int, |
| 175 | "PostTypeId": int, |
| 176 | "CreationDate": str, |
| 177 | "Score": int, |
| 178 | "ViewCount": int, |
| 179 | "Body": str, |
| 180 | "AnswerCount": int, |
| 181 | "CommentCount": int, |
| 182 | "ContentLicense": str, |
| 183 | "AcceptedAnswerId": int, |
| 184 | "ParentId": int, |
| 185 | } |
| 186 | soup = bs(response.content, "xml") |
| 187 | posts = soup.find_all("row") |
| 188 | |
| 189 | all_posts = [post.attrs for post in posts] |
| 190 | |
| 191 | df = pd.DataFrame(all_posts) |
| 192 | df.AnswerCount.fillna(0, inplace=True) |