Parse HTML table into a dataframe. Parameters ---------- html_table : lxml.html.HtmlElement HTML table to clean up. Returns ------- pd.DataFrame
(html_table: html.HtmlElement)
| 1035 | |
| 1036 | |
def _parse_table(html_table: html.HtmlElement) -> pd.DataFrame:
    """Parse HTML table into a dataframe.

    Before parsing, the element is cleaned up in place: icon spans, spacer
    rows and header rows repeated inside the table body are removed. Only
    nodes below ``html_table`` are touched.

    Parameters
    ----------
    html_table : lxml.html.HtmlElement
        HTML table to clean up. The element is modified in place.

    Returns
    -------
    pd.DataFrame
        The parsed table after ``DataFrame.convert_dtypes()``.

    Raises
    ------
    ValueError
        If the serialized element does not yield exactly one table.
    """
    # remove icons
    for elem in html_table.xpath(".//span[contains(@class, 'f-i')]"):
        parent = elem.getparent()
        # A span matched above may already have been detached by a previous
        # iteration (see below), in which case it has no parent anymore.
        if parent is not None:
            # NOTE: this strips every <span> below the icon's parent, not
            # only the icon itself; with_tail=False keeps the text that
            # follows each removed span.
            etree.strip_elements(parent, "span", with_tail=False)
    # remove sep rows and thead rows in the table body
    # The leading "." keeps the queries relative to ``html_table``; a bare
    # "//" would be evaluated from the document root and delete rows from
    # every other table in the same document.
    unwanted_rows = (
        ".//tbody/tr[contains(@class, 'spacer')]",
        ".//tbody/tr[contains(@class, 'thead')]",
    )
    for row_query in unwanted_rows:
        for elem in html_table.xpath(row_query):
            elem.getparent().remove(elem)
    # parse HTML to dataframe
    markup = html.tostring(html_table, encoding="unicode")
    # Unpacking enforces that exactly one table was parsed.
    (df_table,) = pd.read_html(io.StringIO(markup), flavor="lxml")
    return df_table.convert_dtypes()
| 1065 | |
| 1066 | |
| 1067 | def _concat(dfs: list[pd.DataFrame], key: list[str]) -> pd.DataFrame: |
no outgoing calls
no test coverage detected