(element: HtmlElement)
| 8 | |
| 9 | |
def normalize_node(element: HtmlElement) -> None:
    """Normalize the tree rooted at *element* in place.

    Elements whose tag is listed in ``USELESS_TAG`` are stripped first.
    Every node yielded by ``iter_node(element)`` is then run through the
    following steps, in this order:

    1. Remove the node if its tag is in ``TAGS_CAN_BE_REMOVE_IF_EMPTY`` and
       ``is_empty_element`` reports it as empty.
    2. For ``<p>`` nodes, strip ``<span>`` and ``<strong>`` wrappers so that
       their text is merged into the paragraph.
    3. Retag a ``<div>`` or ``<span>`` that has no child nodes as ``<p>``.
    4. Call ``drop_tag`` on a ``<p>`` that has no ``<img>`` descendant and no
       non-whitespace leading text.
    5. Remove the node if its ``class`` attribute contains any entry of
       ``USELESS_ATTR`` as a substring.

    A removal does not skip the remaining steps; they still run on the same
    node.

    :param element: root of the tree to normalize; it is modified in place.
    :return: None.
    """
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # inspired by readability.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if node.tag.lower() == 'p':
            etree.strip_tags(node, 'span')
            etree.strip_tags(node, 'strong')

        # A div or span without any sub node is treated as a paragraph.
        # ``len(node)`` counts direct children and replaces the deprecated
        # ``getchildren()`` call.
        if node.tag.lower() in ('div', 'span') and len(node) == 0:
            node.tag = 'p'

        # remove empty p tag (a p that carries an image is kept). The tag is
        # re-read here because the step above may have just turned the node
        # into a p.
        if node.tag.lower() == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # Substring match against the raw class attribute value; the node is
        # removed once, on the first matching entry.
        class_name = node.get('class')
        if class_name and any(attribute in class_name for attribute in USELESS_ATTR):
            remove_node(node)
| 40 | |
| 41 | |
| 42 | def html2element(html): |
no test coverage detected