MCPcopy Index your code
hub / github.com/GeneralNewsExtractor/GeneralNewsExtractor / normalize_node

Function normalize_node

gne/utils.py:10–39  ·  view source on GitHub ↗
(element: HtmlElement)

Source from the content-addressed store, hash-verified

8
9
def normalize_node(element: HtmlElement):
    """Clean up the tree rooted at *element* in place.

    All ``USELESS_TAG`` elements are stripped first.  Every node yielded by
    ``iter_node(element)`` then goes through the following steps, in order:

    1. If its tag is listed in ``TAGS_CAN_BE_REMOVE_IF_EMPTY`` and
       ``is_empty_element`` reports it empty, it is passed to
       ``remove_node``.
    2. For a ``<p>``, nested ``<span>`` / ``<strong>`` tags are unwrapped so
       their text merges into the paragraph.
    3. A ``<div>`` or ``<span>`` without child nodes is retagged as ``<p>``.
    4. A ``<p>`` (including one just retagged) that has no ``<img>``
       descendant and no non-blank leading text is passed to ``drop_tag``.
    5. If the ``class`` attribute contains any ``USELESS_ATTR`` entry as a
       substring, the node is passed to ``remove_node``.

    :param element: root of the tree to normalize; it is mutated in place.
    :return: None
    """
    etree.strip_elements(element, *USELESS_TAG)
    for node in iter_node(element):
        # lxml comments / processing instructions expose a callable, not a
        # string, as ``.tag``; skip them so ``.lower()`` cannot raise.
        # NOTE(review): iter_node may already filter these out - confirm.
        if not isinstance(node.tag, str):
            continue

        # Lower-case once; kept in sync below when the node is retagged.
        tag = node.tag.lower()

        # inspired by readability.
        if tag in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # merge text in span or strong to parent p tag
        if tag == 'p':
            etree.strip_tags(node, 'span', 'strong')

        # a div/span that does not contain any sub node can be converted to
        # a p node.  ``len(node)`` replaces the deprecated ``getchildren()``.
        if tag in ('div', 'span') and len(node) == 0:
            node.tag = tag = 'p'

        # remove empty p tag (only leading text is checked, and a p holding
        # an image is always kept)
        if tag == 'p' and not node.xpath('.//img'):
            if not (node.text and node.text.strip()):
                drop_tag(node)

        # substring match against the raw class attribute value
        class_name = node.get('class')
        if class_name and any(attribute in class_name for attribute in USELESS_ATTR):
            remove_node(node)
40
41
42def html2element(html):

Callers 1

pre_parseFunction · 0.85

Calls 4

iter_nodeFunction · 0.85
is_empty_elementFunction · 0.85
remove_nodeFunction · 0.85
drop_tagFunction · 0.85

Tested by

no test coverage detected