Function normalize_node

gne/utils.py:10–39 · view source on GitHub ↗

(element: HtmlElement)

Source from the content-addressed store, hash-verified

8
9
def normalize_node(element: HtmlElement):
    """Clean up the subtree rooted at *element* in place.

    Every element whose tag is listed in ``USELESS_TAG`` is stripped first.
    Each node yielded by ``iter_node(element)`` then goes through these
    steps, in order:

    1. If its tag is in ``TAGS_CAN_BE_REMOVE_IF_EMPTY`` and
       ``is_empty_element`` reports it empty, it is passed to
       ``remove_node``.
    2. If it is a ``p``, nested ``span`` and ``strong`` tags are stripped
       so their text merges into the paragraph.
    3. If it is a ``div`` or ``span`` with no children, it is retagged
       as ``p``.
    4. If it is (now) a ``p`` with no ``img`` descendant and no
       non-whitespace leading text, it is passed to ``drop_tag``.
    5. If its ``class`` attribute contains any ``USELESS_ATTR`` entry as a
       substring, it is passed to ``remove_node``.

    The steps are independent ``if`` checks, not an ``elif`` chain: a node
    handled by an earlier step is still examined by the later ones.

    :param element: root of the tree to normalize; modified in place.
    """
    etree.strip_elements(element, *USELESS_TAG)

    for node in iter_node(element):
        # Readability-inspired pruning of container tags that hold nothing.
        if node.tag.lower() in TAGS_CAN_BE_REMOVE_IF_EMPTY and is_empty_element(node):
            remove_node(node)

        # Fold inline span/strong text into the enclosing paragraph.
        if node.tag.lower() == 'p':
            for inline_tag in ('span', 'strong'):
                etree.strip_tags(node, inline_tag)

        # A childless div or span carries only text, so treat it as a paragraph.
        if node.tag.lower() in ('div', 'span') and len(node) == 0:
            node.tag = 'p'

        # The tag is re-read here because the step above may have changed it.
        is_paragraph = node.tag.lower() == 'p'
        if is_paragraph and not node.xpath('.//img') and not (node.text or '').strip():
            drop_tag(node)

        # Substring match of each USELESS_ATTR entry against the raw class value.
        class_name = node.get('class')
        if class_name and any(marker in class_name for marker in USELESS_ATTR):
            remove_node(node)
40
41
42	def html2element(html):

pre_parseFunction · 0.85

iter_nodeFunction · 0.85

is_empty_elementFunction · 0.85

remove_nodeFunction · 0.85

drop_tagFunction · 0.85

no test coverage detected