Function _sanitizeHTML

module/lib/feedparser.py:2725–2763 · view source on GitHub ↗

(htmlSource, encoding, _type)

Source from the content-addressed store, hash-verified

2723
2724
def _sanitizeHTML(htmlSource, encoding, _type):
    """Sanitize an HTML fragment and optionally clean it up with HTML Tidy.

    The markup is fed through ``_HTMLSanitizer(encoding, _type)`` and the
    sanitizer's output is taken as the result.  When the module-level
    ``TIDY_MARKUP`` flag is true, the first importable interface listed in
    ``PREFERRED_TIDY_INTERFACES`` ("uTidy" or "mxTidy") is used to tidy the
    result into XHTML, after which only the contents of the ``<body>``
    element are kept.

    Parameters:
        htmlSource: the markup to sanitize (str or unicode).
        encoding:   passed through unchanged to ``_HTMLSanitizer``.
        _type:      passed through unchanged to ``_HTMLSanitizer``.

    Returns:
        The sanitized markup with leading/trailing whitespace stripped and
        CRLF line endings converted to LF.
    """
    p = _HTMLSanitizer(encoding, _type)
    # Escape the CDATA opener before parsing.  Replacing the marker with an
    # identical string would be a no-op; the '<' has to become an entity.
    # NOTE(review): presumably this keeps the sanitizer from treating the
    # section as CDATA rather than as text -- confirm against _HTMLSanitizer.
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # loop through list of preferred Tidy interfaces looking for one that's installed,
        # then set up a common _tidy function to wrap the interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except Exception:
                # Best effort: an interface that is missing or fails to load
                # is skipped and the next preference is tried.  Catching
                # Exception (not everything) lets KeyboardInterrupt and
                # SystemExit propagate.
                pass
        if _tidy:
            # Tidy is handed UTF-8 bytes (char_encoding="utf8"); remember
            # whether the input was unicode so it can be decoded afterwards.
            was_unicode = isinstance(data, type(u''))
            if was_unicode:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if was_unicode:
                data = unicode(data, 'utf-8')
            # Tidy emits a complete document; keep only what lies between
            # the end of the <body ...> start tag and the </body end tag.
            if data.count('<body'):
                data = data.split('<body', 1)[1]
                if data.count('>'):
                    data = data.split('>', 1)[1]
            if data.count('</body'):
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
2764
2765	class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler):
2766	def http_error_default(self, req, fp, code, msg, headers):

popMethod · 0.85

_HTMLSanitizerClass · 0.85

_tidyFunction · 0.85

feedMethod · 0.80

outputMethod · 0.80

splitMethod · 0.80

replaceMethod · 0.45

encodeMethod · 0.45

no test coverage detected