| 2723 | |
| 2724 | |
def _sanitizeHTML(htmlSource, encoding, _type):
    """Sanitize an HTML fragment and return the cleaned markup.

    The markup is fed through ``_HTMLSanitizer``.  When ``TIDY_MARKUP`` is
    true and one of the interfaces named in ``PREFERRED_TIDY_INTERFACES``
    can be imported, the sanitized output is additionally run through Tidy
    and reduced to the contents of its ``<body>`` element.

    htmlSource -- the markup to sanitize
    encoding   -- handed unchanged to ``_HTMLSanitizer``
    _type      -- handed unchanged to ``_HTMLSanitizer``

    Returns the sanitized markup with surrounding whitespace stripped and
    CRLF line endings converted to LF.
    """
    p = _HTMLSanitizer(encoding, _type)
    # Escape the opening of CDATA sections so the sanitizer is handed them
    # as character data instead of as a markup declaration.
    htmlSource = htmlSource.replace('<![CDATA[', '&lt;![CDATA[')
    p.feed(htmlSource)
    data = p.output()
    if TIDY_MARKUP:
        # Loop through the list of preferred Tidy interfaces looking for one
        # that is installed, then set up a common _tidy function to wrap the
        # interface-specific API.
        _tidy = None
        for tidy_interface in PREFERRED_TIDY_INTERFACES:
            try:
                if tidy_interface == "uTidy":
                    from tidy import parseString as _utidy
                    def _tidy(data, **kwargs):
                        return str(_utidy(data, **kwargs))
                    break
                elif tidy_interface == "mxTidy":
                    from mx.Tidy import Tidy as _mxtidy
                    def _tidy(data, **kwargs):
                        nerrors, nwarnings, data, errordata = _mxtidy.tidy(data, **kwargs)
                        return data
                    break
            except Exception:
                # Best effort: an interface that is missing or fails to load
                # is skipped and the next candidate is tried.  Only Exception
                # is caught so KeyboardInterrupt/SystemExit still propagate.
                pass
        if _tidy:
            # Tidy is given UTF-8 encoded bytes; remember whether the input
            # was a unicode string so the result can be decoded back.
            was_unicode = isinstance(data, type(u''))
            if was_unicode:
                data = data.encode('utf-8')
            data = _tidy(data, output_xhtml=1, numeric_entities=1, wrap=0, char_encoding="utf8")
            if was_unicode:
                data = data.decode('utf-8')
            # Tidy emits a complete document; keep only what lies between
            # the end of the <body ...> start tag and the </body end tag.
            if '<body' in data:
                data = data.split('<body', 1)[1]
                if '>' in data:
                    data = data.split('>', 1)[1]
            if '</body' in data:
                data = data.split('</body', 1)[0]
    data = data.strip().replace('\r\n', '\n')
    return data
| 2764 | |
| 2765 | class _FeedURLHandler(urllib2.HTTPDigestAuthHandler, urllib2.HTTPRedirectHandler, urllib2.HTTPDefaultErrorHandler): |
| 2766 | def http_error_default(self, req, fp, code, msg, headers): |