hub / github.com/pyload/pyload / _toUTF8

Function _toUTF8

module/lib/feedparser.py:3512–3563 · view source on GitHub ↗

Changes an XML data stream on the fly to specify a new encoding. `data` is a raw sequence of bytes (not Unicode) that is presumed to be in `encoding` already; `encoding` is a string recognized by encodings.aliases.

(data, encoding)

Source from the content-addressed store, hash-verified

3510	return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type
3511
def _toUTF8(data, encoding):
    '''Re-encode an XML byte stream as UTF-8 and rewrite its XML declaration

    data is a raw sequence of bytes (not Unicode) that is presumed to be in
    %encoding already
    encoding is a string recognized by encodings.aliases

    A leading Byte Order Mark, if one is recognised, is stripped and the
    encoding it implies replaces the encoding that was passed in.  The
    decoded text then has its leading <?xml ...?> declaration replaced (or a
    declaration prepended when there is none) so that it names utf-8, and
    the result is returned encoded as UTF-8.

    Nothing is caught here: whatever unicode() raises for an unknown
    encoding or for undecodable data propagates to the caller.
    '''
    if _debug:
        sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)

    # strip Byte Order Mark (if present)
    # Each entry: (BOM byte values, encoding the BOM implies, is UTF-16 mark).
    # Entries are tried in this order and the first hit wins.  A UTF-16 mark
    # only counts when data holds at least 4 bytes and bytes 3-4 are not both
    # NUL (ff fe 00 00 is the UTF-32LE mark, not UTF-16LE).
    known_boms = (
        ([0xfe, 0xff], 'utf-16be', True),
        ([0xff, 0xfe], 'utf-16le', True),
        ([0xef, 0xbb, 0xbf], 'utf-8', False),
        ([0x00, 0x00, 0xfe, 0xff], 'utf-32be', False),
        ([0xff, 0xfe, 0x00, 0x00], 'utf-32le', False),
    )
    for bom_values, bom_encoding, is_utf16 in known_boms:
        bom_length = len(bom_values)
        if is_utf16:
            found = ((len(data) >= 4)
                     and (data[:bom_length] == _l2bytes(bom_values))
                     and (data[2:4] != _l2bytes([0x00, 0x00])))
        else:
            found = data[:bom_length] == _l2bytes(bom_values)
        if not found:
            continue
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != bom_encoding:
                sys.stderr.write('trying %s instead\n' % bom_encoding)
        # The BOM is authoritative: it overrides the caller's encoding.
        encoding = bom_encoding
        data = data[bom_length:]
        break

    newdata = unicode(data, encoding)
    if _debug:
        sys.stderr.write('successfully converted %s data to unicode\n' % encoding)

    # Raw string: '\?' is not a string escape, so a plain literal only works
    # by accident and triggers an invalid-escape warning on newer Pythons.
    declmatch = re.compile(r'^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        # An existing declaration is swapped for the utf-8 one.
        newdata = declmatch.sub(newdecl, newdata)
    else:
        # No declaration at the start of the document: put one on its own line.
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
3564
3565	def _stripDoctype(data):
3566	'''Strips DOCTYPE from XML document, returns (rss_version, stripped_data)

Callers 1

parseFunction · 0.85

Calls 5

_l2bytesFunction · 0.85

compileMethod · 0.80

writeMethod · 0.45

searchMethod · 0.45

encodeMethod · 0.45

Tested by

no test coverage detected