hub / github.com/csev/py4e / _detectEncoding

Method _detectEncoding

code/BeautifulSoup.py:1862–1927 · view source on GitHub ↗

Given a document, tries to detect its XML encoding.

(self, xml_data, isHTML=False)

Source from the content-addressed store, hash-verified

1860	return newdata
1861
1862	def _detectEncoding(self, xml_data, isHTML=False):
1863	"""Given a document, tries to detect its XML encoding."""
1864	xml_encoding = sniffed_xml_encoding = None
1865	try:
1866	if xml_data[:4] == '\x4c\x6f\xa7\x94':
1867	# EBCDIC
1868	xml_data = self._ebcdic_to_ascii(xml_data)
1869	elif xml_data[:4] == '\x00\x3c\x00\x3f':
1870	# UTF-16BE
1871	sniffed_xml_encoding = 'utf-16be'
1872	xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1873	elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1874	and (xml_data[2:4] != '\x00\x00'):
1875	# UTF-16BE with BOM
1876	sniffed_xml_encoding = 'utf-16be'
1877	xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1878	elif xml_data[:4] == '\x3c\x00\x3f\x00':
1879	# UTF-16LE
1880	sniffed_xml_encoding = 'utf-16le'
1881	xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1882	elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1883	(xml_data[2:4] != '\x00\x00'):
1884	# UTF-16LE with BOM
1885	sniffed_xml_encoding = 'utf-16le'
1886	xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1887	elif xml_data[:4] == '\x00\x00\x00\x3c':
1888	# UTF-32BE
1889	sniffed_xml_encoding = 'utf-32be'
1890	xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1891	elif xml_data[:4] == '\x3c\x00\x00\x00':
1892	# UTF-32LE
1893	sniffed_xml_encoding = 'utf-32le'
1894	xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1895	elif xml_data[:4] == '\x00\x00\xfe\xff':
1896	# UTF-32BE with BOM
1897	sniffed_xml_encoding = 'utf-32be'
1898	xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1899	elif xml_data[:4] == '\xff\xfe\x00\x00':
1900	# UTF-32LE with BOM
1901	sniffed_xml_encoding = 'utf-32le'
1902	xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1903	elif xml_data[:3] == '\xef\xbb\xbf':
1904	# UTF-8 with BOM
1905	sniffed_xml_encoding = 'utf-8'
1906	xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1907	else:
1908	sniffed_xml_encoding = 'ascii'
1909	pass
1910	except:
1911	xml_encoding_match = None
1912	xml_encoding_match = re.compile(
1913	'^<\?.encoding=[\'"](.?)[\'"].*\?>').match(xml_data)
1914	if not xml_encoding_match and isHTML:
1915	regexp = re.compile('<\smeta[^>]+charset=([^>]?)[;\'">]', re.I)
1916	xml_encoding_match = regexp.search(xml_data)
1917	if xml_encoding_match is not None:
1918	xml_encoding = xml_encoding_match.groups()[0].lower()
1919	if isHTML:

Callers 1

__init__ · Method · 0.95

Calls 3

_ebcdic_to_ascii · Method · 0.95

encode · Method · 0.45

search · Method · 0.45

Tested by

no test coverage detected