MCPcopy Index your code
hub / github.com/csev/py4e / _detectEncoding

Method _detectEncoding

code/BeautifulSoup.py:1862–1927  ·  view source on GitHub ↗

Given a document, tries to detect its XML encoding.

(self, xml_data, isHTML=False)

Source from the content-addressed store, hash-verified

1860 return newdata
1861
1862 def _detectEncoding(self, xml_data, isHTML=False):
1863 """Given a document, tries to detect its XML encoding."""
1864 xml_encoding = sniffed_xml_encoding = None
1865 try:
1866 if xml_data[:4] == '\x4c\x6f\xa7\x94':
1867 # EBCDIC
1868 xml_data = self._ebcdic_to_ascii(xml_data)
1869 elif xml_data[:4] == '\x00\x3c\x00\x3f':
1870 # UTF-16BE
1871 sniffed_xml_encoding = 'utf-16be'
1872 xml_data = unicode(xml_data, 'utf-16be').encode('utf-8')
1873 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \
1874 and (xml_data[2:4] != '\x00\x00'):
1875 # UTF-16BE with BOM
1876 sniffed_xml_encoding = 'utf-16be'
1877 xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8')
1878 elif xml_data[:4] == '\x3c\x00\x3f\x00':
1879 # UTF-16LE
1880 sniffed_xml_encoding = 'utf-16le'
1881 xml_data = unicode(xml_data, 'utf-16le').encode('utf-8')
1882 elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \
1883 (xml_data[2:4] != '\x00\x00'):
1884 # UTF-16LE with BOM
1885 sniffed_xml_encoding = 'utf-16le'
1886 xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8')
1887 elif xml_data[:4] == '\x00\x00\x00\x3c':
1888 # UTF-32BE
1889 sniffed_xml_encoding = 'utf-32be'
1890 xml_data = unicode(xml_data, 'utf-32be').encode('utf-8')
1891 elif xml_data[:4] == '\x3c\x00\x00\x00':
1892 # UTF-32LE
1893 sniffed_xml_encoding = 'utf-32le'
1894 xml_data = unicode(xml_data, 'utf-32le').encode('utf-8')
1895 elif xml_data[:4] == '\x00\x00\xfe\xff':
1896 # UTF-32BE with BOM
1897 sniffed_xml_encoding = 'utf-32be'
1898 xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8')
1899 elif xml_data[:4] == '\xff\xfe\x00\x00':
1900 # UTF-32LE with BOM
1901 sniffed_xml_encoding = 'utf-32le'
1902 xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8')
1903 elif xml_data[:3] == '\xef\xbb\xbf':
1904 # UTF-8 with BOM
1905 sniffed_xml_encoding = 'utf-8'
1906 xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8')
1907 else:
1908 sniffed_xml_encoding = 'ascii'
1909 pass
1910 except:
1911 xml_encoding_match = None
1912 xml_encoding_match = re.compile(
1913 '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data)
1914 if not xml_encoding_match and isHTML:
1915 regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I)
1916 xml_encoding_match = regexp.search(xml_data)
1917 if xml_encoding_match is not None:
1918 xml_encoding = xml_encoding_match.groups()[0].lower()
1919 if isHTML:

Callers 1

__init__ · Method · 0.95

Calls 3

_ebcdic_to_ascii · Method · 0.95
encode · Method · 0.45
search · Method · 0.45

Tested by

no test coverage detected