Given a document, tries to detect its XML encoding.
(self, xml_data, isHTML=False)
| 1860 | return newdata |
| 1861 | |
| 1862 | def _detectEncoding(self, xml_data, isHTML=False): |
| 1863 | """Given a document, tries to detect its XML encoding.""" |
| 1864 | xml_encoding = sniffed_xml_encoding = None |
| 1865 | try: |
| 1866 | if xml_data[:4] == '\x4c\x6f\xa7\x94': |
| 1867 | # EBCDIC |
| 1868 | xml_data = self._ebcdic_to_ascii(xml_data) |
| 1869 | elif xml_data[:4] == '\x00\x3c\x00\x3f': |
| 1870 | # UTF-16BE |
| 1871 | sniffed_xml_encoding = 'utf-16be' |
| 1872 | xml_data = unicode(xml_data, 'utf-16be').encode('utf-8') |
| 1873 | elif (len(xml_data) >= 4) and (xml_data[:2] == '\xfe\xff') \ |
| 1874 | and (xml_data[2:4] != '\x00\x00'): |
| 1875 | # UTF-16BE with BOM |
| 1876 | sniffed_xml_encoding = 'utf-16be' |
| 1877 | xml_data = unicode(xml_data[2:], 'utf-16be').encode('utf-8') |
| 1878 | elif xml_data[:4] == '\x3c\x00\x3f\x00': |
| 1879 | # UTF-16LE |
| 1880 | sniffed_xml_encoding = 'utf-16le' |
| 1881 | xml_data = unicode(xml_data, 'utf-16le').encode('utf-8') |
| 1882 | elif (len(xml_data) >= 4) and (xml_data[:2] == '\xff\xfe') and \ |
| 1883 | (xml_data[2:4] != '\x00\x00'): |
| 1884 | # UTF-16LE with BOM |
| 1885 | sniffed_xml_encoding = 'utf-16le' |
| 1886 | xml_data = unicode(xml_data[2:], 'utf-16le').encode('utf-8') |
| 1887 | elif xml_data[:4] == '\x00\x00\x00\x3c': |
| 1888 | # UTF-32BE |
| 1889 | sniffed_xml_encoding = 'utf-32be' |
| 1890 | xml_data = unicode(xml_data, 'utf-32be').encode('utf-8') |
| 1891 | elif xml_data[:4] == '\x3c\x00\x00\x00': |
| 1892 | # UTF-32LE |
| 1893 | sniffed_xml_encoding = 'utf-32le' |
| 1894 | xml_data = unicode(xml_data, 'utf-32le').encode('utf-8') |
| 1895 | elif xml_data[:4] == '\x00\x00\xfe\xff': |
| 1896 | # UTF-32BE with BOM |
| 1897 | sniffed_xml_encoding = 'utf-32be' |
| 1898 | xml_data = unicode(xml_data[4:], 'utf-32be').encode('utf-8') |
| 1899 | elif xml_data[:4] == '\xff\xfe\x00\x00': |
| 1900 | # UTF-32LE with BOM |
| 1901 | sniffed_xml_encoding = 'utf-32le' |
| 1902 | xml_data = unicode(xml_data[4:], 'utf-32le').encode('utf-8') |
| 1903 | elif xml_data[:3] == '\xef\xbb\xbf': |
| 1904 | # UTF-8 with BOM |
| 1905 | sniffed_xml_encoding = 'utf-8' |
| 1906 | xml_data = unicode(xml_data[3:], 'utf-8').encode('utf-8') |
| 1907 | else: |
| 1908 | sniffed_xml_encoding = 'ascii' |
| 1909 | pass |
| 1910 | except: |
| 1911 | xml_encoding_match = None |
| 1912 | xml_encoding_match = re.compile( |
| 1913 | '^<\?.*encoding=[\'"](.*?)[\'"].*\?>').match(xml_data) |
| 1914 | if not xml_encoding_match and isHTML: |
| 1915 | regexp = re.compile('<\s*meta[^>]+charset=([^>]*?)[;\'">]', re.I) |
| 1916 | xml_encoding_match = regexp.search(xml_data) |
| 1917 | if xml_encoding_match is not None: |
| 1918 | xml_encoding = xml_encoding_match.groups()[0].lower() |
| 1919 | if isHTML: |
no test coverage detected