Checks the encoding name, repairs common misspellings, and adjusts it to the proper naming used in the codecs module. Examples: `checkCharEncoding('iso-8858', False)` returns `'iso8859-1'`; `checkCharEncoding('en_us', False)` returns `'utf8'`.
(encoding, warn=True)
| 157 | |
| 158 | @cachedmethod |
| 159 | def checkCharEncoding(encoding, warn=True): |
| 160 | """ |
| 161 | Checks encoding name, repairs common misspellings and adjusts to |
| 162 | proper namings used in codecs module |
| 163 | |
| 164 | >>> checkCharEncoding('iso-8858', False) |
| 165 | 'iso8859-1' |
| 166 | >>> checkCharEncoding('en_us', False) |
| 167 | 'utf8' |
| 168 | """ |
| 169 | |
| 170 | if isinstance(encoding, six.binary_type): |
| 171 | encoding = getUnicode(encoding) |
| 172 | |
| 173 | if isListLike(encoding): |
| 174 | encoding = unArrayizeValue(encoding) |
| 175 | |
| 176 | if encoding: |
| 177 | encoding = encoding.lower() |
| 178 | else: |
| 179 | return encoding |
| 180 | |
| 181 | # Reference: http://www.destructor.de/charsets/index.htm |
| 182 | translate = {"windows-874": "iso-8859-11", "utf-8859-1": "utf8", "en_us": "utf8", "macintosh": "iso-8859-1", "euc_tw": "big5_tw", "th": "tis-620", "unicode": "utf8", "utc8": "utf8", "ebcdic": "ebcdic-cp-be", "iso-8859": "iso8859-1", "iso-8859-0": "iso8859-1", "ansi": "ascii", "gbk2312": "gbk", "windows-31j": "cp932", "en": "us"} |
| 183 | |
| 184 | for delimiter in (';', ',', '('): |
| 185 | if delimiter in encoding: |
| 186 | encoding = encoding[:encoding.find(delimiter)].strip() |
| 187 | |
| 188 | encoding = encoding.replace("&quot;", "") |
| 189 | |
| 190 | # popular typos/errors |
| 191 | if "8858" in encoding: |
| 192 | encoding = encoding.replace("8858", "8859") # iso-8858 -> iso-8859 |
| 193 | elif "8559" in encoding: |
| 194 | encoding = encoding.replace("8559", "8859") # iso-8559 -> iso-8859 |
| 195 | elif "8895" in encoding: |
| 196 | encoding = encoding.replace("8895", "8859") # iso-8895 -> iso-8859 |
| 197 | elif "5889" in encoding: |
| 198 | encoding = encoding.replace("5889", "8859") # iso-5889 -> iso-8859 |
| 199 | elif "5589" in encoding: |
| 200 | encoding = encoding.replace("5589", "8859") # iso-5589 -> iso-8859 |
| 201 | elif "2313" in encoding: |
| 202 | encoding = encoding.replace("2313", "2312") # gb2313 -> gb2312 |
| 203 | elif encoding.startswith("x-"): |
| 204 | encoding = encoding[len("x-"):] # x-euc-kr -> euc-kr / x-mac-turkish -> mac-turkish |
| 205 | elif "windows-cp" in encoding: |
| 206 | encoding = encoding.replace("windows-cp", "windows") # windows-cp-1254 -> windows-1254 |
| 207 | |
| 208 | # name adjustment for compatibility |
| 209 | if encoding.startswith("8859"): |
| 210 | encoding = "iso-%s" % encoding |
| 211 | elif encoding.startswith("cp-"): |
| 212 | encoding = "cp%s" % encoding[3:] |
| 213 | elif encoding.startswith("euc-"): |
| 214 | encoding = "euc_%s" % encoding[4:] |
| 215 | elif encoding.startswith("windows") and not encoding.startswith("windows-"): |
| 216 | encoding = "windows-%s" % encoding[7:] |
no test coverage detected
searching dependent graphs…