Changes an XML data stream on the fly to specify a new encoding. `data` is a raw sequence of bytes (not Unicode) that is presumed to be in %encoding already. `encoding` is a string recognized by encodings.aliases.
(data, encoding)
| 3510 | return true_encoding, http_encoding, xml_encoding, sniffed_xml_encoding, acceptable_content_type |
| 3511 | |
def _toUTF8(data, encoding):
    '''Re-encodes an XML data stream as UTF-8 and rewrites its XML declaration

    data is a raw sequence of bytes (not Unicode) that is presumed to be in
    %encoding already
    encoding is a string recognized by encodings.aliases

    If data starts with a UTF-16, UTF-8 or UTF-32 byte order mark, the BOM is
    stripped and the encoding it implies is used instead of %encoding.

    Returns the document as UTF-8 encoded bytes.  A leading XML declaration is
    replaced by one that names utf-8; if the document does not start with a
    declaration, one is prepended on its own line.

    Any exception raised while decoding (for example an unknown encoding name,
    or bytes that are not valid in the chosen encoding) propagates to the
    caller.
    '''
    if _debug:
        sys.stderr.write('entering _toUTF8, trying encoding %s\n' % encoding)

    # strip Byte Order Mark (if present)
    # Each entry is (BOM bytes, encoding implied by the BOM, is a UTF-16 BOM).
    # Entries are tried top to bottom and the first one that applies wins.
    nul_pair = _l2bytes([0x00, 0x00])
    known_boms = (
        (_l2bytes([0xfe, 0xff]), 'utf-16be', True),
        (_l2bytes([0xff, 0xfe]), 'utf-16le', True),
        (_l2bytes([0xef, 0xbb, 0xbf]), 'utf-8', False),
        (_l2bytes([0x00, 0x00, 0xfe, 0xff]), 'utf-32be', False),
        (_l2bytes([0xff, 0xfe, 0x00, 0x00]), 'utf-32le', False),
    )
    for bom, bom_encoding, is_utf16 in known_boms:
        if data[:len(bom)] != bom:
            continue
        # A UTF-16 BOM is only honoured when at least four bytes are present
        # and the two bytes following it are not both NUL; this keeps the
        # UTF-32LE BOM (ff fe 00 00) from being mistaken for UTF-16LE.
        if is_utf16 and (len(data) < 4 or data[2:4] == nul_pair):
            continue
        if _debug:
            sys.stderr.write('stripping BOM\n')
            if encoding != bom_encoding:
                sys.stderr.write('trying %s instead\n' % bom_encoding)
        # The BOM overrides whatever encoding the caller asked for.
        encoding = bom_encoding
        data = data[len(bom):]
        break

    newdata = unicode(data, encoding)
    if _debug:
        sys.stderr.write('successfully converted %s data to unicode\n' % encoding)

    # Raw string so that "\?" reaches the regex engine as an escaped question
    # mark instead of being an invalid string escape sequence.
    declmatch = re.compile(r'^<\?xml[^>]*?>')
    newdecl = '''<?xml version='1.0' encoding='utf-8'?>'''
    if declmatch.search(newdata):
        # The pattern is anchored at the start, so only a leading declaration
        # is ever replaced.
        newdata = declmatch.sub(newdecl, newdata)
    else:
        newdata = newdecl + u'\n' + newdata
    return newdata.encode('utf-8')
| 3564 | |
| 3565 | def _stripDoctype(data): |
| 3566 | '''Strips DOCTYPE from XML document, returns (rss_version, stripped_data) |