| 9 | |
| 10 | |
| 11 | def _decode_utf8_with_mixes(data): |
| 12 | new_data = bytearray() |
| 13 | skip_count = 0 |
| 14 | for index, char in enumerate(data): |
| 15 | if skip_count > 0: |
| 16 | skip_count -= 1 |
| 17 | continue |
| 18 | |
| 19 | if char < 127: |
| 20 | new_data.append(char) |
| 21 | continue |
| 22 | |
| 23 | next_bytes_count = 0 |
| 24 | valid = None |
| 25 | if 192 <= char <= 223: |
| 26 | next_bytes_count = 1 |
| 27 | elif 224 <= char <= 239: |
| 28 | next_bytes_count = 2 |
| 29 | elif 240 <= char <= 247: |
| 30 | next_bytes_count = 3 |
| 31 | |
| 32 | if next_bytes_count: |
| 33 | valid = True |
| 34 | for offset in range(0, next_bytes_count): |
| 35 | if not (128 <= data[index + offset + 1] <= 192): |
| 36 | valid = False |
| 37 | break |
| 38 | |
| 39 | if valid: |
| 40 | selected_bytes = data[index:(index + next_bytes_count + 1)] |
| 41 | new_data.extend(selected_bytes) |
| 42 | skip_count = next_bytes_count |
| 43 | continue |
| 44 | |
| 45 | converted_bytes = data[index:index + 1].decode('iso-8859-1').encode('utf-8') |
| 46 | new_data.extend(converted_bytes) |
| 47 | |
| 48 | return new_data.decode('utf-8', errors='replace') |