Take a codec name, and return a 'sloppy' version of that codec that can encode and decode the unassigned bytes in that encoding. Single-byte encodings in the standard library are defined using some boilerplate classes surrounding the functions that do the actual work, `codecs.charmap_decode` and `charmap_encode`. This function, given an encoding name, *defines* those boilerplate classes.
(encoding: str)
| 81 | |
| 82 | |
| 83 | def make_sloppy_codec(encoding: str) -> codecs.CodecInfo: |
| 84 | """ |
| 85 | Take a codec name, and return a 'sloppy' version of that codec that can |
| 86 | encode and decode the unassigned bytes in that encoding. |
| 87 | |
| 88 | Single-byte encodings in the standard library are defined using some |
| 89 | boilerplate classes surrounding the functions that do the actual work, |
| 90 | `codecs.charmap_decode` and `charmap_encode`. This function, given an |
| 91 | encoding name, *defines* those boilerplate classes. |
| 92 | """ |
| 93 | # Make a bytestring of all 256 possible bytes. |
| 94 | all_bytes = bytes(range(256)) |
| 95 | |
| 96 | # Get a list of what they would decode to in Latin-1. |
| 97 | sloppy_chars = list(all_bytes.decode("latin-1")) |
| 98 | |
| 99 | # Get a list of what they decode to in the given encoding. Use the |
| 100 | # replacement character for unassigned bytes. |
| 101 | decoded_chars = all_bytes.decode(encoding, errors="replace") |
| 102 | |
| 103 | # Update the sloppy_chars list. Each byte that was successfully decoded |
| 104 | # gets its decoded value in the list. The unassigned bytes are left as |
| 105 | # they are, which gives their decoding in Latin-1. |
| 106 | for i, char in enumerate(decoded_chars): |
| 107 | if char != REPLACEMENT_CHAR: |
| 108 | sloppy_chars[i] = char |
| 109 | |
| 110 | # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute" |
| 111 | # control code, to encode the Unicode replacement character U+FFFD. |
| 112 | sloppy_chars[0x1A] = REPLACEMENT_CHAR |
| 113 | |
| 114 | # Create the data structures that tell the charmap methods how to encode |
| 115 | # and decode in this sloppy encoding. |
| 116 | decoding_table = "".join(sloppy_chars) |
| 117 | encoding_table = codecs.charmap_build(decoding_table) |
| 118 | |
| 119 | # Now produce all the class boilerplate. Look at the Python source for |
| 120 | # `encodings.cp1252` for comparison; this is almost exactly the same, |
| 121 | # except I made it follow pep8. |
| 122 | class Codec(codecs.Codec): |
| 123 | def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]: |
| 124 | return codecs.charmap_encode(input, errors, encoding_table) |
| 125 | |
| 126 | def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]: |
| 127 | return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type] |
| 128 | |
| 129 | class IncrementalEncoder(codecs.IncrementalEncoder): |
| 130 | def encode(self, input: str, final: bool = False) -> bytes: |
| 131 | return codecs.charmap_encode(input, self.errors, encoding_table)[0] |
| 132 | |
| 133 | class IncrementalDecoder(codecs.IncrementalDecoder): |
| 134 | def decode(self, input: bytes, final: bool = False) -> str: # type: ignore[override] |
| 135 | return codecs.charmap_decode(input, self.errors, decoding_table)[0] # type: ignore[arg-type] |
| 136 | |
| 137 | class StreamWriter(Codec, codecs.StreamWriter): |
| 138 | pass |
| 139 | |
| 140 | class StreamReader(Codec, codecs.StreamReader): |