MCPcopy
hub / github.com/rspeer/python-ftfy / make_sloppy_codec

Function make_sloppy_codec

ftfy/bad_codecs/sloppy.py:83–151  ·  view source on GitHub ↗

Take a codec name, and return a 'sloppy' version of that codec that can encode and decode the unassigned bytes in that encoding. Single-byte encodings in the standard library are defined using some boilerplate classes surrounding the functions that do the actual work, `codecs.charmap_decode` and `charmap_encode`. This function, given an encoding name, *defines* those boilerplate classes.

(encoding: str)

Source from the content-addressed store, hash-verified

81
82
83 def make_sloppy_codec(encoding: str) -> codecs.CodecInfo:
84 """
85 Take a codec name, and return a 'sloppy' version of that codec that can
86 encode and decode the unassigned bytes in that encoding.
87
88 Single-byte encodings in the standard library are defined using some
89 boilerplate classes surrounding the functions that do the actual work,
90 `codecs.charmap_decode` and `charmap_encode`. This function, given an
91 encoding name, *defines* those boilerplate classes.
92 """
93 # Make a bytestring of all 256 possible bytes.
94 all_bytes = bytes(range(256))
95
96 # Get a list of what they would decode to in Latin-1.
97 sloppy_chars = list(all_bytes.decode("latin-1"))
98
99 # Get a list of what they decode to in the given encoding. Use the
100 # replacement character for unassigned bytes.
101 decoded_chars = all_bytes.decode(encoding, errors="replace")
102
103 # Update the sloppy_chars list. Each byte that was successfully decoded
104 # gets its decoded value in the list. The unassigned bytes are left as
105 # they are, which gives their decoding in Latin-1.
106 for i, char in enumerate(decoded_chars):
107 if char != REPLACEMENT_CHAR:
108 sloppy_chars[i] = char
109
110 # For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
111 # control code, to encode the Unicode replacement character U+FFFD.
112 sloppy_chars[0x1A] = REPLACEMENT_CHAR
113
114 # Create the data structures that tell the charmap methods how to encode
115 # and decode in this sloppy encoding.
116 decoding_table = "".join(sloppy_chars)
117 encoding_table = codecs.charmap_build(decoding_table)
118
119 # Now produce all the class boilerplate. Look at the Python source for
120 # `encodings.cp1252` for comparison; this is almost exactly the same,
121 # except I made it follow pep8.
122 class Codec(codecs.Codec):
123 def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]:
124 return codecs.charmap_encode(input, errors, encoding_table)
125
126 def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]:
127 return codecs.charmap_decode(input, errors, decoding_table) # type: ignore[arg-type]
128
129 class IncrementalEncoder(codecs.IncrementalEncoder):
130 def encode(self, input: str, final: bool = False) -> bytes:
131 return codecs.charmap_encode(input, self.errors, encoding_table)[0]
132
133 class IncrementalDecoder(codecs.IncrementalDecoder):
134 def decode(self, input: bytes, final: bool = False) -> str: # type: ignore[override]
135 return codecs.charmap_decode(input, self.errors, decoding_table)[0] # type: ignore[arg-type]
136
137 class StreamWriter(Codec, codecs.StreamWriter):
138 pass
139
140 class StreamReader(Codec, codecs.StreamReader):

Callers 1

sloppy.py · File · 0.85

Calls 2

Codec · Class · 0.85
decode · Method · 0.45

Tested by

no test coverage detected