hub / github.com/rspeer/python-ftfy / make_sloppy_codec

Function make_sloppy_codec

ftfy/bad_codecs/sloppy.py:83–151 · view source on GitHub ↗

Take a codec name, and return a 'sloppy' version of that codec that can encode and decode the unassigned bytes in that encoding. Single-byte encodings in the standard library are defined using some boilerplate classes surrounding the functions that do the actual work, `codecs.charmap_decode` and `charmap_encode`. This function, given an encoding name, defines those boilerplate classes.

(encoding: str)

Source from the content-addressed store, hash-verified

81
82
83	def make_sloppy_codec(encoding: str) -> codecs.CodecInfo:
84	"""
85	Take a codec name, and return a 'sloppy' version of that codec that can
86	encode and decode the unassigned bytes in that encoding.
87
88	Single-byte encodings in the standard library are defined using some
89	boilerplate classes surrounding the functions that do the actual work,
90	`codecs.charmap_decode` and `charmap_encode`. This function, given an
91	encoding name, defines those boilerplate classes.
92	"""
93	# Make a bytestring of all 256 possible bytes.
94	all_bytes = bytes(range(256))
95
96	# Get a list of what they would decode to in Latin-1.
97	sloppy_chars = list(all_bytes.decode("latin-1"))
98
99	# Get a list of what they decode to in the given encoding. Use the
100	# replacement character for unassigned bytes.
101	decoded_chars = all_bytes.decode(encoding, errors="replace")
102
103	# Update the sloppy_chars list. Each byte that was successfully decoded
104	# gets its decoded value in the list. The unassigned bytes are left as
105	# they are, which gives their decoding in Latin-1.
106	for i, char in enumerate(decoded_chars):
107	if char != REPLACEMENT_CHAR:
108	sloppy_chars[i] = char
109
110	# For ftfy's own purposes, we're going to allow byte 1A, the "Substitute"
111	# control code, to encode the Unicode replacement character U+FFFD.
112	sloppy_chars[0x1A] = REPLACEMENT_CHAR
113
114	# Create the data structures that tell the charmap methods how to encode
115	# and decode in this sloppy encoding.
116	decoding_table = "".join(sloppy_chars)
117	encoding_table = codecs.charmap_build(decoding_table)
118
119	# Now produce all the class boilerplate. Look at the Python source for
120	# `encodings.cp1252` for comparison; this is almost exactly the same,
121	# except I made it follow pep8.
class Codec(codecs.Codec):
    """Stateless codec that maps text to bytes and back via the sloppy charmap tables.

    Both methods read `encoding_table` / `decoding_table` from the enclosing
    scope, where they are built from the sloppy character list.
    """

    def encode(self, input: str, errors: str | None = "strict") -> tuple[bytes, int]:
        """Encode `input` with `encoding_table`.

        Returns the `(encoded bytes, length consumed)` pair produced by
        `codecs.charmap_encode`. `errors` is passed through unchanged.
        """
        return codecs.charmap_encode(input, errors, encoding_table)

    def decode(self, input: bytes, errors: str | None = "strict") -> tuple[str, int]:
        """Decode `input` with `decoding_table`.

        Returns the `(decoded text, length consumed)` pair produced by
        `codecs.charmap_decode`. `errors` is passed through unchanged.
        """
        # The decoding table is a plain str, which the type stubs do not accept.
        return codecs.charmap_decode(input, errors, decoding_table)  # type: ignore[arg-type]
128
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental encoder that pushes each chunk through `encoding_table`."""

    def encode(self, input: str, final: bool = False) -> bytes:
        """Encode one chunk of text and return only the resulting bytes."""
        # `final` is unused: every call encodes its input independently.
        encoded, _consumed = codecs.charmap_encode(input, self.errors, encoding_table)
        return encoded
132
class IncrementalDecoder(codecs.IncrementalDecoder):
    """Incremental decoder that pushes each chunk through `decoding_table`."""

    def decode(self, input: bytes, final: bool = False) -> str:  # type: ignore[override]
        """Decode one chunk of bytes and return only the resulting text."""
        # `final` is unused: every call decodes its input independently.
        decoded, _consumed = codecs.charmap_decode(
            input, self.errors, decoding_table  # type: ignore[arg-type]
        )
        return decoded
136
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer combining `Codec` with `codecs.StreamWriter`; adds no members of its own."""
139
140	class StreamReader(Codec, codecs.StreamReader):

Callers 1

sloppy.pyFile · 0.85

Calls 2

CodecClass · 0.85

decodeMethod · 0.45

Tested by

no test coverage detected