ENCODING_REGEXES contains reasonably fast ways to detect whether a given string can be represented in a given encoding. The simplest one is the 'ascii' detector, which just determines whether all characters are between U+0000 and U+007F.
()
| 30 | |
| 31 | |
def _build_regexes() -> dict[str, re.Pattern[str]]:
    """
    Build the mapping from encoding name to a compiled detector pattern.

    Each pattern matches a string only when every character in it is one
    that the corresponding encoding can represent, which gives a reasonably
    fast representability check. The "ascii" entry is the simplest case: it
    accepts only characters from U+0000 through U+007F.

    Returns a dict containing the "ascii" pattern plus one pattern for each
    name in CHARMAP_ENCODINGS.
    """
    # Bytes \x80 through \xFF, followed by byte \x1A. The sloppy-* encodings
    # use \x1A to represent the replacement character �, so that byte is
    # decoded per encoding instead of being treated as plain ASCII. The
    # sequence does not depend on the encoding, so it is built only once.
    high_bytes = bytes([*range(0x80, 0x100), 0x1A])

    # The trivial detector: ASCII text only.
    detectors = {"ascii": re.compile("^[\x00-\x7f]*$")}

    for codec_name in CHARMAP_ENCODINGS:
        decoded_chars = high_bytes.decode(codec_name)

        # Every remaining ASCII byte (\x00-\x19 and \x1B-\x7F) decodes to the
        # same ASCII character in any encoding we support, so those are
        # written as plain ranges. All regex special characters fall inside
        # the \x1B-\x7F range, so nothing here needs escaping.
        pattern_text = f"^[\x00-\x19\x1b-\x7f{decoded_chars}]*$"
        detectors[codec_name] = re.compile(pattern_text)

    return detectors
| 57 | |
| 58 | |
# Detector patterns keyed by encoding name ("ascii" plus each charmap
# encoding), built once when this module is loaded.
ENCODING_REGEXES = _build_regexes()