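Infer the encoding of content from its byte-order mark, the charset in the content-type header, or, failing those, an encoding declaration inside the content itself.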
(content_type: str, content: bytes = b"")
| 37 | |
| 38 | |
| 39 | def infer_content_encoding(content_type: str, content: bytes = b"") -> str: |
| 40 | """ |
| 41 | Infer the encoding of content from the content-type header. |
| 42 | """ |
| 43 | enc = None |
| 44 | |
| 45 | # BOM has the highest priority |
| 46 | if content.startswith(b"\x00\x00\xfe\xff"): |
| 47 | enc = "utf-32be" |
| 48 | elif content.startswith(b"\xff\xfe\x00\x00"): |
| 49 | enc = "utf-32le" |
| 50 | elif content.startswith(b"\xfe\xff"): |
| 51 | enc = "utf-16be" |
| 52 | elif content.startswith(b"\xff\xfe"): |
| 53 | enc = "utf-16le" |
| 54 | elif content.startswith(b"\xef\xbb\xbf"): |
| 55 | # 'utf-8-sig' will strip the BOM on decode |
| 56 | enc = "utf-8-sig" |
| 57 | elif parsed_content_type := parse_content_type(content_type): |
| 58 | # Use the charset from the header if possible |
| 59 | enc = parsed_content_type[2].get("charset") |
| 60 | |
| 61 | # Otherwise, infer the encoding |
| 62 | if not enc and "json" in content_type: |
| 63 | enc = "utf8" |
| 64 | |
| 65 | if not enc and "html" in content_type: |
| 66 | meta_charset = re.search( |
| 67 | rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE |
| 68 | ) |
| 69 | if meta_charset: |
| 70 | enc = meta_charset.group(1).decode("ascii", "ignore") |
| 71 | else: |
| 72 | # Fallback to utf8 for html |
| 73 | # Ref: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding |
| 74 | # > 9. [snip] the comprehensive UTF-8 encoding is suggested. |
| 75 | enc = "utf8" |
| 76 | |
| 77 | if not enc and "xml" in content_type: |
| 78 | if xml_encoding := re.search( |
| 79 | rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE |
| 80 | ): |
| 81 | enc = xml_encoding.group(1).decode("ascii", "ignore") |
| 82 | else: |
| 83 | # Fallback to utf8 for xml |
| 84 | # Ref: https://datatracker.ietf.org/doc/html/rfc7303#section-8.5 |
| 85 | # > the XML processor [snip] to determine an encoding of UTF-8. |
| 86 | enc = "utf8" |
| 87 | |
| 88 | if not enc and ("javascript" in content_type or "ecmascript" in content_type): |
| 89 | # Fallback to utf8 for javascript |
| 90 | # Ref: https://datatracker.ietf.org/doc/html/rfc9239#section-4.2 |
| 91 | # > 3. Else, the character encoding scheme is assumed to be UTF-8 |
| 92 | enc = "utf8" |
| 93 | |
| 94 | if not enc and "text/css" in content_type: |
| 95 | # @charset rule must be the very first thing. |
| 96 | css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE) |
searching dependent graphs…