MCPcopy
hub / github.com/mitmproxy/mitmproxy / infer_content_encoding

Function infer_content_encoding

mitmproxy/net/http/headers.py:39–113  ·  view source on GitHub ↗

Infer the encoding of content from the content-type header.

(content_type: str, content: bytes = b"")

Source from the content-addressed store, hash-verified

37
38
39def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
40 """
41 Infer the encoding of content from the content-type header.
42 """
43 enc = None
44
45 # BOM has the highest priority
46 if content.startswith(b"\x00\x00\xfe\xff"):
47 enc = "utf-32be"
48 elif content.startswith(b"\xff\xfe\x00\x00"):
49 enc = "utf-32le"
50 elif content.startswith(b"\xfe\xff"):
51 enc = "utf-16be"
52 elif content.startswith(b"\xff\xfe"):
53 enc = "utf-16le"
54 elif content.startswith(b"\xef\xbb\xbf"):
55 # 'utf-8-sig' will strip the BOM on decode
56 enc = "utf-8-sig"
57 elif parsed_content_type := parse_content_type(content_type):
58 # Use the charset from the header if possible
59 enc = parsed_content_type[2].get("charset")
60
61 # Otherwise, infer the encoding
62 if not enc and "json" in content_type:
63 enc = "utf8"
64
65 if not enc and "html" in content_type:
66 meta_charset = re.search(
67 rb"""<meta[^>]+charset=['"]?([^'">]+)""", content, re.IGNORECASE
68 )
69 if meta_charset:
70 enc = meta_charset.group(1).decode("ascii", "ignore")
71 else:
72 # Fallback to utf8 for html
73 # Ref: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
74 # > 9. [snip] the comprehensive UTF-8 encoding is suggested.
75 enc = "utf8"
76
77 if not enc and "xml" in content_type:
78 if xml_encoding := re.search(
79 rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
80 ):
81 enc = xml_encoding.group(1).decode("ascii", "ignore")
82 else:
83 # Fallback to utf8 for xml
84 # Ref: https://datatracker.ietf.org/doc/html/rfc7303#section-8.5
85 # > the XML processor [snip] to determine an encoding of UTF-8.
86 enc = "utf8"
87
88 if not enc and ("javascript" in content_type or "ecmascript" in content_type):
89 # Fallback to utf8 for javascript
90 # Ref: https://datatracker.ietf.org/doc/html/rfc9239#section-4.2
91 # > 3. Else, the character encoding scheme is assumed to be UTF-8
92 enc = "utf8"
93
94 if not enc and "text/css" in content_type:
95 # @charset rule must be the very first thing.
96 css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)

Callers 4

set_text · Method · 0.90
get_text · Method · 0.90
request_to_flow · Function · 0.90

Calls 4

parse_content_type · Function · 0.85
get · Method · 0.45
decode · Method · 0.45
match · Method · 0.45

Tested by 1

Used in the wild real call sites across dependent graphs

searching dependent graphs…