## Unreleased: mitmproxy next
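+- Fix various issues in `infer_content_encoding`: a byte-order mark now takes priority over the content-type charset, the encoding from an XML declaration is detected, and CSS without an `@charset` rule falls back to UTF-8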
+ ([#7928](https://github.com/mitmproxy/mitmproxy/pull/7928), @xu-cheng)
## 15 October 2025: mitmproxy 12.2.0
"""
Infer the encoding of content from the content-type header.
"""
- # Use the charset from the header if possible
- parsed_content_type = parse_content_type(content_type)
- enc = parsed_content_type[2].get("charset") if parsed_content_type else None
+ enc = None
+
+ # BOM has the highest priority
+ if content.startswith(b"\x00\x00\xfe\xff"):
+ enc = "utf-32be"
+ elif content.startswith(b"\xff\xfe\x00\x00"):
+ enc = "utf-32le"
+ elif content.startswith(b"\xfe\xff"):
+ enc = "utf-16be"
+ elif content.startswith(b"\xff\xfe"):
+ enc = "utf-16le"
+ elif content.startswith(b"\xef\xbb\xbf"):
+ # 'utf-8-sig' will strip the BOM on decode
+ enc = "utf-8-sig"
+ elif parsed_content_type := parse_content_type(content_type):
+ # Use the charset from the header if possible
+ enc = parsed_content_type[2].get("charset")
# Otherwise, infer the encoding
if not enc and "json" in content_type:
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
+ if not enc and "xml" in content_type:
+ if xml_encoding := re.search(
+ rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
+ ):
+ enc = xml_encoding.group(1).decode("ascii", "ignore")
+
if not enc and "text/css" in content_type:
# @charset rule must be the very first thing.
css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
if css_charset:
enc = css_charset.group(1).decode("ascii", "ignore")
+ else:
+ # Fallback to utf8 for css
+ # Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+ enc = "utf8"
# Fallback to latin-1
if not enc:
("", b"foo", "latin-1"),
("", b"\xfc", "latin-1"),
("", b"\xf0\xe2", "latin-1"),
+ # bom
+ ("", b"\xef\xbb\xbffoo", "utf-8-sig"),
+ ("", b"\xff\xfef\x00o\x00o\x00", "utf-16le"),
+ ("", b"\xfe\xff\x00f\x00o\x00o", "utf-16be"),
+ ("", b"\xff\xfe\x00\x00f\x00\x00\x00o\x00\x00\x00o\x00\x00\x00", "utf-32le"),
+ ("", b"\x00\x00\xfe\xff\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o", "utf-32be"),
+ # content-type charset
("text/html; charset=latin1", b"\xc3\xbc", "latin1"),
("text/html; charset=utf8", b"\xc3\xbc", "utf8"),
# json
("application/json", b'"\xc3\xbc"', "utf8"),
- # meta charset
+ # html meta charset
+ (
+ "text/html",
+ b'<meta charset="gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
+ "gb18030",
+ ),
(
"text/html",
b'<meta http-equiv="content-type" '
b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
"gb18030",
),
+ # xml declaration encoding
+ (
+ "application/xml",
+ b'<?xml version="1.0" encoding="gb2312"?>'
+ b"<root>\xe6\x98\x8e\xe4\xbc\xaf</root>",
+ "gb18030",
+ ),
# css charset
+ (
+ "text/css",
+ b'\xef\xbb\xbf@charset "UTF-8";.\xe5\xb9\xb3\xe5\x92\x8c,#div2 {color: green;}',
+ "utf-8-sig",
+ ),
(
"text/css",
b'@charset "gb2312";#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}',
"gb18030",
),
+ (
+ "text/css",
+ b"h1 {}",
+ "utf8",
+ ),
],
)
def test_infer_content_encoding(content_type, content, expected):