]> git.feebdaed.xyz Git - 0xmirror/mitmproxy.git/commitdiff
Fix various issues in infer_content_encoding (#7928)
author Cheng Xu <3105373+xu-cheng@users.noreply.github.com>
Mon, 27 Oct 2025 20:55:36 +0000 (13:55 -0700)
committer GitHub <noreply@github.com>
Mon, 27 Oct 2025 20:55:36 +0000 (20:55 +0000)
* Fix various issues in infer_content_encoding

- Per spec, the byte order mark (BOM) always takes the highest priority when
  determining UTF-related encodings. Ref:
    - https://www.w3.org/International/questions/qa-html-encoding-declarations#bom
    - https://html.spec.whatwg.org/multipage/semantics.html#charset
    - https://drafts.csswg.org/css-syntax/#css-decode-bytes
- For XML traffic such as `application/xml`, `image/svg+xml`, etc., an XML
  declaration may contain the encoding information. Ref:
    - https://www.w3.org/TR/xml/#charencoding
    - https://html.spec.whatwg.org/multipage/semantics.html#charset
- For CSS, the default fallback encoding should be UTF-8. Ref:
    - https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding

* nits

---------

Co-authored-by: Maximilian Hils <git@maximilianhils.com>
CHANGELOG.md
mitmproxy/net/http/headers.py
test/mitmproxy/net/http/test_headers.py

index 68236d74e3bc8f737cdb4d4bdd24ad4f544fab85..29a2483e45272aee4cb78e277ce18af810fa99b9 100644 (file)
@@ -7,6 +7,8 @@
 
 ## Unreleased: mitmproxy next
 
+- Fix various issues in infer_content_encoding
+  ([#7928](https://github.com/mitmproxy/mitmproxy/pull/7928), @xu-cheng)
 
 ## 15 October 2025: mitmproxy 12.2.0
 
index 7e14b2a77c0215292a86730e37aa1618c95e1bff..4521c9c5e29369133bff975c1af4e027b6b0a022 100644 (file)
@@ -40,9 +40,23 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
     """
     Infer the encoding of content from the content-type header.
     """
-    # Use the charset from the header if possible
-    parsed_content_type = parse_content_type(content_type)
-    enc = parsed_content_type[2].get("charset") if parsed_content_type else None
+    enc = None
+
+    # BOM has the highest priority
+    if content.startswith(b"\x00\x00\xfe\xff"):
+        enc = "utf-32be"
+    elif content.startswith(b"\xff\xfe\x00\x00"):
+        enc = "utf-32le"
+    elif content.startswith(b"\xfe\xff"):
+        enc = "utf-16be"
+    elif content.startswith(b"\xff\xfe"):
+        enc = "utf-16le"
+    elif content.startswith(b"\xef\xbb\xbf"):
+        # 'utf-8-sig' will strip the BOM on decode
+        enc = "utf-8-sig"
+    elif parsed_content_type := parse_content_type(content_type):
+        # Use the charset from the header if possible
+        enc = parsed_content_type[2].get("charset")
 
     # Otherwise, infer the encoding
     if not enc and "json" in content_type:
@@ -55,11 +69,21 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
         if meta_charset:
             enc = meta_charset.group(1).decode("ascii", "ignore")
 
+    if not enc and "xml" in content_type:
+        if xml_encoding := re.search(
+            rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
+        ):
+            enc = xml_encoding.group(1).decode("ascii", "ignore")
+
     if not enc and "text/css" in content_type:
         # @charset rule must be the very first thing.
         css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
         if css_charset:
             enc = css_charset.group(1).decode("ascii", "ignore")
+        else:
+            # Fallback to utf8 for css
+            # Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+            enc = "utf8"
 
     # Fallback to latin-1
     if not enc:
index cd8d9e98c610bcf033de00c962c4c66dc090294d..5d7687f5d3aa1201622cbee3d001061e2853ada5 100644 (file)
@@ -37,23 +37,52 @@ def test_assemble_content_type():
         ("", b"foo", "latin-1"),
         ("", b"\xfc", "latin-1"),
         ("", b"\xf0\xe2", "latin-1"),
+        # bom
+        ("", b"\xef\xbb\xbffoo", "utf-8-sig"),
+        ("", b"\xff\xfef\x00o\x00o\x00", "utf-16le"),
+        ("", b"\xfe\xff\x00f\x00o\x00o", "utf-16be"),
+        ("", b"\xff\xfe\x00\x00f\x00\x00\x00o\x00\x00\x00o\x00\x00\x00", "utf-32le"),
+        ("", b"\x00\x00\xfe\xff\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o", "utf-32be"),
+        # content-type charset
         ("text/html; charset=latin1", b"\xc3\xbc", "latin1"),
         ("text/html; charset=utf8", b"\xc3\xbc", "utf8"),
         # json
         ("application/json", b'"\xc3\xbc"', "utf8"),
-        # meta charset
+        # html meta charset
+        (
+            "text/html",
+            b'<meta charset="gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
+            "gb18030",
+        ),
         (
             "text/html",
             b'<meta http-equiv="content-type" '
             b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
             "gb18030",
         ),
+        # xml declaration encoding
+        (
+            "application/xml",
+            b'<?xml version="1.0" encoding="gb2312"?>'
+            b"<root>\xe6\x98\x8e\xe4\xbc\xaf</root>",
+            "gb18030",
+        ),
         # css charset
+        (
+            "text/css",
+            b'\xef\xbb\xbf@charset "UTF-8";.\xe5\xb9\xb3\xe5\x92\x8c,#div2 {color: green;}',
+            "utf-8-sig",
+        ),
         (
             "text/css",
             b'@charset "gb2312";#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}',
             "gb18030",
         ),
+        (
+            "text/css",
+            b"h1 {}",
+            "utf8",
+        ),
     ],
 )
 def test_infer_content_encoding(content_type, content, expected):