Fix various issues in infer_content_encoding (#7928)

author Cheng Xu <3105373+xu-cheng@users.noreply.github.com>

Mon, 27 Oct 2025 20:55:36 +0000 (13:55 -0700)

committer GitHub <noreply@github.com>

Mon, 27 Oct 2025 20:55:36 +0000 (20:55 +0000)
author Cheng Xu <3105373+xu-cheng@users.noreply.github.com>
Mon, 27 Oct 2025 20:55:36 +0000 (13:55 -0700)
committer GitHub <noreply@github.com>
Mon, 27 Oct 2025 20:55:36 +0000 (20:55 +0000)
diff --git a/CHANGELOG.md b/CHANGELOG.md

index 68236d74e3bc8f737cdb4d4bdd24ad4f544fab85..29a2483e45272aee4cb78e277ce18af810fa99b9 100644 (file)
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@
  
  ## Unreleased: mitmproxy next
  
+- Fix various issues in infer_content_encoding
+  ([#7928](https://github.com/mitmproxy/mitmproxy/pull/7928), @xu-cheng)
  
  ## 15 October 2025: mitmproxy 12.2.0
  
diff --git a/mitmproxy/net/http/headers.py b/mitmproxy/net/http/headers.py

index 7e14b2a77c0215292a86730e37aa1618c95e1bff..4521c9c5e29369133bff975c1af4e027b6b0a022 100644 (file)
--- a/mitmproxy/net/http/headers.py
+++ b/mitmproxy/net/http/headers.py
@@ -40,9 +40,23 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
      """
      Infer the encoding of content from the content-type header.
      """
-    # Use the charset from the header if possible
-    parsed_content_type = parse_content_type(content_type)
-    enc = parsed_content_type[2].get("charset") if parsed_content_type else None
+    enc = None
+
+    # BOM has the highest priority
+    if content.startswith(b"\x00\x00\xfe\xff"):
+        enc = "utf-32be"
+    elif content.startswith(b"\xff\xfe\x00\x00"):
+        enc = "utf-32le"
+    elif content.startswith(b"\xfe\xff"):
+        enc = "utf-16be"
+    elif content.startswith(b"\xff\xfe"):
+        enc = "utf-16le"
+    elif content.startswith(b"\xef\xbb\xbf"):
+        # 'utf-8-sig' will strip the BOM on decode
+        enc = "utf-8-sig"
+    elif parsed_content_type := parse_content_type(content_type):
+        # Use the charset from the header if possible
+        enc = parsed_content_type[2].get("charset")
  
      # Otherwise, infer the encoding
      if not enc and "json" in content_type:
@@ -55,11 +69,21 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
          if meta_charset:
              enc = meta_charset.group(1).decode("ascii", "ignore")
  
+    if not enc and "xml" in content_type:
+        if xml_encoding := re.search(
+            rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
+        ):
+            enc = xml_encoding.group(1).decode("ascii", "ignore")
+
      if not enc and "text/css" in content_type:
          # @charset rule must be the very first thing.
          css_charset = re.match(rb"""@charset "([^"]+)";""", content, re.IGNORECASE)
          if css_charset:
              enc = css_charset.group(1).decode("ascii", "ignore")
+        else:
+            # Fallback to utf8 for css
+            # Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+            enc = "utf8"
  
      # Fallback to latin-1
      if not enc:
diff --git a/test/mitmproxy/net/http/test_headers.py b/test/mitmproxy/net/http/test_headers.py

index cd8d9e98c610bcf033de00c962c4c66dc090294d..5d7687f5d3aa1201622cbee3d001061e2853ada5 100644 (file)
--- a/test/mitmproxy/net/http/test_headers.py
+++ b/test/mitmproxy/net/http/test_headers.py
@@ -37,23 +37,52 @@ def test_assemble_content_type():
          ("", b"foo", "latin-1"),
          ("", b"\xfc", "latin-1"),
          ("", b"\xf0\xe2", "latin-1"),
+        # bom
+        ("", b"\xef\xbb\xbffoo", "utf-8-sig"),
+        ("", b"\xff\xfef\x00o\x00o\x00", "utf-16le"),
+        ("", b"\xfe\xff\x00f\x00o\x00o", "utf-16be"),
+        ("", b"\xff\xfe\x00\x00f\x00\x00\x00o\x00\x00\x00o\x00\x00\x00", "utf-32le"),
+        ("", b"\x00\x00\xfe\xff\x00\x00\x00f\x00\x00\x00o\x00\x00\x00o", "utf-32be"),
+        # content-type charset
          ("text/html; charset=latin1", b"\xc3\xbc", "latin1"),
          ("text/html; charset=utf8", b"\xc3\xbc", "utf8"),
          # json
          ("application/json", b'"\xc3\xbc"', "utf8"),
-        # meta charset
+        # html meta charset
+        (
+            "text/html",
+            b'<meta charset="gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
+            "gb18030",
+        ),
          (
              "text/html",
              b'<meta http-equiv="content-type" '
              b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
              "gb18030",
          ),
+        # xml declaration encoding
+        (
+            "application/xml",
+            b'<?xml version="1.0" encoding="gb2312"?>'
+            b"<root>\xe6\x98\x8e\xe4\xbc\xaf</root>",
+            "gb18030",
+        ),
          # css charset
+        (
+            "text/css",
+            b'\xef\xbb\xbf@charset "UTF-8";.\xe5\xb9\xb3\xe5\x92\x8c,#div2 {color: green;}',
+            "utf-8-sig",
+        ),
          (
              "text/css",
              b'@charset "gb2312";#foo::before {content: "\xe6\x98\x8e\xe4\xbc\xaf"}',
              "gb18030",
          ),
+        (
+            "text/css",
+            b"h1 {}",
+            "utf8",
+        ),
      ],
  )
  def test_infer_content_encoding(content_type, content, expected):
author	Cheng Xu <3105373+xu-cheng@users.noreply.github.com>
	Mon, 27 Oct 2025 20:55:36 +0000 (13:55 -0700)
committer	GitHub <noreply@github.com>
	Mon, 27 Oct 2025 20:55:36 +0000 (20:55 +0000)
CHANGELOG.md		patch | blob | history
mitmproxy/net/http/headers.py		patch | blob | history
test/mitmproxy/net/http/test_headers.py		patch | blob | history