infer_content_encoding: Fallback to UTF-8 for more content types (#7961)

author Cheng Xu <3105373+xu-cheng@users.noreply.github.com>

Sun, 9 Nov 2025 14:18:13 +0000 (06:18 -0800)

committer GitHub <noreply@github.com>

Sun, 9 Nov 2025 14:18:13 +0000 (15:18 +0100)
author Cheng Xu <3105373+xu-cheng@users.noreply.github.com>
Sun, 9 Nov 2025 14:18:13 +0000 (06:18 -0800)
committer GitHub <noreply@github.com>
Sun, 9 Nov 2025 14:18:13 +0000 (15:18 +0100)
diff --git a/CHANGELOG.md b/CHANGELOG.md

index 9072baa83d8ca0332d5c8e78f1cb0bfc833e71ee..1f8683c99c0640dd69edaf1b8fe126524312a701 100644 (file)
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,8 @@
  
  ## Unreleased: mitmproxy next
  
+- infer_content_encoding: Fallback to UTF-8 for more content types
+  ([#7961](https://github.com/mitmproxy/mitmproxy/pull/7961), @xu-cheng)
  - Remove `bless` from hex editors to avoid issues with macOS
    ([#7937](https://github.com/mitmproxy/mitmproxy/pull/7937), @caiquejjx)
  - Improves `is_mostly_bin` check to support chinese characters
diff --git a/mitmproxy/net/http/headers.py b/mitmproxy/net/http/headers.py

index 4521c9c5e29369133bff975c1af4e027b6b0a022..3bad1d2b5bf99fb3aa58e81324299b0b490058c2 100644 (file)
--- a/mitmproxy/net/http/headers.py
+++ b/mitmproxy/net/http/headers.py
@@ -68,12 +68,28 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
          )
          if meta_charset:
              enc = meta_charset.group(1).decode("ascii", "ignore")
+        else:
+            # Fallback to utf8 for html
+            # Ref: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+            # > 9. [snip] the comprehensive UTF-8 encoding is suggested.
+            enc = "utf8"
  
      if not enc and "xml" in content_type:
          if xml_encoding := re.search(
              rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
          ):
              enc = xml_encoding.group(1).decode("ascii", "ignore")
+        else:
+            # Fallback to utf8 for xml
+            # Ref: https://datatracker.ietf.org/doc/html/rfc7303#section-8.5
+            # > the XML processor [snip] to determine an encoding of UTF-8.
+            enc = "utf8"
+
+    if not enc and ("javascript" in content_type or "ecmascript" in content_type):
+        # Fallback to utf8 for javascript
+        # Ref: https://datatracker.ietf.org/doc/html/rfc9239#section-4.2
+        # > 3. Else, the character encoding scheme is assumed to be UTF-8
+        enc = "utf8"
  
      if not enc and "text/css" in content_type:
          # @charset rule must be the very first thing.
@@ -83,6 +99,7 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
          else:
              # Fallback to utf8 for css
              # Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+            # > 4. Otherwise, return utf-8
              enc = "utf8"
  
      # Fallback to latin-1
diff --git a/test/mitmproxy/data/har_files/firefox.json b/test/mitmproxy/data/har_files/firefox.json

index 5e7f7cef58080c4d972e1141e89aaf35219edc1c..71619c64ab23bc3a14711c38e6c7c3b2d01331f6 100644 (file)
--- a/test/mitmproxy/data/har_files/firefox.json
+++ b/test/mitmproxy/data/har_files/firefox.json
@@ -1024,11 +1024,11 @@
                  ],
                  [
                      "content-length",
-                    "10452"
+                    "10453"
                  ]
              ],
-            "contentLength": 10452,
-            "contentHash": "ebe20255d8922ecbc1a159357d0cb0eb1ef05b8cc5a9d59f2a9f7e6fb585d7d3",
+            "contentLength": 10453,
+            "contentHash": "8a7739925f4c03586479852df840b7061948832a7fda30c8c812d2ea4dd4c4f2",
              "timestamp_start": 1680134339.498,
              "timestamp_end": 1680134339.498
          }
@@ -2105,4 +2105,4 @@
              "timestamp_end": 1680134339.643
          }
      }
-]
-\ No newline at end of file
+]
diff --git a/test/mitmproxy/data/har_files/safari.json b/test/mitmproxy/data/har_files/safari.json

index 15872c0524adde4ccde6a734f3227b5cca74e304..bc30d77d17b285a1ed33da0f86b20fc66b767fa1 100644 (file)
--- a/test/mitmproxy/data/har_files/safari.json
+++ b/test/mitmproxy/data/har_files/safari.json
@@ -1045,11 +1045,11 @@
                  ],
                  [
                      "content-length",
-                    "10452"
+                    "10453"
                  ]
              ],
-            "contentLength": 10452,
-            "contentHash": "ebe20255d8922ecbc1a159357d0cb0eb1ef05b8cc5a9d59f2a9f7e6fb585d7d3",
+            "contentLength": 10453,
+            "contentHash": "8a7739925f4c03586479852df840b7061948832a7fda30c8c812d2ea4dd4c4f2",
              "timestamp_start": 1680135212.559,
              "timestamp_end": 1680135212.5590725
          }
@@ -2564,4 +2564,4 @@
              "timestamp_end": 1680135212.609897
          }
      }
-]
-\ No newline at end of file
+]
diff --git a/test/mitmproxy/net/http/test_headers.py b/test/mitmproxy/net/http/test_headers.py

index 5d7687f5d3aa1201622cbee3d001061e2853ada5..21dccad3769811b3557cb7fe87f4186beef73c73 100644 (file)
--- a/test/mitmproxy/net/http/test_headers.py
+++ b/test/mitmproxy/net/http/test_headers.py
@@ -60,6 +60,11 @@ def test_assemble_content_type():
              b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
              "gb18030",
          ),
+        (
+            "text/html",
+            b"<html></html>",
+            "utf8",
+        ),
          # xml declaration encoding
          (
              "application/xml",
@@ -67,6 +72,11 @@ def test_assemble_content_type():
              b"<root>\xe6\x98\x8e\xe4\xbc\xaf</root>",
              "gb18030",
          ),
+        (
+            "application/xml",
+            b'<?xml version="1.0"?>',
+            "utf8",
+        ),
          # css charset
          (
              "text/css",
@@ -83,6 +93,10 @@ def test_assemble_content_type():
              b"h1 {}",
              "utf8",
          ),
+        # js
+        ("application/javascript", b"", "utf8"),
+        ("application/ecmascript", b"", "utf8"),
+        ("text/javascript", b"", "utf8"),
      ],
  )
  def test_infer_content_encoding(content_type, content, expected):
author	Cheng Xu <3105373+xu-cheng@users.noreply.github.com>
	Sun, 9 Nov 2025 14:18:13 +0000 (06:18 -0800)
committer	GitHub <noreply@github.com>
	Sun, 9 Nov 2025 14:18:13 +0000 (15:18 +0100)
CHANGELOG.md		patch | blob | history
mitmproxy/net/http/headers.py		patch | blob | history
test/mitmproxy/data/har_files/firefox.json		patch | blob | history
test/mitmproxy/data/har_files/safari.json		patch | blob | history
test/mitmproxy/net/http/test_headers.py		patch | blob | history