]> git.feebdaed.xyz Git - 0xmirror/mitmproxy.git/commitdiff
infer_content_encoding: Fallback to UTF-8 for more content types (#7961)
authorCheng Xu <3105373+xu-cheng@users.noreply.github.com>
Sun, 9 Nov 2025 14:18:13 +0000 (06:18 -0800)
committerGitHub <noreply@github.com>
Sun, 9 Nov 2025 14:18:13 +0000 (15:18 +0100)
For html, css, js, and xml content types, the official specs tell
us to use UTF-8 as the fallback when no charset is specified.

Relevant sections of the specs are included in the comments for
the corresponding branches.

This commit also uncovered incorrect decoding in the following HAR
test files:

test/mitmproxy/data/har_files/firefox.json
test/mitmproxy/data/har_files/safari.json

```diff
- * Licensed MIT © Zeno Rocha
+ * Licensed MIT  Zeno Rocha
```

Note that the UTF-8 character `©` was missing due to the incorrect
encoding fallback.

CHANGELOG.md
mitmproxy/net/http/headers.py
test/mitmproxy/data/har_files/firefox.json
test/mitmproxy/data/har_files/safari.json
test/mitmproxy/net/http/test_headers.py

index 9072baa83d8ca0332d5c8e78f1cb0bfc833e71ee..1f8683c99c0640dd69edaf1b8fe126524312a701 100644 (file)
@@ -7,6 +7,8 @@
 
 ## Unreleased: mitmproxy next
 
+- infer_content_encoding: Fallback to UTF-8 for more content types
+  ([#7961](https://github.com/mitmproxy/mitmproxy/pull/7961), @xu-cheng)
 - Remove `bless` from hex editors to avoid issues with macOS
   ([#7937](https://github.com/mitmproxy/mitmproxy/pull/7937), @caiquejjx)
 - Improves `is_mostly_bin` check to support chinese characters
index 4521c9c5e29369133bff975c1af4e027b6b0a022..3bad1d2b5bf99fb3aa58e81324299b0b490058c2 100644 (file)
@@ -68,12 +68,28 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
         )
         if meta_charset:
             enc = meta_charset.group(1).decode("ascii", "ignore")
+        else:
+            # Fallback to utf8 for html
+            # Ref: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+            # > 9. [snip] the comprehensive UTF-8 encoding is suggested.
+            enc = "utf8"
 
     if not enc and "xml" in content_type:
         if xml_encoding := re.search(
             rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
         ):
             enc = xml_encoding.group(1).decode("ascii", "ignore")
+        else:
+            # Fallback to utf8 for xml
+            # Ref: https://datatracker.ietf.org/doc/html/rfc7303#section-8.5
+            # > the XML processor [snip] to determine an encoding of UTF-8.
+            enc = "utf8"
+
+    if not enc and ("javascript" in content_type or "ecmascript" in content_type):
+        # Fallback to utf8 for javascript
+        # Ref: https://datatracker.ietf.org/doc/html/rfc9239#section-4.2
+        # > 3. Else, the character encoding scheme is assumed to be UTF-8
+        enc = "utf8"
 
     if not enc and "text/css" in content_type:
         # @charset rule must be the very first thing.
@@ -83,6 +99,7 @@ def infer_content_encoding(content_type: str, content: bytes = b"") -> str:
         else:
             # Fallback to utf8 for css
             # Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+            # > 4. Otherwise, return utf-8
             enc = "utf8"
 
     # Fallback to latin-1
index 5e7f7cef58080c4d972e1141e89aaf35219edc1c..71619c64ab23bc3a14711c38e6c7c3b2d01331f6 100644 (file)
                 ],
                 [
                     "content-length",
-                    "10452"
+                    "10453"
                 ]
             ],
-            "contentLength": 10452,
-            "contentHash": "ebe20255d8922ecbc1a159357d0cb0eb1ef05b8cc5a9d59f2a9f7e6fb585d7d3",
+            "contentLength": 10453,
+            "contentHash": "8a7739925f4c03586479852df840b7061948832a7fda30c8c812d2ea4dd4c4f2",
             "timestamp_start": 1680134339.498,
             "timestamp_end": 1680134339.498
         }
             "timestamp_end": 1680134339.643
         }
     }
-]
\ No newline at end of file
+]
index 15872c0524adde4ccde6a734f3227b5cca74e304..bc30d77d17b285a1ed33da0f86b20fc66b767fa1 100644 (file)
                 ],
                 [
                     "content-length",
-                    "10452"
+                    "10453"
                 ]
             ],
-            "contentLength": 10452,
-            "contentHash": "ebe20255d8922ecbc1a159357d0cb0eb1ef05b8cc5a9d59f2a9f7e6fb585d7d3",
+            "contentLength": 10453,
+            "contentHash": "8a7739925f4c03586479852df840b7061948832a7fda30c8c812d2ea4dd4c4f2",
             "timestamp_start": 1680135212.559,
             "timestamp_end": 1680135212.5590725
         }
             "timestamp_end": 1680135212.609897
         }
     }
-]
\ No newline at end of file
+]
index 5d7687f5d3aa1201622cbee3d001061e2853ada5..21dccad3769811b3557cb7fe87f4186beef73c73 100644 (file)
@@ -60,6 +60,11 @@ def test_assemble_content_type():
             b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
             "gb18030",
         ),
+        (
+            "text/html",
+            b"<html></html>",
+            "utf8",
+        ),
         # xml declaration encoding
         (
             "application/xml",
@@ -67,6 +72,11 @@ def test_assemble_content_type():
             b"<root>\xe6\x98\x8e\xe4\xbc\xaf</root>",
             "gb18030",
         ),
+        (
+            "application/xml",
+            b'<?xml version="1.0"?>',
+            "utf8",
+        ),
         # css charset
         (
             "text/css",
@@ -83,6 +93,10 @@ def test_assemble_content_type():
             b"h1 {}",
             "utf8",
         ),
+        # js
+        ("application/javascript", b"", "utf8"),
+        ("application/ecmascript", b"", "utf8"),
+        ("text/javascript", b"", "utf8"),
     ],
 )
 def test_infer_content_encoding(content_type, content, expected):