## Unreleased: mitmproxy next
+- `infer_content_encoding`: Fall back to UTF-8 for more content types
+ ([#7961](https://github.com/mitmproxy/mitmproxy/pull/7961), @xu-cheng)
- Remove `bless` from hex editors to avoid issues with macOS
([#7937](https://github.com/mitmproxy/mitmproxy/pull/7937), @caiquejjx)
- Improve `is_mostly_bin` check to support Chinese characters
)
if meta_charset:
enc = meta_charset.group(1).decode("ascii", "ignore")
+ else:
+ # Fallback to utf8 for html
+ # Ref: https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+ # > 9. [snip] the comprehensive UTF-8 encoding is suggested.
+ enc = "utf8"
if not enc and "xml" in content_type:
if xml_encoding := re.search(
rb"""<\?xml[^\?>]+encoding=['"]([^'"\?>]+)""", content, re.IGNORECASE
):
enc = xml_encoding.group(1).decode("ascii", "ignore")
+ else:
+ # Fallback to utf8 for xml
+ # Ref: https://datatracker.ietf.org/doc/html/rfc7303#section-8.5
+ # > the XML processor [snip] to determine an encoding of UTF-8.
+ enc = "utf8"
+
+ if not enc and ("javascript" in content_type or "ecmascript" in content_type):
+ # Fallback to utf8 for javascript
+ # Ref: https://datatracker.ietf.org/doc/html/rfc9239#section-4.2
+ # > 3. Else, the character encoding scheme is assumed to be UTF-8
+ enc = "utf8"
if not enc and "text/css" in content_type:
# @charset rule must be the very first thing.
else:
# Fallback to utf8 for css
# Ref: https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding
+ # > 4. Otherwise, return utf-8
enc = "utf8"
# Fallback to latin-1
],
[
"content-length",
- "10452"
+ "10453"
]
],
- "contentLength": 10452,
- "contentHash": "ebe20255d8922ecbc1a159357d0cb0eb1ef05b8cc5a9d59f2a9f7e6fb585d7d3",
+ "contentLength": 10453,
+ "contentHash": "8a7739925f4c03586479852df840b7061948832a7fda30c8c812d2ea4dd4c4f2",
"timestamp_start": 1680134339.498,
"timestamp_end": 1680134339.498
}
"timestamp_end": 1680134339.643
}
}
-]
\ No newline at end of file
+]
],
[
"content-length",
- "10452"
+ "10453"
]
],
- "contentLength": 10452,
- "contentHash": "ebe20255d8922ecbc1a159357d0cb0eb1ef05b8cc5a9d59f2a9f7e6fb585d7d3",
+ "contentLength": 10453,
+ "contentHash": "8a7739925f4c03586479852df840b7061948832a7fda30c8c812d2ea4dd4c4f2",
"timestamp_start": 1680135212.559,
"timestamp_end": 1680135212.5590725
}
"timestamp_end": 1680135212.609897
}
}
-]
\ No newline at end of file
+]
b'content="text/html;charset=gb2312">\xe6\x98\x8e\xe4\xbc\xaf',
"gb18030",
),
+ (
+ "text/html",
+ b"<html></html>",
+ "utf8",
+ ),
# xml declaration encoding
(
"application/xml",
b"<root>\xe6\x98\x8e\xe4\xbc\xaf</root>",
"gb18030",
),
+ (
+ "application/xml",
+ b'<?xml version="1.0"?>',
+ "utf8",
+ ),
# css charset
(
"text/css",
b"h1 {}",
"utf8",
),
+ # js
+ ("application/javascript", b"", "utf8"),
+ ("application/ecmascript", b"", "utf8"),
+ ("text/javascript", b"", "utf8"),
],
)
def test_infer_content_encoding(content_type, content, expected):