From 3b3beedef69f6ad8900e5d1cb5848469467dbeec Mon Sep 17 00:00:00 2001
From: =?utf8?q?Ca=C3=ADque=20Porfirio?= <56317416+caiquejjx@users.noreply.github.com>
Date: Thu, 30 Oct 2025 20:09:39 -0300
Subject: [PATCH] fix: is_mostly_bin support chinese characters (#7933)

* improve is_mostly_bin heuristic

* [autofix.ci] apply automated fixes

* nits

---------

Co-authored-by: Maximilian Hils
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 CHANGELOG.md                          |  2 ++
 mitmproxy/utils/strutils.py           | 40 ++++++++++++++++++++++++++++++++++++++--
 test/mitmproxy/utils/test_strutils.py | 17 ++++++++++++++++-
 3 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index db22a8b8d..0cdc8b56e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
 
 - Remove `bless` from hex editors to avoid issues with macOS
   ([#7937](https://github.com/mitmproxy/mitmproxy/pull/7937), @caiquejjx)
+- Improves `is_mostly_bin` check to support chinese characters
+  ([#7933](https://github.com/mitmproxy/mitmproxy/pull/7933), @caiquejjx, @mhils)
 - Fix various issues in infer_content_encoding
   ([#7928](https://github.com/mitmproxy/mitmproxy/pull/7928), @xu-cheng)
 
diff --git a/mitmproxy/utils/strutils.py b/mitmproxy/utils/strutils.py
index ff3396306..d7ab4a8e1 100644
--- a/mitmproxy/utils/strutils.py
+++ b/mitmproxy/utils/strutils.py
@@ -126,10 +126,46 @@ def escaped_str_to_bytes(data: str) -> bytes:
 
 
 def is_mostly_bin(s: bytes) -> bool:
+    """
+    Heuristically decide whether s looks like binary data rather than text.
+
+    Only the first ~100 bytes are inspected. Empty input, mostly printable ASCII,
+    and valid UTF-8 with few ASCII control characters all count as text (False).
+    """
-    if not s or len(s) == 0:
+    if not s:
         return False
 
-    return sum(i < 9 or 13 < i < 32 or 126 < i for i in s[:100]) / len(s[:100]) > 0.3
+    # Cut off at ~100 chars, but do it smartly so that if the input is UTF-8, we don't
+    # chop a multibyte code point in half.
+    if len(s) > 100:
+        for cut in range(100, 104):
+            is_continuation_byte = cut < len(s) and (s[cut] >> 6) == 0b10
+            if not is_continuation_byte:
+                # A new character starts here (or s ends), so we cut just before that.
+                s = s[:cut]
+                break
+        else:
+            s = s[:100]
+
+    low_bytes = sum(i < 9 or 13 < i < 32 for i in s)
+    high_bytes = sum(i > 126 for i in s)
+    ascii_bytes = len(s) - low_bytes - high_bytes
+
+    # Heuristic 1: If it's mostly printable ASCII, it's not bin.
+    if ascii_bytes / len(s) > 0.7:
+        return False
+
+    # Heuristic 2: If it's UTF-8 without too many ASCII control chars, it's not bin.
+    # Note that b"\x00\x00\x00" would be valid UTF-8, so we don't want to accept _any_
+    # UTF-8 with higher code points.
+    if (ascii_bytes + high_bytes) / len(s) > 0.95:
+        try:
+            s.decode()
+            return False
+        except ValueError:
+            pass
+
+    return True
 
 
 def is_xml(s: bytes) -> bool:
diff --git a/test/mitmproxy/utils/test_strutils.py b/test/mitmproxy/utils/test_strutils.py
index 97e4181a5..644e4ea0a 100644
--- a/test/mitmproxy/utils/test_strutils.py
+++ b/test/mitmproxy/utils/test_strutils.py
@@ -82,7 +82,22 @@ def test_escaped_str_to_bytes():
 def test_is_mostly_bin():
     assert not strutils.is_mostly_bin(b"foo\xff")
     assert strutils.is_mostly_bin(b"foo" + b"\xff" * 10)
-    assert not strutils.is_mostly_bin("")
+    assert not strutils.is_mostly_bin(b"")
+    assert strutils.is_mostly_bin(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09")
+    # shift UTF8 break point
+    # 𐍅 is four bytes in UTF-8, so we're breaking the 100 chars barrier.
+    assert not strutils.is_mostly_bin(b"" + 50 * "𐍅".encode())
+    assert not strutils.is_mostly_bin(b"a" + 50 * "𐍅".encode())
+    assert not strutils.is_mostly_bin(b"aa" + 50 * "𐍅".encode())
+    assert not strutils.is_mostly_bin(b"aaa" + 50 * "𐍅".encode())
+    assert not strutils.is_mostly_bin(b"aaaa" + 50 * "𐍅".encode())
+    assert not strutils.is_mostly_bin(b"aaaaa" + 50 * "𐍅".encode())
+    # only utf8 continuation chars
+    assert strutils.is_mostly_bin(150 * b"\x80")
+    # inputs of 101-103 bytes must not raise when they end in continuation bytes
+    assert not strutils.is_mostly_bin(b"a" * 99 + "é".encode())
+    assert not strutils.is_mostly_bin(b"a" + 25 * "𐍅".encode())
+    assert strutils.is_mostly_bin(101 * b"\x80")
 
 
 def test_is_xml():
-- 
2.43.0