From 3b3beedef69f6ad8900e5d1cb5848469467dbeec Mon Sep 17 00:00:00 2001
From: =?utf8?q?Ca=C3=ADque=20Porfirio?=
 <56317416+caiquejjx@users.noreply.github.com>
Date: Thu, 30 Oct 2025 20:09:39 -0300
Subject: [PATCH] fix: is_mostly_bin support chinese characters (#7933)

* improve is_mostly_bin heuristic

* [autofix.ci] apply automated fixes

* nits

---------

Co-authored-by: Maximilian Hils <git@maximilianhils.com>
Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
---
 CHANGELOG.md                          |  2 ++
 mitmproxy/utils/strutils.py           | 34 +++++++++++++++++++++++++--
 test/mitmproxy/utils/test_strutils.py | 13 +++++++++-
 3 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index db22a8b8d..0cdc8b56e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@
 
 - Remove `bless` from hex editors to avoid issues with macOS
   ([#7937](https://github.com/mitmproxy/mitmproxy/pull/7937), @caiquejjx)
+- Improve `is_mostly_bin` check to support Chinese characters
+  ([#7933](https://github.com/mitmproxy/mitmproxy/pull/7933), @caiquejjx, @mhils)
 - Fix various issues in infer_content_encoding
   ([#7928](https://github.com/mitmproxy/mitmproxy/pull/7928), @xu-cheng)
 
diff --git a/mitmproxy/utils/strutils.py b/mitmproxy/utils/strutils.py
index ff3396306..d7ab4a8e1 100644
--- a/mitmproxy/utils/strutils.py
+++ b/mitmproxy/utils/strutils.py
@@ -126,10 +126,40 @@ def escaped_str_to_bytes(data: str) -> bytes:
 
 
 def is_mostly_bin(s: bytes) -> bool:
-    if not s or len(s) == 0:
+    if not s:
         return False
 
-    return sum(i < 9 or 13 < i < 32 or 126 < i for i in s[:100]) / len(s[:100]) > 0.3
+    # Cut off at ~100 bytes, but do it smartly so that if the input is UTF-8, we don't
+    # chop a multibyte code point in half.
+    if len(s) > 100:
+        for cut in range(100, 104):
+            is_continuation_byte = cut < len(s) and (s[cut] >> 6) == 0b10
+            if not is_continuation_byte:
+                # A new character starts (or the input ends) here: cut just before it.
+                s = s[:cut]
+                break
+        else:
+            s = s[:100]
+
+    low_bytes = sum(i < 9 or 13 < i < 32 for i in s)
+    high_bytes = sum(i > 126 for i in s)
+    ascii_bytes = len(s) - low_bytes - high_bytes
+
+    # Heuristic 1: If it's mostly printable ASCII, it's not bin.
+    if ascii_bytes / len(s) > 0.7:
+        return False
+
+    # Heuristic 2: If it's UTF-8 without too many ASCII control chars, it's not bin.
+    # Note that b"\x00\x00\x00" would be valid UTF-8, so we don't want to accept _any_
+    # UTF-8 with higher code points.
+    if (ascii_bytes + high_bytes) / len(s) > 0.95:
+        try:
+            s.decode()
+            return False
+        except ValueError:
+            pass
+
+    return True
 
 
 def is_xml(s: bytes) -> bool:
diff --git a/test/mitmproxy/utils/test_strutils.py b/test/mitmproxy/utils/test_strutils.py
index 97e4181a5..644e4ea0a 100644
--- a/test/mitmproxy/utils/test_strutils.py
+++ b/test/mitmproxy/utils/test_strutils.py
@@ -82,7 +82,18 @@ def test_escaped_str_to_bytes():
 def test_is_mostly_bin():
     assert not strutils.is_mostly_bin(b"foo\xff")
     assert strutils.is_mostly_bin(b"foo" + b"\xff" * 10)
-    assert not strutils.is_mostly_bin("")
+    assert not strutils.is_mostly_bin(b"")
+    assert strutils.is_mostly_bin(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09")
+    # shift UTF8 break point
+    # U+1F600 is four bytes in UTF-8, so we're breaking the 100 chars barrier.
+    assert not strutils.is_mostly_bin(b"" + 50 * "\U0001f600".encode())
+    assert not strutils.is_mostly_bin(b"a" + 50 * "\U0001f600".encode())
+    assert not strutils.is_mostly_bin(b"aa" + 50 * "\U0001f600".encode())
+    assert not strutils.is_mostly_bin(b"aaa" + 50 * "\U0001f600".encode())
+    assert not strutils.is_mostly_bin(b"aaaa" + 50 * "\U0001f600".encode())
+    assert not strutils.is_mostly_bin(b"aaaaa" + 50 * "\U0001f600".encode())
+    # only utf8 continuation chars
+    assert strutils.is_mostly_bin(150 * b"\x80")
 
 
 def test_is_xml():
-- 
2.43.0