- Remove `bless` from hex editors to avoid issues with macOS
([#7937](https://github.com/mitmproxy/mitmproxy/pull/7937), @caiquejjx)
+- Improve `is_mostly_bin` check to support Chinese characters
+ ([#7933](https://github.com/mitmproxy/mitmproxy/pull/7933), @caiquejjx, @mhils)
- Fix various issues in `infer_content_encoding`
([#7928](https://github.com/mitmproxy/mitmproxy/pull/7928), @xu-cheng)
def is_mostly_bin(s: bytes) -> bool:
    """Heuristically decide whether *s* looks like binary data rather than text.

    Only roughly the first 100 bytes are inspected. Empty input is never
    considered binary. Input is treated as text if it is mostly printable
    ASCII, or if it is valid UTF-8 that contains almost no ASCII control
    characters; everything else is treated as binary.
    """
    if not s:
        return False

    # Cut off at ~100 bytes, but do it smartly so that if the input is UTF-8, we don't
    # chop a multibyte code point in half.
    if len(s) > 100:
        for cut in range(100, 104):
            # The end of the input is a character boundary too. Checking for it
            # first keeps us from indexing past the end when the input is only
            # 101-103 bytes long and ends in continuation bytes.
            if cut >= len(s) or (s[cut] >> 6) != 0b10:
                # A new character (or the end of input) starts here,
                # so we cut off just before that.
                s = s[:cut]
                break
        else:
            # Four continuation bytes in a row cannot be valid UTF-8 anyway.
            s = s[:100]

    low_bytes = sum(i < 9 or 13 < i < 32 for i in s)
    high_bytes = sum(i > 126 for i in s)
    ascii_bytes = len(s) - low_bytes - high_bytes

    # Heuristic 1: If it's mostly printable ASCII, it's not bin.
    if ascii_bytes / len(s) > 0.7:
        return False

    # Heuristic 2: If it's UTF-8 without too many ASCII control chars, it's not bin.
    # Note that b"\x00\x00\x00" would be valid UTF-8, so we don't want to accept _any_
    # UTF-8 with higher code points.
    if (ascii_bytes + high_bytes) / len(s) > 0.95:
        try:
            s.decode()
        except UnicodeDecodeError:
            pass
        else:
            return False

    return True
def is_xml(s: bytes) -> bool:
def test_is_mostly_bin():
    """Exercise the binary-vs-text heuristic of ``strutils.is_mostly_bin``."""
    looks_binary = strutils.is_mostly_bin

    assert not looks_binary(b"foo\xff")
    assert looks_binary(b"foo" + b"\xff" * 10)
    assert not looks_binary(b"")
    assert looks_binary(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09")

    # shift UTF8 break point
    # 𐍅 is four bytes in UTF-8, so we're breaking the 100 chars barrier.
    four_byte_char = "𐍅".encode()
    for prefix_len in range(6):
        payload = b"a" * prefix_len + 50 * four_byte_char
        assert not looks_binary(payload)

    # only utf8 continuation chars
    assert looks_binary(150 * b"\x80")
def test_is_xml():