gh-63161: Fix tokenize.detect_encoding() (GH-139446)
author     Serhiy Storchaka <storchaka@gmail.com>
           Mon, 20 Oct 2025 17:08:47 +0000 (20:08 +0300)
committer  GitHub <noreply@github.com>
           Mon, 20 Oct 2025 17:08:47 +0000 (20:08 +0300)
* Support a non-UTF-8 shebang and comments if a non-UTF-8 encoding is specified.
* Detect decoding errors for non-UTF-8 encodings.
* Detect null bytes in source code.
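
For illustration, a minimal sketch of the behavior after this change (it needs a build that includes the fix; the byte strings are ad-hoc examples, not taken from the test suite):

import io
import tokenize

# A non-UTF-8 byte (0xa4, the euro sign in ISO 8859-15) after the coding
# cookie is now accepted, because the line is checked against the declared
# encoding instead of being decoded as UTF-8 unconditionally.
src = b"#coding:iso-8859-15 \xa4\nprint('ok')\n"
encoding, consumed = tokenize.detect_encoding(io.BytesIO(src).readline)
assert encoding == 'iso-8859-15'

# Null bytes in the inspected lines now raise SyntaxError up front.
try:
    tokenize.detect_encoding(io.BytesIO(b"#coding:ascii\x00\n").readline)
except SyntaxError as exc:
    print(exc)  # source code cannot contain null bytes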

Lib/test/test_tokenize.py
Lib/tokenize.py
Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst [new file with mode: 0644]

diff --git a/Lib/test/test_tokenize.py b/Lib/test/test_tokenize.py
index 8fdd03f347b632c54713dee709c55b91ea16b6e6..d274726eed2e659a33a4246c2e74f0ad09f5c4be 100644 (file)
@@ -1495,6 +1495,61 @@ class TestDetectEncoding(TestCase):
         expected = [b"print('\xc2\xa3')\n"]
         self.assertEqual(consumed_lines, expected)
 
+    def test_first_non_utf8_coding_line(self):
+        lines = (
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:1]))
+
+    def test_first_utf8_coding_line_error(self):
+        lines = (
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_second_non_utf8_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso-8859-15 \xa4\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_second_utf8_coding_line_error(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:ascii \xc3\xa4\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_non_utf8_shebang(self):
+        lines = (
+            b'#!/home/\xa4/bin/python\n',
+            b'#coding:iso-8859-15\n',
+            b'print(something)\n'
+        )
+        encoding, consumed_lines = tokenize.detect_encoding(self.get_readline(lines))
+        self.assertEqual(encoding, 'iso-8859-15')
+        self.assertEqual(consumed_lines, list(lines[:2]))
+
+    def test_utf8_shebang_error(self):
+        lines = (
+            b'#!/home/\xc3\xa4/bin/python\n',
+            b'#coding:ascii\n',
+            b'print(something)\n'
+        )
+        with self.assertRaises(SyntaxError):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_cookie_second_line_empty_first_line(self):
         lines = (
             b'\n',
@@ -1548,6 +1603,28 @@ class TestDetectEncoding(TestCase):
         self.assertEqual(encoding, 'utf-8')
         self.assertEqual(consumed_lines, list(lines[:1]))
 
+    def test_nul_in_first_coding_line(self):
+        lines = (
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
+    def test_nul_in_second_coding_line(self):
+        lines = (
+            b'#!/usr/bin/python\n',
+            b'#coding:iso8859-15\x00\n',
+            b'\n',
+            b'print(something)\n'
+        )
+        with self.assertRaisesRegex(SyntaxError,
+                "source code cannot contain null bytes"):
+            tokenize.detect_encoding(self.get_readline(lines))
+
     def test_latin1_normalization(self):
         # See get_normal_name() in Parser/tokenizer/helpers.c.
         encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 7e71755068e1df39f02d94d6eef42ef57e15b8e3..1f31258ce361c945f4e370a6b44efa3277911ecd 100644 (file)
@@ -36,7 +36,7 @@ from token import *
 from token import EXACT_TOKEN_TYPES
 import _tokenize
 
-cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
+cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
 blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
 
 import token
@@ -385,22 +385,23 @@ def detect_encoding(readline):
         except StopIteration:
             return b''
 
-    def find_cookie(line):
+    def check(line, encoding):
+        # Check if the line matches the encoding.
+        if 0 in line:
+            raise SyntaxError("source code cannot contain null bytes")
         try:
-            # Decode as UTF-8. Either the line is an encoding declaration,
-            # in which case it should be pure ASCII, or it must be UTF-8
-            # per default encoding.
-            line_string = line.decode('utf-8')
+            line.decode(encoding)
         except UnicodeDecodeError:
             msg = "invalid or missing encoding declaration"
             if filename is not None:
                 msg = '{} for {!r}'.format(msg, filename)
             raise SyntaxError(msg)
 
-        match = cookie_re.match(line_string)
+    def find_cookie(line):
+        match = cookie_re.match(line)
         if not match:
             return None
-        encoding = _get_normal_name(match.group(1))
+        encoding = _get_normal_name(match.group(1).decode())
         try:
             codec = lookup(encoding)
         except LookupError:
@@ -433,18 +434,23 @@ def detect_encoding(readline):
 
     encoding = find_cookie(first)
     if encoding:
+        check(first, encoding)
         return encoding, [first]
     if not blank_re.match(first):
+        check(first, default)
         return default, [first]
 
     second = read_or_stop()
     if not second:
+        check(first, default)
         return default, [first]
 
     encoding = find_cookie(second)
     if encoding:
+        check(first + second, encoding)
         return encoding, [first, second]
 
+    check(first + second, default)
     return default, [first, second]
 
 
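Condensed outside the diff, the new shape of the logic: the cookie regex now runs on raw bytes, and every consumed line is validated against whichever encoding wins before detect_encoding() returns. A simplified, hypothetical rendering of that flow (the helper names `_check` and `_find_cookie` and the stripped-down error handling are illustrative, not the stdlib's exact code):

import re
from codecs import lookup

# Bytes pattern; re.ASCII is a no-op for bytes but mirrors the diff above.
cookie_re = re.compile(br'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)

def _check(line, encoding):
    # Reject null bytes first, then make sure the raw line is valid in
    # the encoding that will actually be used to read the file.
    if 0 in line:
        raise SyntaxError("source code cannot contain null bytes")
    try:
        line.decode(encoding)
    except UnicodeDecodeError:
        raise SyntaxError("invalid or missing encoding declaration")

def _find_cookie(line):
    # The pattern is a bytes pattern now, so the captured cookie name
    # must be decoded; it is ASCII by construction of the regex.
    match = cookie_re.match(line)
    if not match:
        return None
    encoding = match.group(1).decode()
    lookup(encoding)  # raises LookupError for unknown codecs
    return encoding
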
diff --git a/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst b/Misc/NEWS.d/next/Library/2025-09-30-12-52-54.gh-issue-63161.mECM1A.rst
new file mode 100644 (file)
index 0000000..3daed20
--- /dev/null
@@ -0,0 +1,3 @@
+Fix :func:`tokenize.detect_encoding`. Support non-UTF-8 shebang and comments
+if non-UTF-8 encoding is specified. Detect decoding error for non-UTF-8
+encoding. Detect null bytes in source code.