gh-123803: Support arbitrary code page encodings on Windows (GH-123804)

author Serhiy Storchaka <storchaka@gmail.com>

Mon, 18 Nov 2024 17:45:25 +0000 (19:45 +0200)

committer GitHub <noreply@github.com>

Mon, 18 Nov 2024 17:45:25 +0000 (17:45 +0000)
author Serhiy Storchaka <storchaka@gmail.com>
Mon, 18 Nov 2024 17:45:25 +0000 (19:45 +0200)
committer GitHub <noreply@github.com>
Mon, 18 Nov 2024 17:45:25 +0000 (17:45 +0000)
diff --git a/Doc/library/codecs.rst b/Doc/library/codecs.rst

index 2cfd8a1eaee806f743f6866474ac27b19a0c9092..a129a26190ba991f3c5bd8d910d25ca2a65254a4 100644 (file)
--- a/Doc/library/codecs.rst
+++ b/Doc/library/codecs.rst
@@ -1042,6 +1042,10 @@ is meant to be exhaustive. Notice that spelling alternatives that only differ in
  case or use a hyphen instead of an underscore are also valid aliases; therefore,
  e.g. ``'utf-8'`` is a valid alias for the ``'utf_8'`` codec.
  
+On Windows, ``cpXXX`` codecs are available for all code pages.
+But only codecs listed in the following table are guaranteed to exist on
+other platforms.
+
  .. impl-detail::
  
     Some common encodings can bypass the codecs lookup machinery to
@@ -1307,6 +1311,9 @@ particular, the following variants typically exist:
  .. versionchanged:: 3.8
     ``cp65001`` is now an alias to ``utf_8``.
  
+.. versionchanged:: 3.14
+   On Windows, ``cpXXX`` codecs are now available for all code pages.
+
  
  Python Specific Encodings
  -------------------------
diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst

index 958efbe73c1c27b9a48f530229cae9b0ed80d064..8196250d784843abec43ecaef2506b736d61eb01 100644 (file)
--- a/Doc/whatsnew/3.14.rst
+++ b/Doc/whatsnew/3.14.rst
@@ -194,6 +194,9 @@ Other language changes
    They raise an error if the argument is a string.
    (Contributed by Serhiy Storchaka in :gh:`84978`.)
  
+* All Windows code pages are now supported as "cpXXX" codecs on Windows.
+  (Contributed by Serhiy Storchaka in :gh:`123803`.)
+
  * :class:`super` objects are now :mod:`pickleable <pickle>` and
    :mod:`copyable <copy>`.
    (Contributed by Serhiy Storchaka in :gh:`125767`.)
diff --git a/Lib/encodings/__init__.py b/Lib/encodings/__init__.py

index f9075b8f0d98acf23cbf61ae62f4881aa5d15f79..298177eb8003a7af233fc286f2dfd954cb0cfd2d 100644 (file)
--- a/Lib/encodings/__init__.py
+++ b/Lib/encodings/__init__.py
@@ -156,19 +156,22 @@ def search_function(encoding):
  codecs.register(search_function)
  
  if sys.platform == 'win32':
-    # bpo-671666, bpo-46668: If Python does not implement a codec for current
-    # Windows ANSI code page, use the "mbcs" codec instead:
-    # WideCharToMultiByte() and MultiByteToWideChar() functions with CP_ACP.
-    # Python does not support custom code pages.
-    def _alias_mbcs(encoding):
+    from ._win_cp_codecs import create_win32_code_page_codec
+
+    def win32_code_page_search_function(encoding):
+        encoding = encoding.lower()
+        if not encoding.startswith('cp'):
+            return None
          try:
-            import _winapi
-            ansi_code_page = "cp%s" % _winapi.GetACP()
-            if encoding == ansi_code_page:
-                import encodings.mbcs
-                return encodings.mbcs.getregentry()
-        except ImportError:
-            # Imports may fail while we are shutting down
-            pass
+            cp = int(encoding[2:])
+        except ValueError:
+            return None
+        # Test if the code page is supported
+        try:
+            codecs.code_page_encode(cp, 'x')
+        except (OverflowError, OSError):
+            return None
+
+        return create_win32_code_page_codec(cp)
  
-    codecs.register(_alias_mbcs)
+    codecs.register(win32_code_page_search_function)
diff --git a/Lib/encodings/_win_cp_codecs.py b/Lib/encodings/_win_cp_codecs.py

new file mode 100644 (file)

index 0000000..4f8eb88
--- /dev/null
+++ b/Lib/encodings/_win_cp_codecs.py
@@ -0,0 +1,36 @@
+import codecs
+
+def create_win32_code_page_codec(cp):
+    from codecs import code_page_encode, code_page_decode
+
+    def encode(input, errors='strict'):
+        return code_page_encode(cp, input, errors)
+
+    def decode(input, errors='strict'):
+        return code_page_decode(cp, input, errors, True)
+
+    class IncrementalEncoder(codecs.IncrementalEncoder):
+        def encode(self, input, final=False):
+            return code_page_encode(cp, input, self.errors)[0]
+
+    class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+        def _buffer_decode(self, input, errors, final):
+            return code_page_decode(cp, input, errors, final)
+
+    class StreamWriter(codecs.StreamWriter):
+        def encode(self, input, errors='strict'):
+            return code_page_encode(cp, input, errors)
+
+    class StreamReader(codecs.StreamReader):
+        def decode(self, input, errors, final):
+            return code_page_decode(cp, input, errors, final)
+
+    return codecs.CodecInfo(
+        name=f'cp{cp}',
+        encode=encode,
+        decode=decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 290656f070503ad9c271ee1d7649982e744c7537..e51f7e0ee12b1f31403d36b4d95012e424eb023a 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3256,7 +3256,11 @@ class CodePageTest(unittest.TestCase):
              codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)
  
      def check_decode(self, cp, tests):
-        for raw, errors, expected in tests:
+        for raw, errors, expected, *rest in tests:
+            if rest:
+                altexpected, = rest
+            else:
+                altexpected = expected
              if expected is not None:
                  try:
                      decoded = codecs.code_page_decode(cp, raw, errors, True)
@@ -3273,8 +3277,21 @@ class CodePageTest(unittest.TestCase):
                  self.assertRaises(UnicodeDecodeError,
                      codecs.code_page_decode, cp, raw, errors, True)
  
+            if altexpected is not None:
+                decoded = raw.decode(f'cp{cp}', errors)
+                self.assertEqual(decoded, altexpected,
+                    '%a.decode("cp%s", %r)=%a != %a'
+                    % (raw, cp, errors, decoded, altexpected))
+            else:
+                self.assertRaises(UnicodeDecodeError,
+                    raw.decode, f'cp{cp}', errors)
+
      def check_encode(self, cp, tests):
-        for text, errors, expected in tests:
+        for text, errors, expected, *rest in tests:
+            if rest:
+                altexpected, = rest
+            else:
+                altexpected = expected
              if expected is not None:
                  try:
                      encoded = codecs.code_page_encode(cp, text, errors)
@@ -3285,18 +3302,26 @@ class CodePageTest(unittest.TestCase):
                      '%a.encode("cp%s", %r)=%a != %a'
                      % (text, cp, errors, encoded[0], expected))
                  self.assertEqual(encoded[1], len(text))
+
+                encoded = text.encode(f'cp{cp}', errors)
+                self.assertEqual(encoded, altexpected,
+                    '%a.encode("cp%s", %r)=%a != %a'
+                    % (text, cp, errors, encoded, altexpected))
              else:
                  self.assertRaises(UnicodeEncodeError,
                      codecs.code_page_encode, cp, text, errors)
+                self.assertRaises(UnicodeEncodeError,
+                    text.encode, f'cp{cp}', errors)
  
      def test_cp932(self):
          self.check_encode(932, (
              ('abc', 'strict', b'abc'),
              ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
+            ('\uf8f3', 'strict', b'\xff'),
              # test error handlers
              ('\xff', 'strict', None),
              ('[\xff]', 'ignore', b'[]'),
-            ('[\xff]', 'replace', b'[y]'),
+            ('[\xff]', 'replace', b'[y]', b'[?]'),
              ('[\u20ac]', 'replace', b'[?]'),
              ('[\xff]', 'backslashreplace', b'[\\xff]'),
              ('[\xff]', 'namereplace',
@@ -3310,12 +3335,12 @@ class CodePageTest(unittest.TestCase):
              (b'abc', 'strict', 'abc'),
              (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
              # invalid bytes
-            (b'[\xff]', 'strict', None),
-            (b'[\xff]', 'ignore', '[]'),
-            (b'[\xff]', 'replace', '[\ufffd]'),
-            (b'[\xff]', 'backslashreplace', '[\\xff]'),
-            (b'[\xff]', 'surrogateescape', '[\udcff]'),
-            (b'[\xff]', 'surrogatepass', None),
+            (b'[\xff]', 'strict', None, '[\uf8f3]'),
+            (b'[\xff]', 'ignore', '[]', '[\uf8f3]'),
+            (b'[\xff]', 'replace', '[\ufffd]', '[\uf8f3]'),
+            (b'[\xff]', 'backslashreplace', '[\\xff]', '[\uf8f3]'),
+            (b'[\xff]', 'surrogateescape', '[\udcff]', '[\uf8f3]'),
+            (b'[\xff]', 'surrogatepass', None, '[\uf8f3]'),
              (b'\x81\x00abc', 'strict', None),
              (b'\x81\x00abc', 'ignore', '\x00abc'),
              (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
@@ -3330,7 +3355,7 @@ class CodePageTest(unittest.TestCase):
              # test error handlers
              ('\u0141', 'strict', None),
              ('\u0141', 'ignore', b''),
-            ('\u0141', 'replace', b'L'),
+            ('\u0141', 'replace', b'L', b'?'),
              ('\udc98', 'surrogateescape', b'\x98'),
              ('\udc98', 'surrogatepass', None),
          ))
@@ -3340,6 +3365,59 @@ class CodePageTest(unittest.TestCase):
              (b'\xff', 'strict', '\xff'),
          ))
  
+    def test_cp708(self):
+        self.check_encode(708, (
+            ('abc2%', 'strict', b'abc2%'),
+            ('\u060c\u0621\u064a', 'strict',  b'\xac\xc1\xea'),
+            ('\u2562\xe7\xa0', 'strict',  b'\x86\x87\xff'),
+            ('\x9a\x9f', 'strict', b'\x9a\x9f'),
+            ('\u256b', 'strict', b'\xc0'),
+            # test error handlers
+            ('[\u0662]', 'strict',  None),
+            ('[\u0662]', 'ignore',  b'[]'),
+            ('[\u0662]', 'replace',  b'[?]'),
+            ('\udca0', 'surrogateescape', b'\xa0'),
+            ('\udca0', 'surrogatepass', None),
+        ))
+        self.check_decode(708, (
+            (b'abc2%', 'strict', 'abc2%'),
+            (b'\xac\xc1\xea', 'strict', '\u060c\u0621\u064a'),
+            (b'\x86\x87\xff', 'strict', '\u2562\xe7\xa0'),
+            (b'\x9a\x9f', 'strict', '\x9a\x9f'),
+            (b'\xc0', 'strict', '\u256b'),
+            # test error handlers
+            (b'\xa0', 'strict', None),
+            (b'[\xa0]', 'ignore', '[]'),
+            (b'[\xa0]', 'replace', '[\ufffd]'),
+            (b'[\xa0]', 'backslashreplace', '[\\xa0]'),
+            (b'[\xa0]', 'surrogateescape', '[\udca0]'),
+            (b'[\xa0]', 'surrogatepass', None),
+        ))
+
+    def test_cp20106(self):
+        self.check_encode(20106, (
+            ('abc', 'strict', b'abc'),
+            ('\xa7\xc4\xdf', 'strict',  b'@[~'),
+            # test error handlers
+            ('@', 'strict', None),
+            ('@', 'ignore', b''),
+            ('@', 'replace', b'?'),
+            ('\udcbf', 'surrogateescape', b'\xbf'),
+            ('\udcbf', 'surrogatepass', None),
+        ))
+        self.check_decode(20106, (
+            (b'abc', 'strict', 'abc'),
+            (b'@[~', 'strict', '\xa7\xc4\xdf'),
+            (b'\xe1\xfe', 'strict', 'a\xdf'),
+            # test error handlers
+            (b'(\xbf)', 'strict', None),
+            (b'(\xbf)', 'ignore', '()'),
+            (b'(\xbf)', 'replace', '(\ufffd)'),
+            (b'(\xbf)', 'backslashreplace', '(\\xbf)'),
+            (b'(\xbf)', 'surrogateescape', '(\udcbf)'),
+            (b'(\xbf)', 'surrogatepass', None),
+        ))
+
      def test_cp_utf7(self):
          cp = 65000
          self.check_encode(cp, (
@@ -3412,17 +3490,15 @@ class CodePageTest(unittest.TestCase):
                                            False)
          self.assertEqual(decoded, ('abc', 3))
  
-    def test_mbcs_alias(self):
-        # Check that looking up our 'default' codepage will return
-        # mbcs when we don't have a more specific one available
-        code_page = 99_999
-        name = f'cp{code_page}'
-        with mock.patch('_winapi.GetACP', return_value=code_page):
-            try:
-                codec = codecs.lookup(name)
-                self.assertEqual(codec.name, 'mbcs')
-            finally:
-                codecs.unregister(name)
+    def test_mbcs_code_page(self):
+        # Check that codec for the current Windows (ANSI) code page is
+        # always available.
+        try:
+            from _winapi import GetACP
+        except ImportError:
+            self.skipTest('requires _winapi.GetACP')
+        cp = GetACP()
+        codecs.lookup(f'cp{cp}')
  
      @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
      def test_large_input(self, size):
diff --git a/Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst b/Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst

new file mode 100644 (file)

index 0000000..3ad4d12
--- /dev/null
+++ b/Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst
@@ -0,0 +1 @@
+All Windows code pages are now supported as "cpXXX" codecs on Windows.
author	Serhiy Storchaka <storchaka@gmail.com>
	Mon, 18 Nov 2024 17:45:25 +0000 (19:45 +0200)
committer	GitHub <noreply@github.com>
	Mon, 18 Nov 2024 17:45:25 +0000 (17:45 +0000)
Doc/library/codecs.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.14.rst		patch \| blob \| blame \| history
Lib/encodings/__init__.py		patch \| blob \| blame \| history
Lib/encodings/_win_cp_codecs.py	[new file with mode: 0644]	patch \| blob
Lib/test/test_codecs.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Windows/2024-09-07-15-16-24.gh-issue-123803.J9VNQU.rst	[new file with mode: 0644]	patch \| blob