bpo-36312: Fix decoders for some code pages. (GH-12369)

author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>

Thu, 21 Mar 2019 04:31:57 +0000 (21:31 -0700)

committer GitHub <noreply@github.com>

Thu, 21 Mar 2019 04:31:57 +0000 (21:31 -0700)
author Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
Thu, 21 Mar 2019 04:31:57 +0000 (21:31 -0700)
committer GitHub <noreply@github.com>
Thu, 21 Mar 2019 04:31:57 +0000 (21:31 -0700)
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 5c2de212b1992ffad760cebf4f66b4cfd59763af..293dfbc61aba10ecb40598aec5dc8f62510da1e6 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -3159,6 +3159,15 @@ class CodePageTest(unittest.TestCase):
              ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
          ))
  
+    def test_code_page_decode_flags(self):
+        # Issue #36312: For some code pages (e.g. UTF-7) flags for
+        # MultiByteToWideChar() must be set to 0.
+        for cp in (50220, 50221, 50222, 50225, 50227, 50229,
+                   *range(57002, 57011+1), 65000):
+            self.assertEqual(codecs.code_page_decode(cp, b'abc'), ('abc', 3))
+        self.assertEqual(codecs.code_page_decode(42, b'abc'),
+                         ('\uf061\uf062\uf063', 3))
+
      def test_incremental(self):
          decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
          self.assertEqual(decoded, ('', 0))
diff --git a/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst b/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst

new file mode 100644 (file)

index 0000000..8b325db
--- /dev/null
+++ b/Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst
@@ -0,0 +1,2 @@
+Fixed decoders for the following code pages: 50220, 50221, 50222, 50225,
+50227, 50229, 57002 through 57011, 65000 and 42.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index b67ffac4e9fb98466246df28baef948c0abd8e66..adcf69d4e53935630f84a9acc7d36f8695f88a11 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -7123,15 +7123,21 @@ decode_code_page_strict(UINT code_page,
                          const char *in,
                          int insize)
  {
-    const DWORD flags = decode_code_page_flags(code_page);
+    DWORD flags = MB_ERR_INVALID_CHARS;
      wchar_t *out;
      DWORD outsize;
  
      /* First get the size of the result */
      assert(insize > 0);
-    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
-    if (outsize <= 0)
-        goto error;
+    while ((outsize = MultiByteToWideChar(code_page, flags,
+                                          in, insize, NULL, 0)) <= 0)
+    {
+        if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
+            goto error;
+        }
+        /* For some code pages (e.g. UTF-7) flags must be set to 0. */
+        flags = 0;
+    }
  
      if (*v == NULL) {
          /* Create unicode object */
@@ -7177,7 +7183,7 @@ decode_code_page_errors(UINT code_page,
  {
      const char *startin = in;
      const char *endin = in + size;
-    const DWORD flags = decode_code_page_flags(code_page);
+    DWORD flags = MB_ERR_INVALID_CHARS;
      /* Ideally, we should get reason from FormatMessage. This is the Windows
         2000 English version of the message. */
      const char *reason = "No mapping for the Unicode character exists "
@@ -7248,6 +7254,11 @@ decode_code_page_errors(UINT code_page,
              if (outsize > 0)
                  break;
              err = GetLastError();
+            if (err == ERROR_INVALID_FLAGS && flags) {
+                /* For some code pages (e.g. UTF-7) flags must be set to 0. */
+                flags = 0;
+                continue;
+            }
              if (err != ERROR_NO_UNICODE_TRANSLATION
                  && err != ERROR_INSUFFICIENT_BUFFER)
              {
author	Miss Islington (bot) <31488909+miss-islington@users.noreply.github.com>
	Thu, 21 Mar 2019 04:31:57 +0000 (21:31 -0700)
committer	GitHub <noreply@github.com>
	Thu, 21 Mar 2019 04:31:57 +0000 (21:31 -0700)
Lib/test/test_codecs.py		patch | blob | blame | history
Misc/NEWS.d/next/Windows/2019-03-16-16-51-17.bpo-36312.Niwm-T.rst	[new file with mode: 0644]	patch | blob
Objects/unicodeobject.c		patch | blob | blame | history