bpo-45467: Fix IncrementalDecoder and StreamReader in the "raw-unicode-escape" codec...

author Serhiy Storchaka <storchaka@gmail.com>

Thu, 14 Oct 2021 17:04:19 +0000 (20:04 +0300)

committer GitHub <noreply@github.com>

Thu, 14 Oct 2021 17:04:19 +0000 (20:04 +0300)
author Serhiy Storchaka <storchaka@gmail.com>
Thu, 14 Oct 2021 17:04:19 +0000 (20:04 +0300)
committer GitHub <noreply@github.com>
Thu, 14 Oct 2021 17:04:19 +0000 (20:04 +0300)
diff --git a/Include/cpython/unicodeobject.h b/Include/cpython/unicodeobject.h

index bc5a3b4bd0b99f4194a7ebbb5fc9a4c81eb914ea..ab4aebf5e70b9375eac4790446c7cc51187d88a5 100644 (file)
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@@ -796,6 +796,16 @@ PyAPI_FUNC(PyObject*) _PyUnicode_DecodeUnicodeEscapeInternal(
                                                string. */
  );
  
+/* --- Raw-Unicode-Escape Codecs ---------------------------------------------- */
+
+/* Variant of PyUnicode_DecodeRawUnicodeEscape that supports partial decoding. */
+PyAPI_FUNC(PyObject*) _PyUnicode_DecodeRawUnicodeEscapeStateful(
+        const char *string,     /* Raw-Unicode-Escape encoded string */
+        Py_ssize_t length,      /* size of string */
+        const char *errors,     /* error handling */
+        Py_ssize_t *consumed    /* bytes consumed */
+);
+
  /* --- Latin-1 Codecs ----------------------------------------------------- */
  
  PyAPI_FUNC(PyObject*) _PyUnicode_AsLatin1String(
diff --git a/Lib/encodings/raw_unicode_escape.py b/Lib/encodings/raw_unicode_escape.py

index 2b919b40d3788a7bd7dab3a983af091deb48a235..46c8e070dd192ed4bae9682d91e5d56413345cf1 100644 (file)
--- a/Lib/encodings/raw_unicode_escape.py
+++ b/Lib/encodings/raw_unicode_escape.py
@@ -21,15 +21,16 @@ class IncrementalEncoder(codecs.IncrementalEncoder):
      def encode(self, input, final=False):
          return codecs.raw_unicode_escape_encode(input, self.errors)[0]
  
-class IncrementalDecoder(codecs.IncrementalDecoder):
-    def decode(self, input, final=False):
-        return codecs.raw_unicode_escape_decode(input, self.errors)[0]
+class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
+    def _buffer_decode(self, input, errors, final):
+        return codecs.raw_unicode_escape_decode(input, errors, final)
  
  class StreamWriter(Codec,codecs.StreamWriter):
      pass
  
  class StreamReader(Codec,codecs.StreamReader):
-    pass
+    def decode(self, input, errors='strict'):
+        return codecs.raw_unicode_escape_decode(input, errors, False)
  
  ### encodings module API
  
diff --git a/Lib/test/test_codecs.py b/Lib/test/test_codecs.py

index 288a3006cdeb73973003c3d5bd757236d1845b4b..506b51c428fb57c77644bd0569bf9df3b61ed694 100644 (file)
--- a/Lib/test/test_codecs.py
+++ b/Lib/test/test_codecs.py
@@ -2483,7 +2483,11 @@ class UnicodeEscapeTest(ReadTest, unittest.TestCase):
              ]
          )
  
-class RawUnicodeEscapeTest(unittest.TestCase):
+class RawUnicodeEscapeTest(ReadTest, unittest.TestCase):
+    encoding = "raw-unicode-escape"
+
+    test_lone_surrogates = None
+
      def test_empty(self):
          self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
          self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))
@@ -2532,6 +2536,35 @@ class RawUnicodeEscapeTest(unittest.TestCase):
          self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
          self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
  
+    def test_partial(self):
+        self.check_partial(
+            "\x00\t\n\r\\\xff\uffff\U00010000",
+            [
+                '\x00',
+                '\x00\t',
+                '\x00\t\n',
+                '\x00\t\n\r',
+                '\x00\t\n\r',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff',
+                '\x00\t\n\r\\\xff\uffff\U00010000',
+            ]
+        )
+
  
  class EscapeEncodeTest(unittest.TestCase):
  
diff --git a/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst

new file mode 100644 (file)

index 0000000..f2c0ae4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst
@@ -0,0 +1,2 @@
+Fix incremental decoder and stream reader in the "raw-unicode-escape" codec.
+Previously they failed if an escape sequence was split across input chunks.
diff --git a/Modules/_codecsmodule.c b/Modules/_codecsmodule.c

index fc74127ce56c42898a878417270da11884056460..50afc097b35026ee129006b120b0b665cb9e5f5b 100644 (file)
--- a/Modules/_codecsmodule.c
+++ b/Modules/_codecsmodule.c
@@ -509,17 +509,20 @@ _codecs_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
  _codecs.raw_unicode_escape_decode
      data: Py_buffer(accept={str, buffer})
      errors: str(accept={str, NoneType}) = None
+    final: bool(accept={int}) = True
      /
  [clinic start generated code]*/
  
  static PyObject *
  _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                       const char *errors)
-/*[clinic end generated code: output=c98eeb56028070a6 input=d2f5159ce3b3392f]*/
+                                       const char *errors, int final)
+/*[clinic end generated code: output=11dbd96301e2879e input=2d166191beb3235a]*/
  {
-    PyObject *decoded = PyUnicode_DecodeRawUnicodeEscape(data->buf, data->len,
-                                                         errors);
-    return codec_tuple(decoded, data->len);
+    Py_ssize_t consumed = data->len;
+    PyObject *decoded = _PyUnicode_DecodeRawUnicodeEscapeStateful(data->buf, data->len,
+                                                                  errors,
+                                                                  final ? NULL : &consumed);
+    return codec_tuple(decoded, consumed);
  }
  
  /*[clinic input]
diff --git a/Modules/clinic/_codecsmodule.c.h b/Modules/clinic/_codecsmodule.c.h

index a7086dd6e18d7c600089faf3d57f5ebd96ff3fdf..855ac77a7f73fd82626df3783131d41e1298d33b 100644 (file)
--- a/Modules/clinic/_codecsmodule.c.h
+++ b/Modules/clinic/_codecsmodule.c.h
@@ -1143,7 +1143,7 @@ exit:
  }
  
  PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
-"raw_unicode_escape_decode($module, data, errors=None, /)\n"
+"raw_unicode_escape_decode($module, data, errors=None, final=True, /)\n"
  "--\n"
  "\n");
  
@@ -1152,7 +1152,7 @@ PyDoc_STRVAR(_codecs_raw_unicode_escape_decode__doc__,
  
  static PyObject *
  _codecs_raw_unicode_escape_decode_impl(PyObject *module, Py_buffer *data,
-                                       const char *errors);
+                                       const char *errors, int final);
  
  static PyObject *
  _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ssize_t nargs)
@@ -1160,8 +1160,9 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
      PyObject *return_value = NULL;
      Py_buffer data = {NULL, NULL};
      const char *errors = NULL;
+    int final = 1;
  
-    if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 2)) {
+    if (!_PyArg_CheckPositional("raw_unicode_escape_decode", nargs, 1, 3)) {
          goto exit;
      }
      if (PyUnicode_Check(args[0])) {
@@ -1202,8 +1203,15 @@ _codecs_raw_unicode_escape_decode(PyObject *module, PyObject *const *args, Py_ss
          _PyArg_BadArgument("raw_unicode_escape_decode", "argument 2", "str or None", args[1]);
          goto exit;
      }
+    if (nargs < 3) {
+        goto skip_optional;
+    }
+    final = _PyLong_AsInt(args[2]);
+    if (final == -1 && PyErr_Occurred()) {
+        goto exit;
+    }
  skip_optional:
-    return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors);
+    return_value = _codecs_raw_unicode_escape_decode_impl(module, &data, errors, final);
  
  exit:
      /* Cleanup for data */
@@ -2809,4 +2817,4 @@ exit:
  #ifndef _CODECS_CODE_PAGE_ENCODE_METHODDEF
      #define _CODECS_CODE_PAGE_ENCODE_METHODDEF
  #endif /* !defined(_CODECS_CODE_PAGE_ENCODE_METHODDEF) */
-/*[clinic end generated code: output=9e9fb1d5d81577e0 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=814dae36b6f885cb input=a9049054013a1b77]*/
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index af3b3335d60ca8c52522eba263bdab4769b6f7d7..386052f31bea246a54a982a459e9dbc4201b579b 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6379,8 +6379,6 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
          unsigned char c = (unsigned char) *s++;
          Py_UCS4 ch;
          int count;
-        Py_ssize_t startinpos;
-        Py_ssize_t endinpos;
          const char *message;
  
  #define WRITE_ASCII_CHAR(ch)                                                  \
@@ -6407,7 +6405,7 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
              continue;
          }
  
-        startinpos = s - starts - 1;
+        Py_ssize_t startinpos = s - starts - 1;
          /* \ - Escapes */
          if (s >= end) {
              message = "\\ at end of string";
@@ -6554,8 +6552,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
              *consumed = startinpos;
              break;
          }
-      error:
-        endinpos = s-starts;
+      error:;
+        Py_ssize_t endinpos = s-starts;
          writer.min_length = end - s + writer.pos;
          if (unicode_decode_call_errorhandler_writer(
                  errors, &errorHandler,
@@ -6735,9 +6733,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
  /* --- Raw Unicode Escape Codec ------------------------------------------- */
  
  PyObject *
-PyUnicode_DecodeRawUnicodeEscape(const char *s,
-                                 Py_ssize_t size,
-                                 const char *errors)
+_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
+                                          Py_ssize_t size,
+                                          const char *errors,
+                                          Py_ssize_t *consumed)
  {
      const char *starts = s;
      _PyUnicodeWriter writer;
@@ -6746,6 +6745,9 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
      PyObject *exc = NULL;
  
      if (size == 0) {
+        if (consumed) {
+            *consumed = 0;
+        }
          _Py_RETURN_UNICODE_EMPTY();
      }
  
@@ -6764,8 +6766,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
          unsigned char c = (unsigned char) *s++;
          Py_UCS4 ch;
          int count;
-        Py_ssize_t startinpos;
-        Py_ssize_t endinpos;
          const char *message;
  
  #define WRITE_CHAR(ch)                                                        \
@@ -6780,11 +6780,21 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
              } while(0)
  
          /* Non-escape characters are interpreted as Unicode ordinals */
-        if (c != '\\' || s >= end) {
+        if (c != '\\' || (s >= end && !consumed)) {
              WRITE_CHAR(c);
              continue;
          }
  
+        Py_ssize_t startinpos = s - starts - 1;
+        /* \ - Escapes */
+        if (s >= end) {
+            assert(consumed);
+            // Set message to silence a compiler warning.
+            // It is never actually used.
+            message = "\\ at end of string";
+            goto incomplete;
+        }
+
          c = (unsigned char) *s++;
          if (c == 'u') {
              count = 4;
@@ -6800,10 +6810,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
              WRITE_CHAR(c);
              continue;
          }
-        startinpos = s - starts - 2;
  
          /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
-        for (ch = 0; count && s < end; ++s, --count) {
+        for (ch = 0; count; ++s, --count) {
+            if (s >= end) {
+                goto incomplete;
+            }
              c = (unsigned char)*s;
              ch <<= 4;
              if (c >= '0' && c <= '9') {
@@ -6816,18 +6828,23 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
                  ch += c - ('A' - 10);
              }
              else {
-                break;
+                goto error;
              }
          }
-        if (!count) {
-            if (ch <= MAX_UNICODE) {
-                WRITE_CHAR(ch);
-                continue;
-            }
+        if (ch > MAX_UNICODE) {
              message = "\\Uxxxxxxxx out of range";
+            goto error;
          }
+        WRITE_CHAR(ch);
+        continue;
  
-        endinpos = s-starts;
+      incomplete:
+        if (consumed) {
+            *consumed = startinpos;
+            break;
+        }
+      error:;
+        Py_ssize_t endinpos = s-starts;
          writer.min_length = end - s + writer.pos;
          if (unicode_decode_call_errorhandler_writer(
                  errors, &errorHandler,
@@ -6849,7 +6866,14 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
      Py_XDECREF(errorHandler);
      Py_XDECREF(exc);
      return NULL;
+}
  
+PyObject *
+PyUnicode_DecodeRawUnicodeEscape(const char *s,
+                                 Py_ssize_t size,
+                                 const char *errors)
+{
+    return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
  }
author	Serhiy Storchaka <storchaka@gmail.com>
	Thu, 14 Oct 2021 17:04:19 +0000 (20:04 +0300)
committer	GitHub <noreply@github.com>
	Thu, 14 Oct 2021 17:04:19 +0000 (20:04 +0300)
Include/cpython/unicodeobject.h		patch \| blob \| blame \| history
Lib/encodings/raw_unicode_escape.py		patch \| blob \| blame \| history
Lib/test/test_codecs.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2021-10-14-13-31-19.bpo-45467.Q7Ma6A.rst	[new file with mode: 0644]	patch \| blob
Modules/_codecsmodule.c		patch \| blob \| blame \| history
Modules/clinic/_codecsmodule.c.h		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history