bpo-32030: Add _Py_EncodeUTF8_surrogateescape() (#4960)

author Victor Stinner <victor.stinner@gmail.com>

Thu, 21 Dec 2017 14:45:16 +0000 (15:45 +0100)

committer GitHub <noreply@github.com>

Thu, 21 Dec 2017 14:45:16 +0000 (15:45 +0100)
author Victor Stinner <victor.stinner@gmail.com>
Thu, 21 Dec 2017 14:45:16 +0000 (15:45 +0100)
committer GitHub <noreply@github.com>
Thu, 21 Dec 2017 14:45:16 +0000 (15:45 +0100)
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index c8600a894130f09034207521ddb77faf681b01fc..716e352dea611dfca33f74da497e64e611248991 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -5147,6 +5147,95 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size, size_t *p_wlen)
  }
  
  
+/* Encode the null-terminated wide character string text to UTF-8 using
+   the surrogateescape error handler.
+   On success, return a pointer to a newly allocated, null-terminated
+   character string (use PyMem_Free() to free the memory).
+
+   On encoding failure (a surrogate outside U+DC80..U+DCFF), return NULL and
+   write its index in text into *error_pos (if error_pos is not NULL).
+
+   On memory allocation failure, return NULL and write (size_t)-1 into
+   *error_pos (if error_pos is not NULL). */
+char*
+_Py_EncodeUTF8_surrogateescape(const wchar_t *text, size_t *error_pos)
+{
+    const Py_ssize_t max_char_size = 4;  /* longest sequence written below */
+    Py_ssize_t len = wcslen(text);
+
+    assert(len >= 0);
+
+    char *bytes;
+    if (len <= PY_SSIZE_T_MAX / max_char_size - 1) {  /* size cannot overflow */
+        bytes = PyMem_Malloc((len + 1) * max_char_size);
+    }
+    else {
+        bytes = NULL;
+    }
+    if (bytes == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        return NULL;
+    }
+
+    char *p = bytes;
+    Py_ssize_t i;
+    for (i = 0; i < len;) {
+        Py_UCS4 ch = text[i++];
+
+        if (ch < 0x80) {
+            /* Encode ASCII as a single byte */
+            *p++ = (char) ch;
+
+        }
+        else if (ch < 0x0800) {
+            /* Encode U+0080..U+07FF as a 2-byte sequence */
+            *p++ = (char)(0xc0 | (ch >> 6));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else if (Py_UNICODE_IS_SURROGATE(ch)) {
+            /* surrogateescape error handler: only U+DC80..U+DCFF is accepted */
+            if (!(0xDC80 <= ch && ch <= 0xDCFF)) {
+                if (error_pos != NULL) {
+                    *error_pos = (size_t)i - 1;  /* i already points past ch */
+                }
+                goto error;
+            }
+            *p++ = (char)(ch & 0xff);  /* emit the low byte, 0x80..0xFF */
+        }
+        else if (ch < 0x10000) {  /* 3-byte sequence */
+            *p++ = (char)(0xe0 | (ch >> 12));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+        else {  /* ch >= 0x10000 */
+            assert(ch <= MAX_UNICODE);  /* NOTE(review): assert-only guard */
+            /* Encode as a 4-byte sequence */
+            *p++ = (char)(0xf0 | (ch >> 18));
+            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
+            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
+            *p++ = (char)(0x80 | (ch & 0x3f));
+        }
+    }
+    *p++ = '\0';
+
+    size_t final_size = (p - bytes);  /* bytes written, including the '\0' */
+    char *bytes2 = PyMem_Realloc(bytes, final_size);  /* shrink to used size */
+    if (bytes2 == NULL) {
+        if (error_pos != NULL) {
+            *error_pos = (size_t)-1;
+        }
+        goto error;
+    }
+    return bytes2;
+
+ error:
+    PyMem_Free(bytes);  /* bytes is still owned here on every path */
+    return NULL;
+}
+
+
  /* Primary internal function which creates utf8 encoded bytes objects.
  
     Allocation strategy:  if the string is short, convert into a stack buffer
diff --git a/Python/fileutils.c b/Python/fileutils.c

index c4d495d0d63550336029353049ccbe8bc39985a0..eeb5f2e89d251c6f0f91d70185553c0853d77029 100644 (file)
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -22,6 +22,8 @@ extern int winerror_to_errno(int);
  
  extern wchar_t* _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size,
                                                 size_t *p_wlen);
+extern char* _Py_EncodeUTF8_surrogateescape(const wchar_t *text,
+                                            size_t *error_pos);
  
  #ifdef O_CLOEXEC
  /* Does open() support the O_CLOEXEC flag? Possible values:
@@ -418,42 +420,6 @@ Py_DecodeLocale(const char* arg, size_t *size)
  #endif   /* __APPLE__ or __ANDROID__ */
  }
  
-static char*
-_Py_EncodeLocaleUTF8(const wchar_t *text, size_t *error_pos)
-{
-    Py_ssize_t len;
-    PyObject *unicode, *bytes = NULL;
-    char *cpath;
-
-    unicode = PyUnicode_FromWideChar(text, wcslen(text));
-    if (unicode == NULL) {
-        return NULL;
-    }
-
-    bytes = _PyUnicode_AsUTF8String(unicode, "surrogateescape");
-    Py_DECREF(unicode);
-    if (bytes == NULL) {
-        PyErr_Clear();
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
-    }
-
-    len = PyBytes_GET_SIZE(bytes);
-    cpath = PyMem_Malloc(len+1);
-    if (cpath == NULL) {
-        PyErr_Clear();
-        Py_DECREF(bytes);
-        if (error_pos != NULL) {
-            *error_pos = (size_t)-1;
-        }
-        return NULL;
-    }
-    memcpy(cpath, PyBytes_AsString(bytes), len + 1);
-    Py_DECREF(bytes);
-    return cpath;
-}
  
  #if !defined(__APPLE__) && !defined(__ANDROID__)
  static char*
@@ -537,10 +503,10 @@ char*
  Py_EncodeLocale(const wchar_t *text, size_t *error_pos)
  {
  #if defined(__APPLE__) || defined(__ANDROID__)
-    return _Py_EncodeLocaleUTF8(text, error_pos);
+    return _Py_EncodeUTF8_surrogateescape(text, error_pos);
  #else   /* __APPLE__ */
      if (Py_UTF8Mode == 1) {
-        return _Py_EncodeLocaleUTF8(text, error_pos);
+        return _Py_EncodeUTF8_surrogateescape(text, error_pos);
      }
  
  #ifndef MS_WINDOWS
author	Victor Stinner <victor.stinner@gmail.com>
	Thu, 21 Dec 2017 14:45:16 +0000 (15:45 +0100)
committer	GitHub <noreply@github.com>
	Thu, 21 Dec 2017 14:45:16 +0000 (15:45 +0100)
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/fileutils.c		patch \| blob \| blame \| history