[3.9] bpo-43667: Fix broken Unicode encoding in non-UTF locales on Solaris (GH-25096...

author Jakub Kulík <Kulikjak@gmail.com>

Fri, 21 May 2021 14:59:39 +0000 (16:59 +0200)

committer GitHub <noreply@github.com>

Fri, 21 May 2021 14:59:39 +0000 (16:59 +0200)
author Jakub Kulík <Kulikjak@gmail.com>
Fri, 21 May 2021 14:59:39 +0000 (16:59 +0200)
committer GitHub <noreply@github.com>
Fri, 21 May 2021 14:59:39 +0000 (16:59 +0200)
diff --git a/Include/internal/pycore_fileutils.h b/Include/internal/pycore_fileutils.h

index bbee58617fd05e45ddc9adf146cf3ebb903e4551..8cf137bb4bdf9dc89ed24cb4cdde391f4588f980 100644 (file)
--- a/Include/internal/pycore_fileutils.h
+++ b/Include/internal/pycore_fileutils.h
@@ -48,6 +48,18 @@ PyAPI_FUNC(int) _Py_GetLocaleconvNumeric(
      PyObject **decimal_point,
      PyObject **thousands_sep);
  
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+/* Helpers for platforms where the internal form of wchar_t is not Unicode
+   in non-Unicode locales.  Defined in Python/fileutils.c. */
+
+/* Return non-zero when the current locale's codeset is neither "UTF-8"
+   nor "646" (ASCII), i.e. when wchar_t strings need conversion. */
+extern int _Py_LocaleUsesNonUnicodeWchar(void);
+
+/* Convert `size` native wide characters to UCS-4.  Return a newly
+   allocated buffer that must be released with PyMem_Free(), or NULL
+   with an exception set. */
+extern wchar_t* _Py_DecodeNonUnicodeWchar(
+    const wchar_t* native,
+    Py_ssize_t size);
+
+/* Convert `size` UCS-4 characters to the native wide character form,
+   overwriting the buffer.  Return 0 on success, or -1 with an exception
+   set. */
+extern int _Py_EncodeNonUnicodeWchar_InPlace(
+    wchar_t* unicode,
+    Py_ssize_t size);
+#endif
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 19326fa60e58c3ebcb2728e1f29f7e685246cc23..46a0956c8bb70e34aaad2cde2cc9349cde9b40f9 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -56,6 +56,10 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  #include <windows.h>
  #endif
  
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include "pycore_fileutils.h"     // _Py_LocaleUsesNonUnicodeWchar()
+#endif
+
  /* Uncomment to display statistics on interned strings at exit when
     using Valgrind or Insecure++. */
  /* #define INTERNED_STATS 1 */
@@ -2211,6 +2215,20 @@ PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
      if (size == 0)
          _Py_RETURN_UNICODE_EMPTY();
  
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion to UCS-4 first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
+        if (!converted) {
+            return NULL;
+        }
+        PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
+        PyMem_Free(converted);
+        return unicode;
+    }
+#endif
+
      /* Single character Unicode objects in the Latin-1 range are
         shared when using this constructor */
      if (size == 1 && (Py_UCS4)*u < 256)
@@ -3223,6 +3241,17 @@ PyUnicode_AsWideChar(PyObject *unicode,
          res = size;
      }
      unicode_copy_as_widechar(unicode, w, size);
+
+#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
+            return -1;
+        }
+    }
+#endif
+
      return res;
  }
  
@@ -3249,6 +3278,17 @@ PyUnicode_AsWideCharString(PyObject *unicode,
          return NULL;
      }
      unicode_copy_as_widechar(unicode, buffer, buflen + 1);
+
+#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris uses non-Unicode internal wchar_t form for
+       non-Unicode locales and hence needs conversion first. */
+    if (_Py_LocaleUsesNonUnicodeWchar()) {
+        if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
+            return NULL;
+        }
+    }
+#endif
+
      if (size != NULL) {
          *size = buflen;
      }
diff --git a/Python/fileutils.c b/Python/fileutils.c

index 769ab591ab43fb7f2a58bddc740a30386bf2e8d0..45ea2043912597ec11ea86cfea0b393b887c1d04 100644 (file)
--- a/Python/fileutils.c
+++ b/Python/fileutils.c
@@ -17,6 +17,10 @@ extern int winerror_to_errno(int);
  #include <sys/ioctl.h>
  #endif
  
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+#include <iconv.h>
+#endif
+
  #ifdef HAVE_FCNTL_H
  #include <fcntl.h>
  #endif /* HAVE_FCNTL_H */
@@ -96,6 +100,12 @@ _Py_device_encoding(int fd)
  static size_t
  is_valid_wide_char(wchar_t ch)
  {
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+    /* Oracle Solaris doesn't use Unicode code points as wchar_t encoding
+       for non-Unicode locales, which makes values higher than MAX_UNICODE
+       possibly valid. */
+    return 1;
+#endif
      if (Py_UNICODE_IS_SURROGATE(ch)) {
          // Reject lone surrogate characters
          return 0;
@@ -859,6 +869,102 @@ _Py_EncodeLocaleEx(const wchar_t *text, char **str,
                              current_locale, errors);
  }
  
+#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
+/* Tell whether wide character strings of the current locale must be
+   converted because their internal wchar_t form is not Unicode.
+
+   Return 1 when the locale codeset is neither "UTF-8" nor "646".
+   Return 0 otherwise, including when no codeset name is available. */
+int
+_Py_LocaleUsesNonUnicodeWchar(void)
+{
+    const char *name = nl_langinfo(CODESET);
+    if (name == NULL) {
+        return 0;
+    }
+
+    /* Oracle Solaris uses a non-Unicode internal wchar_t form for
+       non-Unicode locales.  "646" is the ISO/IEC 646 standard, which
+       corresponds to the ASCII encoding. */
+    if (strcmp(name, "UTF-8") == 0) {
+        return 0;
+    }
+    if (strcmp(name, "646") == 0) {
+        return 0;
+    }
+    return 1;
+}
+
+/* Convert `size` wide characters from `source` with iconv, from encoding
+   `fromcode` to encoding `tocode`.
+
+   The output buffer has the same byte size as the input, so each input
+   character is assumed to map to exactly one 4-byte output character.
+   The input does not have to be NUL-terminated, and no terminator is
+   appended to the result.
+
+   Return a newly allocated buffer; release it with PyMem_Free().
+   On failure return NULL with an exception set: MemoryError for an
+   invalid size or a failed allocation, ValueError when iconv fails. */
+static wchar_t *
+_Py_ConvertWCharForm(const wchar_t *source, Py_ssize_t size,
+                     const char *tocode, const char *fromcode)
+{
+    Py_BUILD_ASSERT(sizeof(wchar_t) == 4);
+
+    /* Reject negative sizes as well as sizes whose byte count would
+       overflow: a negative Py_ssize_t would wrap around once it is
+       multiplied by sizeof(wchar_t). */
+    if (size < 0 || size > (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t))) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    size_t nbytes = (size_t)size * sizeof(wchar_t);
+
+    wchar_t *target = PyMem_Malloc(nbytes);
+    if (target == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+
+    iconv_t cd = iconv_open(tocode, fromcode);
+    if (cd == (iconv_t)-1) {
+        PyErr_SetString(PyExc_ValueError, "iconv_open() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+
+    /* iconv() is declared with char ** arguments, hence the casts. */
+    char *inbuf = (char *) source;
+    char *outbuf = (char *) target;
+    size_t inbytesleft = nbytes;
+    size_t outbytesleft = nbytes;
+
+    size_t ret = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+    /* The descriptor is no longer needed whatever the outcome. */
+    iconv_close(cd);
+
+    /* iconv() reports failure by returning (size_t)-1. */
+    if (ret == (size_t)-1) {
+        PyErr_SetString(PyExc_ValueError, "iconv() failed");
+        PyMem_Free(target);
+        return NULL;
+    }
+    return target;
+}
+
+/* Decode a native wide character string into UCS-4.  Needed on systems
+   where the internal form of wchar_t is not made of Unicode code points
+   (e.g. Oracle Solaris).
+
+   On success, return a newly allocated string; the caller releases it
+   with PyMem_Free().  On a conversion or memory allocation error,
+   return NULL with an exception raised. */
+wchar_t *
+_Py_DecodeNonUnicodeWchar(const wchar_t *native, Py_ssize_t size)
+{
+    wchar_t *ucs4 = _Py_ConvertWCharForm(native, size,
+                                         "UCS-4-INTERNAL", "wchar_t");
+    return ucs4;
+}
+
+/* Encode a UCS-4 string into the native wide character form, overwriting
+   the caller's buffer.  Needed on systems where the internal form of
+   wchar_t is not made of Unicode code points (e.g. Oracle Solaris).
+
+   Overwriting in place is possible because wchar_t and UCS-4 both use
+   4-byte units and every wchar_t symbol corresponds to exactly one UCS-4
+   symbol and vice versa.  That holds for Oracle Solaris, currently the
+   only system using these functions; other systems may differ.
+
+   Return 0 on success.  On a conversion or memory allocation error,
+   return -1 with an exception raised. */
+int
+_Py_EncodeNonUnicodeWchar_InPlace(wchar_t *unicode, Py_ssize_t size)
+{
+    wchar_t *native = _Py_ConvertWCharForm(unicode, size,
+                                           "wchar_t", "UCS-4-INTERNAL");
+    if (native != NULL) {
+        /* Copy the converted characters back over the input. */
+        memcpy(unicode, native, size * sizeof(wchar_t));
+        PyMem_Free(native);
+        return 0;
+    }
+    return -1;
+}
+#endif /* HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION */
  
  #ifdef MS_WINDOWS
  static __int64 secs_between_epochs = 11644473600; /* Seconds between 1.1.1601 and 1.1.1970 */
diff --git a/configure b/configure

index 8dcdbf198900537f237cb1b91d1a10608e6baf60..c584866581df38c824af907b161151880bd62c0d 100755 (executable)
--- a/configure
+++ b/configure
@@ -15123,6 +15123,22 @@ else
  $as_echo "no" >&6; }
  fi
  
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+  if test -f /etc/os-release; then
+    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+    if test "x$OS_NAME" = "xOracle Solaris"; then
+      # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+      # non-Unicode locales is not Unicode and hence cannot be used directly.
+      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+
+$as_echo "#define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION 1" >>confdefs.h
+
+    fi
+  fi
+  ;;
+esac
+
  # check for endianness
   { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether byte ordering is bigendian" >&5
  $as_echo_n "checking whether byte ordering is bigendian... " >&6; }
diff --git a/configure.ac b/configure.ac

index b1e4c6ce19de8ff7f81b761e45658bd714f48a7c..a0750777c151356b98f00611e100ce06467151b5 100644 (file)
--- a/configure.ac
+++ b/configure.ac
@@ -4759,6 +4759,22 @@ else
    AC_MSG_RESULT(no)
  fi
  
+dnl Define HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION only when the system is
+dnl SunOS and /etc/os-release names it Oracle Solaris.  The awk call takes
+dnl the value of the NAME= line and drops its first and last character,
+dnl presumably the surrounding quotes.
+case $ac_sys_system/$ac_sys_release in
+SunOS/*)
+  if test -f /etc/os-release; then
+    OS_NAME=$(awk -F= '/^NAME=/ {print substr($2,2,length($2)-2)}' /etc/os-release)
+    if test "x$OS_NAME" = "xOracle Solaris"; then
+      # bpo-43667: In Oracle Solaris, the internal form of wchar_t in
+      # non-Unicode locales is not Unicode and hence cannot be used directly.
+      # https://docs.oracle.com/cd/E37838_01/html/E61053/gmwke.html
+      AC_DEFINE(HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION, 1,
+      [Define if the internal form of wchar_t in non-Unicode locales
+       is not Unicode.])
+    fi
+  fi
+  ;;
+esac
+
  # check for endianness
  AC_C_BIGENDIAN
  
diff --git a/pyconfig.h.in b/pyconfig.h.in

index 8510c8778b5690beca8440d37720e1ed63744e85..6358e568f4a6f8f71c8b3632060e0df1f2d54733 100644 (file)
--- a/pyconfig.h.in
+++ b/pyconfig.h.in
@@ -733,6 +733,10 @@
  /* Define to 1 if you have the `nice' function. */
  #undef HAVE_NICE
  
+/* Define if the internal form of wchar_t in non-Unicode locales is not
+   Unicode. */
+#undef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
+
  /* Define to 1 if you have the `openat' function. */
  #undef HAVE_OPENAT
author	Jakub Kulík <Kulikjak@gmail.com>
	Fri, 21 May 2021 14:59:39 +0000 (16:59 +0200)
committer	GitHub <noreply@github.com>
	Fri, 21 May 2021 14:59:39 +0000 (16:59 +0200)
Include/internal/pycore_fileutils.h		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/fileutils.c		patch \| blob \| blame \| history
configure		patch \| blob \| blame \| history
configure.ac		patch \| blob \| blame \| history
pyconfig.h.in		patch \| blob \| blame \| history