[3.13] gh-128013: fix data race in PyUnicode_AsUTF8AndSize on free-threading (#128021...
author      Kumar Aditya <kumaraditya@python.org>
            Thu, 2 Jan 2025 16:40:17 +0000 (22:10 +0530)
committer   GitHub <noreply@github.com>
            Thu, 2 Jan 2025 16:40:17 +0000 (22:10 +0530)
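
PyUnicode_AsUTF8AndSize() fills a string's cached UTF-8 representation lazily. On the free-threaded build, two threads could race to fill that cache, and a plain read could observe the utf8 pointer without the bytes and length written behind it. The change below turns the UTF-8 and hash accessor macros in Objects/unicodeobject.c into inline functions that go through the FT_ATOMIC_* wrappers (acquire/release for the utf8 pointer, relaxed for the cached hash), routes the lazy fill through a new unicode_ensure_utf8() helper that re-checks inside a per-object critical section, and adds a threaded regression test, test_asutf8_race.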
Lib/test/test_capi/test_unicode.py
Objects/unicodeobject.c

diff --git a/Lib/test/test_capi/test_unicode.py b/Lib/test/test_capi/test_unicode.py
index a69f817c515ba7d41e6861838d2ec9b60b00c823..f750ec1a56fef9da47a50791826aa118e6d14882 100644
--- a/Lib/test/test_capi/test_unicode.py
+++ b/Lib/test/test_capi/test_unicode.py
@@ -1,7 +1,7 @@
 import unittest
 import sys
 from test import support
-from test.support import import_helper
+from test.support import threading_helper, import_helper
 
 try:
     import _testcapi
@@ -959,6 +959,24 @@ class CAPITest(unittest.TestCase):
         self.assertRaises(TypeError, unicode_asutf8, [], 0)
         # CRASHES unicode_asutf8(NULL, 0)
 
+    @unittest.skipIf(_testcapi is None, 'need _testcapi module')
+    @threading_helper.requires_working_threading()
+    def test_asutf8_race(self):
+        """Test that there's no race condition in PyUnicode_AsUTF8()"""
+        unicode_asutf8 = _testcapi.unicode_asutf8
+        from threading import Thread
+
+        data = "😊"
+
+        def worker():
+            for _ in range(1000):
+                self.assertEqual(unicode_asutf8(data, 5), b'\xf0\x9f\x98\x8a\0')
+
+        threads = [Thread(target=worker) for _ in range(10)]
+        with threading_helper.start_threads(threads):
+            pass
+
+
     @support.cpython_only
     @unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
     def test_asutf8andsize(self):
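
The same stress pattern can also be driven through the public C API. A minimal sketch, assuming a free-threaded CPython build and POSIX threads; the thread count, iteration count, and the helper names (worker, shared) are illustrative, not part of the commit:

    /* Hammer PyUnicode_AsUTF8AndSize() on one shared str from many threads. */
    #include <Python.h>
    #include <pthread.h>
    #include <assert.h>

    static PyObject *shared;                 /* the str object shared by all threads */

    static void *worker(void *arg)
    {
        (void)arg;
        PyGILState_STATE st = PyGILState_Ensure();   /* attach a thread state */
        for (int i = 0; i < 100000; i++) {
            Py_ssize_t size;
            const char *utf8 = PyUnicode_AsUTF8AndSize(shared, &size);
            assert(utf8 != NULL && size == 4);       /* U+1F60A is 4 UTF-8 bytes */
        }
        PyGILState_Release(st);
        return NULL;
    }

    int main(void)
    {
        Py_Initialize();
        shared = PyUnicode_FromString("\xF0\x9F\x98\x8A");  /* "😊" */

        pthread_t threads[10];
        PyThreadState *save = PyEval_SaveThread();   /* let the workers attach */
        for (int i = 0; i < 10; i++) {
            pthread_create(&threads[i], NULL, worker, NULL);
        }
        for (int i = 0; i < 10; i++) {
            pthread_join(threads[i], NULL);
        }
        PyEval_RestoreThread(save);

        Py_DECREF(shared);
        Py_Finalize();
        return 0;
    }
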
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c
index d4ce7fd7c9797013d18bf9de5be4ce43e394d563..7e61d615418722548373523707467f12c372fe3e 100644
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -111,20 +111,42 @@ NOTE: In the interpreter's initialization phase, some globals are currently
 #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
 #endif
 
-#define _PyUnicode_UTF8(op)                             \
-    (_PyCompactUnicodeObject_CAST(op)->utf8)
-#define PyUnicode_UTF8(op)                              \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
-         ((char*)(_PyASCIIObject_CAST(op) + 1)) :       \
-         _PyUnicode_UTF8(op))
-#define _PyUnicode_UTF8_LENGTH(op)                      \
-    (_PyCompactUnicodeObject_CAST(op)->utf8_length)
-#define PyUnicode_UTF8_LENGTH(op)                       \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
-         _PyASCIIObject_CAST(op)->length :              \
-         _PyUnicode_UTF8_LENGTH(op))
+static inline char* _PyUnicode_UTF8(PyObject *op)
+{
+    return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
+}
+
+static inline char* PyUnicode_UTF8(PyObject *op)
+{
+    assert(_PyUnicode_CHECK(op));
+    if (PyUnicode_IS_COMPACT_ASCII(op)) {
+        return ((char*)(_PyASCIIObject_CAST(op) + 1));
+    }
+    else {
+         return _PyUnicode_UTF8(op);
+    }
+}
+
+static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
+{
+    FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
+}
+
+static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
+{
+    assert(_PyUnicode_CHECK(op));
+    if (PyUnicode_IS_COMPACT_ASCII(op)) {
+         return _PyASCIIObject_CAST(op)->length;
+    }
+    else {
+         return _PyCompactUnicodeObject_CAST(op)->utf8_length;
+    }
+}
+
+static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
+{
+    _PyCompactUnicodeObject_CAST(op)->utf8_length = length;
+}
 
 #define _PyUnicode_LENGTH(op)                           \
     (_PyASCIIObject_CAST(op)->length)
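
The point of the acquire/release pair above: once a reader's acquire load in PyUnicode_UTF8() returns a non-NULL pointer, it also sees every write made before the matching release store in PyUnicode_SET_UTF8(). A standalone C11 analogy of that publication pattern (not CPython code; cache_t, cache_get and cache_set are invented names):

    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    typedef struct {
        _Atomic(char *) utf8;      /* published pointer: release/acquire */
        size_t utf8_length;        /* plain field, written before publication */
    } cache_t;

    static const char *cache_get(cache_t *c)
    {
        /* acquire: a non-NULL result implies the memcpy and the length
           store in cache_set() are visible as well */
        return atomic_load_explicit(&c->utf8, memory_order_acquire);
    }

    static int cache_set(cache_t *c, const char *bytes, size_t len)
    {
        /* concurrent fillers must still be serialized by the caller;
           the commit does that with a per-object critical section */
        char *buf = malloc(len + 1);
        if (buf == NULL) {
            return -1;
        }
        memcpy(buf, bytes, len);
        buf[len] = '\0';
        c->utf8_length = len;                                        /* before publish */
        atomic_store_explicit(&c->utf8, buf, memory_order_release);  /* publish last */
        return 0;
    }

On the default (GIL) build the FT_ATOMIC_* wrappers collapse to ordinary loads and stores, so these inline functions behave essentially as the old macros did.
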
@@ -132,26 +154,37 @@ NOTE: In the interpreter's initialization phase, some globals are currently
     (_PyASCIIObject_CAST(op)->state)
 #define _PyUnicode_HASH(op)                             \
     (_PyASCIIObject_CAST(op)->hash)
-#define _PyUnicode_KIND(op)                             \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     _PyASCIIObject_CAST(op)->state.kind)
-#define _PyUnicode_GET_LENGTH(op)                       \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     _PyASCIIObject_CAST(op)->length)
+
+static inline Py_hash_t PyUnicode_HASH(PyObject *op)
+{
+    assert(_PyUnicode_CHECK(op));
+    return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash);
+}
+
+static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
+{
+    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
+}
+
 #define _PyUnicode_DATA_ANY(op)                         \
     (_PyUnicodeObject_CAST(op)->data.any)
 
-#define _PyUnicode_SHARE_UTF8(op)                       \
-    (assert(_PyUnicode_CHECK(op)),                      \
-     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
-     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
+static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
+{
+    assert(_PyUnicode_CHECK(op));
+    assert(!PyUnicode_IS_COMPACT_ASCII(op));
+    return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
+}
 
 /* true if the Unicode object has an allocated UTF-8 memory block
    (not shared with other data) */
-#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
-    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
-      && _PyUnicode_UTF8(op)                            \
-      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
+static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
+{
+    return (!PyUnicode_IS_COMPACT_ASCII(op)
+            && _PyUnicode_UTF8(op) != NULL
+            && _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
+}
+
 
 /* Generic helper macro to convert characters of different types.
    from_type and to_type have to be valid type names, begin and end
@@ -650,7 +683,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
                                  || kind == PyUnicode_2BYTE_KIND
                                  || kind == PyUnicode_4BYTE_KIND);
             CHECK(ascii->state.ascii == 0);
-            CHECK(compact->utf8 != data);
+            CHECK(_PyUnicode_UTF8(op) != data);
         }
         else {
             PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
@@ -662,16 +695,17 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
             CHECK(ascii->state.compact == 0);
             CHECK(data != NULL);
             if (ascii->state.ascii) {
-                CHECK(compact->utf8 == data);
+                CHECK(_PyUnicode_UTF8(op) == data);
                 CHECK(compact->utf8_length == ascii->length);
             }
             else {
-                CHECK(compact->utf8 != data);
+                CHECK(_PyUnicode_UTF8(op) != data);
             }
         }
-
-        if (compact->utf8 == NULL)
+#ifndef Py_GIL_DISABLED
+        if (_PyUnicode_UTF8(op) == NULL)
             CHECK(compact->utf8_length == 0);
+#endif
     }
 
     /* check that the best kind is used: O(n) operation */
@@ -1115,8 +1149,8 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
 
     if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
         PyMem_Free(_PyUnicode_UTF8(unicode));
-        _PyUnicode_UTF8(unicode) = NULL;
-        _PyUnicode_UTF8_LENGTH(unicode) = 0;
+        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
+        PyUnicode_SET_UTF8(unicode, NULL);
     }
 #ifdef Py_TRACE_REFS
     _Py_ForgetReference(unicode);
@@ -1169,8 +1203,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
     if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
     {
         PyMem_Free(_PyUnicode_UTF8(unicode));
-        _PyUnicode_UTF8(unicode) = NULL;
-        _PyUnicode_UTF8_LENGTH(unicode) = 0;
+        PyUnicode_SET_UTF8_LENGTH(unicode, 0);
+        PyUnicode_SET_UTF8(unicode, NULL);
     }
 
     data = (PyObject *)PyObject_Realloc(data, new_size);
@@ -1180,8 +1214,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
     }
     _PyUnicode_DATA_ANY(unicode) = data;
     if (share_utf8) {
-        _PyUnicode_UTF8(unicode) = data;
-        _PyUnicode_UTF8_LENGTH(unicode) = length;
+        PyUnicode_SET_UTF8_LENGTH(unicode, length);
+        PyUnicode_SET_UTF8(unicode, data);
     }
     _PyUnicode_LENGTH(unicode) = length;
     PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
@@ -1411,12 +1445,12 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
 
     assert(unicode != NULL);
     assert(_PyUnicode_CHECK(unicode));
-    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
+    assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
     ucs4_out = PyUnicode_4BYTE_DATA(unicode);
 
     for (iter = begin; iter < end; ) {
         assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
-                           _PyUnicode_GET_LENGTH(unicode)));
+                           PyUnicode_GET_LENGTH(unicode)));
         if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
             && (iter+1) < end
             && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
@@ -1430,7 +1464,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
         }
     }
     assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
-                        _PyUnicode_GET_LENGTH(unicode)));
+                        PyUnicode_GET_LENGTH(unicode)));
 
 }
 #endif
@@ -1801,7 +1835,7 @@ unicode_modifiable(PyObject *unicode)
     assert(_PyUnicode_CHECK(unicode));
     if (Py_REFCNT(unicode) != 1)
         return 0;
-    if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1)
+    if (PyUnicode_HASH(unicode) != -1)
         return 0;
     if (PyUnicode_CHECK_INTERNED(unicode))
         return 0;
@@ -4052,6 +4086,21 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 
 static int unicode_fill_utf8(PyObject *unicode);
 
+
+static int
+unicode_ensure_utf8(PyObject *unicode)
+{
+    int err = 0;
+    if (PyUnicode_UTF8(unicode) == NULL) {
+        Py_BEGIN_CRITICAL_SECTION(unicode);
+        if (PyUnicode_UTF8(unicode) == NULL) {
+            err = unicode_fill_utf8(unicode);
+        }
+        Py_END_CRITICAL_SECTION();
+    }
+    return err;
+}
+
 const char *
 PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
 {
@@ -4063,13 +4112,11 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
         return NULL;
     }
 
-    if (PyUnicode_UTF8(unicode) == NULL) {
-        if (unicode_fill_utf8(unicode) == -1) {
-            if (psize) {
-                *psize = -1;
-            }
-            return NULL;
+    if (unicode_ensure_utf8(unicode) == -1) {
+        if (psize) {
+            *psize = -1;
         }
+        return NULL;
     }
 
     if (psize) {
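
unicode_ensure_utf8() is double-checked initialization: an optimistic lock-free check, then a re-check inside the per-object critical section so that only one thread runs unicode_fill_utf8(). A self-contained analogy with a plain mutex standing in for Py_BEGIN_CRITICAL_SECTION(unicode); build_utf8 and ensure_utf8 are invented stand-ins:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>
    #include <string.h>

    static _Atomic(char *) utf8_cache;                    /* NULL until filled once */
    static pthread_mutex_t fill_lock = PTHREAD_MUTEX_INITIALIZER;

    static char *build_utf8(void)
    {
        /* stands in for unicode_fill_utf8(): allocate and fill the buffer */
        char *buf = malloc(8);
        if (buf != NULL) {
            memcpy(buf, "example", 8);
        }
        return buf;
    }

    static char *ensure_utf8(void)
    {
        /* fast path: already published; acquire pairs with the release below */
        char *cached = atomic_load_explicit(&utf8_cache, memory_order_acquire);
        if (cached != NULL) {
            return cached;
        }
        pthread_mutex_lock(&fill_lock);
        /* re-check under the lock: another thread may have filled it meanwhile */
        cached = atomic_load_explicit(&utf8_cache, memory_order_acquire);
        if (cached == NULL) {
            cached = build_utf8();        /* may fail and leave the cache NULL */
            if (cached != NULL) {
                atomic_store_explicit(&utf8_cache, cached, memory_order_release);
            }
        }
        pthread_mutex_unlock(&fill_lock);
        return cached;
    }

Locking the object itself rather than a global mutex means unrelated strings never contend, and the fast path stays a single acquire load.
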
@@ -5401,6 +5448,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
 static int
 unicode_fill_utf8(PyObject *unicode)
 {
+    _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
     /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
     assert(!PyUnicode_IS_ASCII(unicode));
 
@@ -5442,10 +5490,10 @@ unicode_fill_utf8(PyObject *unicode)
         PyErr_NoMemory();
         return -1;
     }
-    _PyUnicode_UTF8(unicode) = cache;
-    _PyUnicode_UTF8_LENGTH(unicode) = len;
     memcpy(cache, start, len);
     cache[len] = '\0';
+    PyUnicode_SET_UTF8_LENGTH(unicode, len);
+    PyUnicode_SET_UTF8(unicode, cache);
     _PyBytesWriter_Dealloc(&writer);
     return 0;
 }
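
Note the reordering here: the buffer is copied and NUL-terminated, the length is stored, and only then is the pointer published through PyUnicode_SET_UTF8()'s release store, so a reader whose acquire load returns a non-NULL pointer also sees the finished buffer and the matching utf8_length. The _Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED assertion records the new precondition that callers hold the per-object critical section.
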
@@ -10996,9 +11044,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
         return 0;
     }
 
-    Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni));
+    Py_hash_t right_hash = PyUnicode_HASH(right_uni);
     assert(right_hash != -1);
-    Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left));
+    Py_hash_t hash = PyUnicode_HASH(left);
     if (hash != -1 && hash != right_hash) {
         return 0;
     }
@@ -11484,14 +11532,14 @@ unicode_hash(PyObject *self)
 #ifdef Py_DEBUG
     assert(_Py_HashSecret_Initialized);
 #endif
-    Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self));
+    Py_hash_t hash = PyUnicode_HASH(self);
     if (hash != -1) {
         return hash;
     }
     x = _Py_HashBytes(PyUnicode_DATA(self),
                       PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
 
-    FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x);
+    PyUnicode_SET_HASH(self, x);
     return x;
 }
 
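The hash cache gets by with relaxed ordering because the value is a pure function of the string's immutable contents: racing writers store identical results, and nothing else is published through the field. A tiny standalone sketch of that idea (compute_hash and get_hash are invented names; the constant is a placeholder):

    #include <stdatomic.h>

    static _Atomic long cached_hash = -1;       /* -1 means "not computed yet" */

    static long compute_hash(void)
    {
        /* stands in for _Py_HashBytes() over the string's immutable data */
        return 123456789;
    }

    static long get_hash(void)
    {
        long h = atomic_load_explicit(&cached_hash, memory_order_relaxed);
        if (h != -1) {
            return h;                           /* any thread's cached store is fine */
        }
        h = compute_hash();
        atomic_store_explicit(&cached_hash, h, memory_order_relaxed);
        return h;
    }
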
@@ -14888,8 +14936,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
     _PyUnicode_STATE(self).compact = 0;
     _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
     _PyUnicode_STATE(self).statically_allocated = 0;
-    _PyUnicode_UTF8_LENGTH(self) = 0;
-    _PyUnicode_UTF8(self) = NULL;
+    PyUnicode_SET_UTF8_LENGTH(self, 0);
+    PyUnicode_SET_UTF8(self, NULL);
     _PyUnicode_DATA_ANY(self) = NULL;
 
     share_utf8 = 0;
@@ -14919,8 +14967,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
 
     _PyUnicode_DATA_ANY(self) = data;
     if (share_utf8) {
-        _PyUnicode_UTF8_LENGTH(self) = length;
-        _PyUnicode_UTF8(self) = data;
+        PyUnicode_SET_UTF8_LENGTH(self, length);
+        PyUnicode_SET_UTF8(self, data);
     }
 
     memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));