bpo-39465: Fix _PyUnicode_FromId() for subinterpreters (GH-20058)

author Victor Stinner <vstinner@python.org>

Fri, 25 Dec 2020 23:41:46 +0000 (00:41 +0100)

committer GitHub <noreply@github.com>

Fri, 25 Dec 2020 23:41:46 +0000 (00:41 +0100)
author Victor Stinner <vstinner@python.org>
Fri, 25 Dec 2020 23:41:46 +0000 (00:41 +0100)
committer GitHub <noreply@github.com>
Fri, 25 Dec 2020 23:41:46 +0000 (00:41 +0100)
diff --git a/Include/cpython/object.h b/Include/cpython/object.h

index 19c066b0ab78c777e0a245a68ec8dba793b925e7..86889f857689c4f2d0b7412a646de3583387ec67 100644 (file)
--- a/Include/cpython/object.h
+++ b/Include/cpython/object.h
@@ -35,12 +35,13 @@ PyAPI_FUNC(Py_ssize_t) _Py_GetRefTotal(void);
     _PyObject_{Get,Set,Has}AttrId are __getattr__ versions using _Py_Identifier*.
  */
  typedef struct _Py_Identifier {
-    struct _Py_Identifier *next;
      const char* string;
-    PyObject *object;
+    // Index in PyInterpreterState.unicode.ids.array. It is process-wide
+    // unique and must be initialized to -1.
+    Py_ssize_t index;
  } _Py_Identifier;
  
-#define _Py_static_string_init(value) { .next = NULL, .string = value, .object = NULL }
+#define _Py_static_string_init(value) { .string = value, .index = -1 }
  #define _Py_static_string(varname, value)  static _Py_Identifier varname = _Py_static_string_init(value)
  #define _Py_IDENTIFIER(varname) _Py_static_string(PyId_##varname, #varname)
  
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h

index 9ec5358b5e36e59f9a527465210104a125a4c2b6..8c61802545253d21dd9ca53105bbbb922ac2311b 100644 (file)
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -64,6 +64,11 @@ struct _Py_bytes_state {
      PyBytesObject *characters[256];
  };
  
+struct _Py_unicode_ids {
+    Py_ssize_t size;
+    PyObject **array;
+};
+
  struct _Py_unicode_state {
      // The empty Unicode object is a singleton to improve performance.
      PyObject *empty_string;
@@ -71,6 +76,8 @@ struct _Py_unicode_state {
         shared as well. */
      PyObject *latin1[256];
      struct _Py_unicode_fs_codec fs_codec;
+    // Unicode identifiers (_Py_Identifier): see _PyUnicode_FromId()
+    struct _Py_unicode_ids ids;
  };
  
  struct _Py_float_state {
diff --git a/Include/internal/pycore_runtime.h b/Include/internal/pycore_runtime.h

index 3a01d64e63d81169eee262b2ce3b94c07d57f305..8c54abbb0694a3dd0a932c171408abe016ff533e 100644 (file)
--- a/Include/internal/pycore_runtime.h
+++ b/Include/internal/pycore_runtime.h
@@ -49,6 +49,11 @@ typedef struct _Py_AuditHookEntry {
      void *userData;
  } _Py_AuditHookEntry;
  
+struct _Py_unicode_runtime_ids {
+    PyThread_type_lock lock;
+    Py_ssize_t next_index;
+};
+
  /* Full Python runtime state */
  
  typedef struct pyruntimestate {
@@ -106,6 +111,8 @@ typedef struct pyruntimestate {
      void *open_code_userdata;
      _Py_AuditHookEntry *audit_hook_head;
  
+    struct _Py_unicode_runtime_ids unicode_ids;
+
      // XXX Consolidate globals found via the check-c-globals script.
  } _PyRuntimeState;
  
diff --git a/Misc/NEWS.d/next/Core and Builtins/2020-05-13-18-50-27.bpo-39465.j7nl6A.rst b/Misc/NEWS.d/next/Core and Builtins/2020-05-13-18-50-27.bpo-39465.j7nl6A.rst

new file mode 100644 (file)

index 0000000..c29b5e1
--- /dev/null
+++ b/Misc/NEWS.d/next/Core and Builtins/2020-05-13-18-50-27.bpo-39465.j7nl6A.rst
@@ -0,0 +1,3 @@
+Make :c:func:`_PyUnicode_FromId` function compatible with subinterpreters.
+Each interpreter now has an array of identifier objects (interned strings
+decoded from UTF-8). Patch by Victor Stinner.
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 409355534a2ce180482a5a4df50bf6582503ffc1..3238d1e692a5175b12acf2707de085d69e9bdae2 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -41,6 +41,7 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  #define PY_SSIZE_T_CLEAN
  #include "Python.h"
  #include "pycore_abstract.h"      // _PyIndex_Check()
+#include "pycore_atomic_funcs.h"  // _Py_atomic_size_get()
  #include "pycore_bytes_methods.h" // _Py_bytes_lower()
  #include "pycore_format.h"        // F_LJUST
  #include "pycore_initconfig.h"    // _PyStatus_OK()
@@ -302,9 +303,6 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
                      _Py_error_handler error_handler, const char *errors,
                      Py_ssize_t *consumed);
  
-/* List of static strings. */
-static _Py_Identifier *static_strings = NULL;
-
  /* Fast detection of the most frequent whitespace characters */
  const unsigned char _Py_ascii_whitespace[] = {
      0, 0, 0, 0, 0, 0, 0, 0,
@@ -2312,42 +2310,85 @@ PyUnicode_FromString(const char *u)
      return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
  }
  
+
  PyObject *
  _PyUnicode_FromId(_Py_Identifier *id)
  {
-    if (id->object) {
-        return id->object;
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_unicode_ids *ids = &interp->unicode.ids;
+
+    int index = _Py_atomic_size_get(&id->index);
+    if (index < 0) {
+        struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
+
+        PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
+        // Check again to detect concurrent access. Another thread can have
+        // initialized the index while this thread waited for the lock.
+        index = _Py_atomic_size_get(&id->index);
+        if (index < 0) {
+            assert(rt_ids->next_index < PY_SSIZE_T_MAX);
+            index = rt_ids->next_index;
+            rt_ids->next_index++;
+            _Py_atomic_size_set(&id->index, index);
+        }
+        PyThread_release_lock(rt_ids->lock);
      }
+    assert(index >= 0);
  
      PyObject *obj;
-    obj = PyUnicode_DecodeUTF8Stateful(id->string,
-                                       strlen(id->string),
+    if (index < ids->size) {
+        obj = ids->array[index];
+        if (obj) {
+            // Return a borrowed reference
+            return obj;
+        }
+    }
+
+    obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
                                         NULL, NULL);
      if (!obj) {
          return NULL;
      }
      PyUnicode_InternInPlace(&obj);
  
-    assert(!id->next);
-    id->object = obj;
-    id->next = static_strings;
-    static_strings = id;
-    return id->object;
+    if (index >= ids->size) {
+        // Overallocate to reduce the number of realloc
+        Py_ssize_t new_size = Py_MAX(index * 2, 16);
+        Py_ssize_t item_size = sizeof(ids->array[0]);
+        PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
+        if (new_array == NULL) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
+        ids->array = new_array;
+        ids->size = new_size;
+    }
+
+    // The array stores a strong reference
+    ids->array[index] = obj;
+
+    // Return a borrowed reference
+    return obj;
  }
  
+
  static void
-unicode_clear_static_strings(void)
+unicode_clear_identifiers(PyThreadState *tstate)
  {
-    _Py_Identifier *tmp, *s = static_strings;
-    while (s) {
-        Py_CLEAR(s->object);
-        tmp = s->next;
-        s->next = NULL;
-        s = tmp;
+    PyInterpreterState *interp = _PyInterpreterState_GET();
+    struct _Py_unicode_ids *ids = &interp->unicode.ids;
+    for (Py_ssize_t i=0; i < ids->size; i++) {
+        Py_XDECREF(ids->array[i]);
      }
-    static_strings = NULL;
+    ids->size = 0;
+    PyMem_Free(ids->array);
+    ids->array = NULL;
+    // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
+    // after Py_Finalize().
  }
  
+
  /* Internal function, doesn't check maximum character */
  
  PyObject*
@@ -16238,9 +16279,7 @@ _PyUnicode_Fini(PyThreadState *tstate)
          Py_CLEAR(state->latin1[i]);
      }
  
-    if (_Py_IsMainInterpreter(tstate)) {
-        unicode_clear_static_strings();
-    }
+    unicode_clear_identifiers(tstate);
  
      _PyUnicode_FiniEncodings(&tstate->interp->unicode.fs_codec);
  }
diff --git a/Python/pystate.c b/Python/pystate.c

index ead25b08d7a83a6c4d9822283d1c871bebf36359..231144b082861f42b35f6bfa75f63e56e480af9e 100644 (file)
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -73,18 +73,24 @@ _PyRuntimeState_Init_impl(_PyRuntimeState *runtime)
  
      runtime->interpreters.mutex = PyThread_allocate_lock();
      if (runtime->interpreters.mutex == NULL) {
-        return _PyStatus_ERR("Can't initialize threads for interpreter");
+        return _PyStatus_NO_MEMORY();
      }
      runtime->interpreters.next_id = -1;
  
      runtime->xidregistry.mutex = PyThread_allocate_lock();
      if (runtime->xidregistry.mutex == NULL) {
-        return _PyStatus_ERR("Can't initialize threads for cross-interpreter data registry");
+        return _PyStatus_NO_MEMORY();
      }
  
      // Set it to the ID of the main thread of the main interpreter.
      runtime->main_thread = PyThread_get_thread_ident();
  
+    runtime->unicode_ids.lock = PyThread_allocate_lock();
+    if (runtime->unicode_ids.lock == NULL) {
+        return _PyStatus_NO_MEMORY();
+    }
+    runtime->unicode_ids.next_index = 0;
+
      return _PyStatus_OK();
  }
  
@@ -108,17 +114,17 @@ _PyRuntimeState_Fini(_PyRuntimeState *runtime)
      /* Force the allocator used by _PyRuntimeState_Init(). */
      PyMemAllocatorEx old_alloc;
      _PyMem_SetDefaultAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
-
-    if (runtime->interpreters.mutex != NULL) {
-        PyThread_free_lock(runtime->interpreters.mutex);
-        runtime->interpreters.mutex = NULL;
+#define FREE_LOCK(LOCK) \
+    if (LOCK != NULL) { \
+        PyThread_free_lock(LOCK); \
+        LOCK = NULL; \
      }
  
-    if (runtime->xidregistry.mutex != NULL) {
-        PyThread_free_lock(runtime->xidregistry.mutex);
-        runtime->xidregistry.mutex = NULL;
-    }
+    FREE_LOCK(runtime->interpreters.mutex);
+    FREE_LOCK(runtime->xidregistry.mutex);
+    FREE_LOCK(runtime->unicode_ids.lock);
  
+#undef FREE_LOCK
      PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
  }
  
@@ -139,12 +145,14 @@ _PyRuntimeState_ReInitThreads(_PyRuntimeState *runtime)
      int reinit_interp = _PyThread_at_fork_reinit(&runtime->interpreters.mutex);
      int reinit_main_id = _PyThread_at_fork_reinit(&runtime->interpreters.main->id_mutex);
      int reinit_xidregistry = _PyThread_at_fork_reinit(&runtime->xidregistry.mutex);
+    int reinit_unicode_ids = _PyThread_at_fork_reinit(&runtime->unicode_ids.lock);
  
      PyMem_SetAllocator(PYMEM_DOMAIN_RAW, &old_alloc);
  
      if (reinit_interp < 0
          || reinit_main_id < 0
-        || reinit_xidregistry < 0)
+        || reinit_xidregistry < 0
+        || reinit_unicode_ids < 0)
      {
          return _PyStatus_ERR("Failed to reinitialize runtime locks");
author	Victor Stinner <vstinner@python.org>
	Fri, 25 Dec 2020 23:41:46 +0000 (00:41 +0100)
committer	GitHub <noreply@github.com>
	Fri, 25 Dec 2020 23:41:46 +0000 (00:41 +0100)
Include/cpython/object.h		patch \| blob \| blame \| history
Include/internal/pycore_interp.h		patch \| blob \| blame \| history
Include/internal/pycore_runtime.h		patch \| blob \| blame \| history
Misc/NEWS.d/next/Core and Builtins/2020-05-13-18-50-27.bpo-39465.j7nl6A.rst	[new file with mode: 0644]	patch \| blob
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/pystate.c		patch \| blob \| blame \| history