gh-116738: Make `_codecs` module thread-safe (#117530)

author Brett Simmers <swtaarrs@users.noreply.github.com>

Thu, 2 May 2024 22:25:36 +0000 (15:25 -0700)

committer GitHub <noreply@github.com>

Thu, 2 May 2024 22:25:36 +0000 (18:25 -0400)
author Brett Simmers <swtaarrs@users.noreply.github.com>
Thu, 2 May 2024 22:25:36 +0000 (15:25 -0700)
committer GitHub <noreply@github.com>
Thu, 2 May 2024 22:25:36 +0000 (18:25 -0400)
diff --git a/Include/internal/pycore_codecs.h b/Include/internal/pycore_codecs.h

index a2a7151d50ade7ce61932681aef806686de2fcfe..5e2d5c5ce9d868a2b87824e458028e2b65ad922b 100644 (file)
--- a/Include/internal/pycore_codecs.h
+++ b/Include/internal/pycore_codecs.h
@@ -8,6 +8,17 @@ extern "C" {
  #  error "this header requires Py_BUILD_CORE define"
  #endif
  
+#include "pycore_lock.h"          // PyMutex
+
+/* Initialize codecs-related state for the given interpreter, including
+   registering the first codec search function. Must be called before any other
+   PyCodec-related functions, and while only one thread is active. */
+extern PyStatus _PyCodec_InitRegistry(PyInterpreterState *interp);
+
+/* Finalize codecs-related state for the given interpreter. No PyCodec-related
+   functions other than PyCodec_Unregister() may be called after this. */
+extern void _PyCodec_Fini(PyInterpreterState *interp);
+
  extern PyObject* _PyCodec_Lookup(const char *encoding);
  
  /* Text codec specific encoding and decoding API.
@@ -48,6 +59,26 @@ extern PyObject* _PyCodecInfo_GetIncrementalEncoder(
     PyObject *codec_info,
     const char *errors);
  
+// Per-interpreter state used by codecs.c.
+struct codecs_state {
+    // A list of callable objects used to search for codecs.
+    PyObject *search_path;
+
+    // A dict mapping codec names to codecs returned from a callable in
+    // search_path.
+    PyObject *search_cache;
+
+    // A dict mapping error handling strategies to functions to implement them.
+    PyObject *error_registry;
+
+#ifdef Py_GIL_DISABLED
+    // Used to safely delete a specific item from search_path.
+    PyMutex search_path_mutex;
+#endif
+
+    // Whether or not the rest of the state is initialized.
+    int initialized;
+};
  
  #ifdef __cplusplus
  }
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h

index d38959e3ce4ec526529fd3d49fa177c0d382f718..a2aec6dd4b7241196763073d270411a3d8328d73 100644 (file)
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -14,6 +14,7 @@ extern "C" {
  #include "pycore_atexit.h"        // struct atexit_state
  #include "pycore_ceval_state.h"   // struct _ceval_state
  #include "pycore_code.h"          // struct callable_cache
+#include "pycore_codecs.h"        // struct codecs_state
  #include "pycore_context.h"       // struct _Py_context_state
  #include "pycore_crossinterp.h"   // struct _xidregistry
  #include "pycore_dict_state.h"    // struct _Py_dict_state
@@ -182,10 +183,7 @@ struct _is {
       possible to facilitate out-of-process observability
       tools. */
  
-    PyObject *codec_search_path;
-    PyObject *codec_search_cache;
-    PyObject *codec_error_registry;
-    int codecs_initialized;
+    struct codecs_state codecs;
  
      PyConfig config;
      unsigned long feature_flags;
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 2c259b7e869efeb02c07e2024bc0383b6429e3c2..67b1282de790e09a42bfc9825c824e30dbf269d5 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -15441,7 +15441,11 @@ init_fs_encoding(PyThreadState *tstate)
  PyStatus
  _PyUnicode_InitEncodings(PyThreadState *tstate)
  {
-    PyStatus status = init_fs_encoding(tstate);
+    PyStatus status = _PyCodec_InitRegistry(tstate->interp);
+    if (_PyStatus_EXCEPTION(status)) {
+        return status;
+    }
+    status = init_fs_encoding(tstate);
      if (_PyStatus_EXCEPTION(status)) {
          return status;
      }
diff --git a/Python/codecs.c b/Python/codecs.c

index d8fe7b22063a809cfb44ca99c4977f0d1e6c4a6f..bed245366f923429c6e68535f4485b44e729bf71 100644 (file)
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -11,6 +11,7 @@ Copyright (c) Corporation for National Research Initiatives.
  #include "Python.h"
  #include "pycore_call.h"          // _PyObject_CallNoArgs()
  #include "pycore_interp.h"        // PyInterpreterState.codec_search_path
+#include "pycore_lock.h"          // PyMutex
  #include "pycore_pyerrors.h"      // _PyErr_FormatNote()
  #include "pycore_pystate.h"       // _PyInterpreterState_GET()
  #include "pycore_ucnhash.h"       // _PyUnicode_Name_CAPI
@@ -19,24 +20,10 @@ const char *Py_hexdigits = "0123456789abcdef";
  
  /* --- Codec Registry ----------------------------------------------------- */
  
-/* Import the standard encodings package which will register the first
-   codec search function.
-
-   This is done in a lazy way so that the Unicode implementation does
-   not downgrade startup time of scripts not needing it.
-
-   ImportErrors are silently ignored by this function. Only one try is
-   made.
-
-*/
-
-static int _PyCodecRegistry_Init(void); /* Forward */
-
  int PyCodec_Register(PyObject *search_function)
  {
      PyInterpreterState *interp = _PyInterpreterState_GET();
-    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
-        goto onError;
+    assert(interp->codecs.initialized);
      if (search_function == NULL) {
          PyErr_BadArgument();
          goto onError;
@@ -45,7 +32,14 @@ int PyCodec_Register(PyObject *search_function)
          PyErr_SetString(PyExc_TypeError, "argument must be callable");
          goto onError;
      }
-    return PyList_Append(interp->codec_search_path, search_function);
+#ifdef Py_GIL_DISABLED
+    PyMutex_Lock(&interp->codecs.search_path_mutex);
+#endif
+    int ret = PyList_Append(interp->codecs.search_path, search_function);
+#ifdef Py_GIL_DISABLED
+    PyMutex_Unlock(&interp->codecs.search_path_mutex);
+#endif
+    return ret;
  
   onError:
      return -1;
@@ -55,22 +49,34 @@ int
  PyCodec_Unregister(PyObject *search_function)
  {
      PyInterpreterState *interp = _PyInterpreterState_GET();
-    PyObject *codec_search_path = interp->codec_search_path;
-    /* Do nothing if codec_search_path is not created yet or was cleared. */
-    if (codec_search_path == NULL) {
+    if (interp->codecs.initialized != 1) {
+        /* Do nothing if codecs state was cleared (only possible during
+           interpreter shutdown). */
          return 0;
      }
  
+    PyObject *codec_search_path = interp->codecs.search_path;
      assert(PyList_CheckExact(codec_search_path));
-    Py_ssize_t n = PyList_GET_SIZE(codec_search_path);
-    for (Py_ssize_t i = 0; i < n; i++) {
-        PyObject *item = PyList_GET_ITEM(codec_search_path, i);
+    for (Py_ssize_t i = 0; i < PyList_GET_SIZE(codec_search_path); i++) {
+#ifdef Py_GIL_DISABLED
+        PyMutex_Lock(&interp->codecs.search_path_mutex);
+#endif
+        PyObject *item = PyList_GetItemRef(codec_search_path, i);
+        int ret = 1;
          if (item == search_function) {
-            if (interp->codec_search_cache != NULL) {
-                assert(PyDict_CheckExact(interp->codec_search_cache));
-                PyDict_Clear(interp->codec_search_cache);
-            }
-            return PyList_SetSlice(codec_search_path, i, i+1, NULL);
+            // We hold a reference to the item, so its destructor can't run
+            // while we hold search_path_mutex.
+            ret = PyList_SetSlice(codec_search_path, i, i+1, NULL);
+        }
+#ifdef Py_GIL_DISABLED
+        PyMutex_Unlock(&interp->codecs.search_path_mutex);
+#endif
+        Py_DECREF(item);
+        if (ret != 1) {
+            assert(interp->codecs.search_cache != NULL);
+            assert(PyDict_CheckExact(interp->codecs.search_cache));
+            PyDict_Clear(interp->codecs.search_cache);
+            return ret;
          }
      }
      return 0;
@@ -132,9 +138,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
      }
  
      PyInterpreterState *interp = _PyInterpreterState_GET();
-    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init()) {
-        return NULL;
-    }
+    assert(interp->codecs.initialized);
  
      /* Convert the encoding to a normalized Python string: all
         characters are converted to lower case, spaces and hyphens are
@@ -147,7 +151,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
  
      /* First, try to lookup the name in the registry dictionary */
      PyObject *result;
-    if (PyDict_GetItemRef(interp->codec_search_cache, v, &result) < 0) {
+    if (PyDict_GetItemRef(interp->codecs.search_cache, v, &result) < 0) {
          goto onError;
      }
      if (result != NULL) {
@@ -156,7 +160,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
      }
  
      /* Next, scan the search functions in order of registration */
-    const Py_ssize_t len = PyList_Size(interp->codec_search_path);
+    const Py_ssize_t len = PyList_Size(interp->codecs.search_path);
      if (len < 0)
          goto onError;
      if (len == 0) {
@@ -170,14 +174,15 @@ PyObject *_PyCodec_Lookup(const char *encoding)
      for (i = 0; i < len; i++) {
          PyObject *func;
  
-        func = PyList_GetItem(interp->codec_search_path, i);
+        func = PyList_GetItemRef(interp->codecs.search_path, i);
          if (func == NULL)
              goto onError;
          result = PyObject_CallOneArg(func, v);
+        Py_DECREF(func);
          if (result == NULL)
              goto onError;
          if (result == Py_None) {
-            Py_DECREF(result);
+            Py_CLEAR(result);
              continue;
          }
          if (!PyTuple_Check(result) || PyTuple_GET_SIZE(result) != 4) {
@@ -188,7 +193,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
          }
          break;
      }
-    if (i == len) {
+    if (result == NULL) {
          /* XXX Perhaps we should cache misses too ? */
          PyErr_Format(PyExc_LookupError,
                       "unknown encoding: %s", encoding);
@@ -196,7 +201,7 @@ PyObject *_PyCodec_Lookup(const char *encoding)
      }
  
      /* Cache and return the result */
-    if (PyDict_SetItem(interp->codec_search_cache, v, result) < 0) {
+    if (PyDict_SetItem(interp->codecs.search_cache, v, result) < 0) {
          Py_DECREF(result);
          goto onError;
      }
@@ -600,13 +605,12 @@ PyObject *_PyCodec_DecodeText(PyObject *object,
  int PyCodec_RegisterError(const char *name, PyObject *error)
  {
      PyInterpreterState *interp = _PyInterpreterState_GET();
-    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
-        return -1;
+    assert(interp->codecs.initialized);
      if (!PyCallable_Check(error)) {
          PyErr_SetString(PyExc_TypeError, "handler must be callable");
          return -1;
      }
-    return PyDict_SetItemString(interp->codec_error_registry,
+    return PyDict_SetItemString(interp->codecs.error_registry,
                                  name, error);
  }
  
@@ -616,13 +620,12 @@ int PyCodec_RegisterError(const char *name, PyObject *error)
  PyObject *PyCodec_LookupError(const char *name)
  {
      PyInterpreterState *interp = _PyInterpreterState_GET();
-    if (interp->codec_search_path == NULL && _PyCodecRegistry_Init())
-        return NULL;
+    assert(interp->codecs.initialized);
  
      if (name==NULL)
          name = "strict";
      PyObject *handler;
-    if (PyDict_GetItemStringRef(interp->codec_error_registry, name, &handler) < 0) {
+    if (PyDict_GetItemStringRef(interp->codecs.error_registry, name, &handler) < 0) {
          return NULL;
      }
      if (handler == NULL) {
@@ -1375,7 +1378,8 @@ static PyObject *surrogateescape_errors(PyObject *self, PyObject *exc)
      return PyCodec_SurrogateEscapeErrors(exc);
  }
  
-static int _PyCodecRegistry_Init(void)
+PyStatus
+_PyCodec_InitRegistry(PyInterpreterState *interp)
  {
      static struct {
          const char *name;
@@ -1463,45 +1467,51 @@ static int _PyCodecRegistry_Init(void)
          }
      };
  
-    PyInterpreterState *interp = _PyInterpreterState_GET();
-    PyObject *mod;
-
-    if (interp->codec_search_path != NULL)
-        return 0;
-
-    interp->codec_search_path = PyList_New(0);
-    if (interp->codec_search_path == NULL) {
-        return -1;
+    assert(interp->codecs.initialized == 0);
+    interp->codecs.search_path = PyList_New(0);
+    if (interp->codecs.search_path == NULL) {
+        return PyStatus_NoMemory();
      }
-
-    interp->codec_search_cache = PyDict_New();
-    if (interp->codec_search_cache == NULL) {
-        return -1;
+    interp->codecs.search_cache = PyDict_New();
+    if (interp->codecs.search_cache == NULL) {
+        return PyStatus_NoMemory();
      }
-
-    interp->codec_error_registry = PyDict_New();
-    if (interp->codec_error_registry == NULL) {
-        return -1;
+    interp->codecs.error_registry = PyDict_New();
+    if (interp->codecs.error_registry == NULL) {
+        return PyStatus_NoMemory();
      }
-
      for (size_t i = 0; i < Py_ARRAY_LENGTH(methods); ++i) {
          PyObject *func = PyCFunction_NewEx(&methods[i].def, NULL, NULL);
-        if (!func) {
-            return -1;
+        if (func == NULL) {
+            return PyStatus_NoMemory();
          }
  
-        int res = PyCodec_RegisterError(methods[i].name, func);
+        int res = PyDict_SetItemString(interp->codecs.error_registry,
+                                       methods[i].name, func);
          Py_DECREF(func);
-        if (res) {
-            return -1;
+        if (res < 0) {
+            return PyStatus_Error("Failed to insert into codec error registry");
          }
      }
  
-    mod = PyImport_ImportModule("encodings");
+    interp->codecs.initialized = 1;
+
+    // Importing `encodings' will call back into this module to register codec
+    // search functions, so this is done after everything else is initialized.
+    PyObject *mod = PyImport_ImportModule("encodings");
      if (mod == NULL) {
-        return -1;
+        return PyStatus_Error("Failed to import encodings module");
      }
      Py_DECREF(mod);
-    interp->codecs_initialized = 1;
-    return 0;
+
+    return PyStatus_Ok();
+}
+
+void
+_PyCodec_Fini(PyInterpreterState *interp)
+{
+    Py_CLEAR(interp->codecs.search_path);
+    Py_CLEAR(interp->codecs.search_cache);
+    Py_CLEAR(interp->codecs.error_registry);
+    interp->codecs.initialized = 0;
  }
diff --git a/Python/pystate.c b/Python/pystate.c

index 3d6e76e88bd73109fc0fe4941bb76016711df61e..f442d87ba3150e20271902dd0f65a782441a495d 100644 (file)
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -843,9 +843,7 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate)
      }
  
      PyConfig_Clear(&interp->config);
-    Py_CLEAR(interp->codec_search_path);
-    Py_CLEAR(interp->codec_search_cache);
-    Py_CLEAR(interp->codec_error_registry);
+    _PyCodec_Fini(interp);
  
      assert(interp->imports.modules == NULL);
      assert(interp->imports.modules_by_index == NULL);
diff --git a/Tools/c-analyzer/cpython/ignored.tsv b/Tools/c-analyzer/cpython/ignored.tsv

index 87b695de23e25e2ce6acc2cdfe9a9b0b8047b1b8..c4dbdcfe80bb4a4ecd946bb9c386eae1c27680a3 100644 (file)
--- a/Tools/c-analyzer/cpython/ignored.tsv
+++ b/Tools/c-analyzer/cpython/ignored.tsv
@@ -344,7 +344,7 @@ Python/ceval.c      -       _PyEval_BinaryOps       -
  Python/ceval.c -       _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS -
  Python/codecs.c        -       Py_hexdigits    -
  Python/codecs.c        -       ucnhash_capi    -
-Python/codecs.c        _PyCodecRegistry_Init   methods -
+Python/codecs.c        _PyCodec_InitRegistry   methods -
  Python/compile.c       -       NO_LABEL        -
  Python/compile.c       -       NO_LOCATION     -
  Python/dynload_shlib.c -       _PyImport_DynLoadFiletab        -
author	Brett Simmers <swtaarrs@users.noreply.github.com>
	Thu, 2 May 2024 22:25:36 +0000 (15:25 -0700)
committer	GitHub <noreply@github.com>
	Thu, 2 May 2024 22:25:36 +0000 (18:25 -0400)
Include/internal/pycore_codecs.h		patch \| blob \| blame \| history
Include/internal/pycore_interp.h		patch \| blob \| blame \| history
Objects/unicodeobject.c		patch \| blob \| blame \| history
Python/codecs.c		patch \| blob \| blame \| history
Python/pystate.c		patch \| blob \| blame \| history
Tools/c-analyzer/cpython/ignored.tsv		patch \| blob \| blame \| history