]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-115103: Implement delayed memory reclamation (QSBR) (#115180)
authorSam Gross <colesbury@gmail.com>
Fri, 16 Feb 2024 20:25:19 +0000 (15:25 -0500)
committerGitHub <noreply@github.com>
Fri, 16 Feb 2024 20:25:19 +0000 (15:25 -0500)
This adds a safe memory reclamation scheme based on FreeBSD's "GUS" and
quiescent state based reclamation (QSBR). The API provides a mechanism
for callers to detect when it is safe to free memory that may be
concurrently accessed by readers.

18 files changed:
Doc/license.rst
Include/cpython/pyatomic.h
Include/cpython/pyatomic_gcc.h
Include/cpython/pyatomic_msc.h
Include/cpython/pyatomic_std.h
Include/internal/pycore_interp.h
Include/internal/pycore_qsbr.h [new file with mode: 0644]
Include/internal/pycore_runtime_init.h
Include/internal/pycore_tstate.h
Makefile.pre.in
Modules/posixmodule.c
PCbuild/_freeze_module.vcxproj
PCbuild/_freeze_module.vcxproj.filters
PCbuild/pythoncore.vcxproj
PCbuild/pythoncore.vcxproj.filters
Python/ceval_macros.h
Python/pystate.c
Python/qsbr.c [new file with mode: 0644]

index 9fc0ff7161a5914628bb86eebb372b7d8a67135a..cbe918bd1acfe30ed4403d25fc5248d50b46f416 100644 (file)
@@ -1095,3 +1095,35 @@ which is distributed under the MIT license::
   LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
   OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
   WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+Global Unbounded Sequences (GUS)
+--------------------------------
+
+The file :file:`Python/qsbr.c` is adapted from FreeBSD's "Global Unbounded
+Sequences" safe memory reclamation scheme in
+`subr_smr.c <https://github.com/freebsd/freebsd-src/blob/main/sys/kern/subr_smr.c>`_.
+The file is distributed under the 2-Clause BSD License::
+
+  Copyright (c) 2019,2020 Jeffrey Roberson <jeff@FreeBSD.org>
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+  1. Redistributions of source code must retain the above copyright
+     notice unmodified, this list of conditions, and the following
+     disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+     notice, this list of conditions and the following disclaimer in the
+     documentation and/or other materials provided with the distribution.
+
+  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
index 9b5774190ee99e9da0649391d303b04b5f501aec..737eed8b12dd817a5b42149dbf84ab4b2fcb334d 100644 (file)
@@ -475,6 +475,12 @@ _Py_atomic_store_int_release(int *obj, int value);
 static inline int
 _Py_atomic_load_int_acquire(const int *obj);
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value);
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj);
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj);
 
index bc74149f73e83cf996534dc10c004ac4f16aca1b..de23edfc6877d23e4a26ca67e7e701dad065e55e 100644 (file)
@@ -504,6 +504,14 @@ static inline int
 _Py_atomic_load_int_acquire(const int *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{ __atomic_store_n(obj, value, __ATOMIC_RELEASE); }
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{ return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 { return __atomic_load_n(obj, __ATOMIC_ACQUIRE); }
index 6ab6401cf81e8acf6a8a2503d7ab78522ce2ca19..9809d9806d7b574d1cd53a81859967b04aac816c 100644 (file)
@@ -952,13 +952,39 @@ _Py_atomic_load_int_acquire(const int *obj)
 #endif
 }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    *(uint64_t volatile *)obj = value;
+#elif defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(unsigned __int64);
+    __stlr64((unsigned __int64 volatile *)obj, (unsigned __int64)value);
+#else
+#  error "no implementation of _Py_atomic_store_uint64_release"
+#endif
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+#if defined(_M_X64) || defined(_M_IX86)
+    return *(uint64_t volatile *)obj;
+#elif defined(_M_ARM64)
+    _Py_atomic_ASSERT_ARG_TYPE(__int64);
+    return (uint64_t)__ldar64((unsigned __int64 volatile *)obj);
+#else
+#  error "no implementation of _Py_atomic_load_uint64_acquire"
+#endif
+}
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
 #if defined(_M_X64) || defined(_M_IX86)
     return *(uint32_t volatile *)obj;
 #elif defined(_M_ARM64)
-    return (int)__ldar32((uint32_t volatile *)obj);
+    return (uint32_t)__ldar32((uint32_t volatile *)obj);
 #else
 #  error "no implementation of _Py_atomic_load_uint32_acquire"
 #endif
index d3004dbd24ed09a90d32dc4d949f846a783ec4b2..f5bd73a8a49e31b0644bb54df07087cd922457c8 100644 (file)
@@ -887,6 +887,22 @@ _Py_atomic_load_int_acquire(const int *obj)
                                 memory_order_acquire);
 }
 
+static inline void
+_Py_atomic_store_uint64_release(uint64_t *obj, uint64_t value)
+{
+    _Py_USING_STD;
+    atomic_store_explicit((_Atomic(uint64_t)*)obj, value,
+                          memory_order_release);
+}
+
+static inline uint64_t
+_Py_atomic_load_uint64_acquire(const uint64_t *obj)
+{
+    _Py_USING_STD;
+    return atomic_load_explicit((const _Atomic(uint64_t)*)obj,
+                                memory_order_acquire);
+}
+
 static inline uint32_t
 _Py_atomic_load_uint32_acquire(const uint32_t *obj)
 {
index c07447183d6209060d4a92a9130830839be251cf..567d6a9bd510ab4f63558ed7c2fddbc99e839ce7 100644 (file)
@@ -30,6 +30,7 @@ extern "C" {
 #include "pycore_mimalloc.h"      // struct _mimalloc_interp_state
 #include "pycore_object_state.h"  // struct _py_object_state
 #include "pycore_obmalloc.h"      // struct _obmalloc_state
+#include "pycore_qsbr.h"          // struct _qsbr_state
 #include "pycore_tstate.h"        // _PyThreadStateImpl
 #include "pycore_tuple.h"         // struct _Py_tuple_state
 #include "pycore_typeobject.h"    // struct types_state
@@ -197,6 +198,7 @@ struct _is {
     struct _warnings_runtime_state warnings;
     struct atexit_state atexit;
     struct _stoptheworld_state stoptheworld;
+    struct _qsbr_shared qsbr;
 
 #if defined(Py_GIL_DISABLED)
     struct _mimalloc_interp_state mimalloc;
diff --git a/Include/internal/pycore_qsbr.h b/Include/internal/pycore_qsbr.h
new file mode 100644 (file)
index 0000000..475f00d
--- /dev/null
@@ -0,0 +1,139 @@
+// The QSBR APIs (quiescent state-based reclamation) provide a mechanism for
+// the free-threaded build to safely reclaim memory when there may be
+// concurrent accesses.
+//
+// Many operations in the free-threaded build are protected by locks. However,
+// in some cases, we want to allow reads to happen concurrently with updates.
+// In this case, we need to delay freeing ("reclaiming") any memory that may be
+// concurrently accessed by a reader. The QSBR APIs provide a way to do this.
+#ifndef Py_INTERNAL_QSBR_H
+#define Py_INTERNAL_QSBR_H
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "pycore_lock.h"        // PyMutex
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+// The shared write sequence is always odd and incremented by two. Detached
+// threads are indicated by a read sequence of zero. This avoids collisions
+// between the offline state and any valid sequence number even if the
+// sequence numbers wrap around.
+#define QSBR_OFFLINE 0
+#define QSBR_INITIAL 1
+#define QSBR_INCR    2
+
+struct _qsbr_shared;
+struct _PyThreadStateImpl;  // forward declare to avoid circular dependency
+
+// Per-thread state
+struct _qsbr_thread_state {
+    // Last observed write sequence (or 0 if detached)
+    uint64_t seq;
+
+    // Shared (per-interpreter) QSBR state
+    struct _qsbr_shared *shared;
+
+    // Thread state (or NULL)
+    PyThreadState *tstate;
+
+    // Used to defer advancing write sequence a fixed number of times
+    int deferrals;
+
+    // Is this thread state allocated?
+    bool allocated;
+    struct _qsbr_thread_state *freelist_next;
+};
+
+// Padding to avoid false sharing
+struct _qsbr_pad {
+    struct _qsbr_thread_state qsbr;
+    char __padding[64 - sizeof(struct _qsbr_thread_state)];
+};
+
+// Per-interpreter state
+struct _qsbr_shared {
+    // Write sequence: always odd, incremented by two
+    uint64_t wr_seq;
+
+    // Minimum observed read sequence of all QSBR thread states
+    uint64_t rd_seq;
+
+    // Array of QSBR thread states.
+    struct _qsbr_pad *array;
+    Py_ssize_t size;
+
+    // Freelist of unused _qsbr_thread_states (protected by mutex)
+    PyMutex mutex;
+    struct _qsbr_thread_state *freelist;
+};
+
+static inline uint64_t
+_Py_qsbr_shared_current(struct _qsbr_shared *shared)
+{
+    return _Py_atomic_load_uint64_acquire(&shared->wr_seq);
+}
+
+// Reports a quiescent state: the caller no longer holds any pointer to shared
+// data not protected by locks or reference counts.
+static inline void
+_Py_qsbr_quiescent_state(struct _qsbr_thread_state *qsbr)
+{
+    uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
+    _Py_atomic_store_uint64_release(&qsbr->seq, seq);
+}
+
+// Advance the write sequence and return the new goal. This should be called
+// after data is removed. The returned goal is used with `_Py_qsbr_poll()` to
+// determine when it is safe to reclaim (free) the memory.
+extern uint64_t
+_Py_qsbr_advance(struct _qsbr_shared *shared);
+
+// Batches requests to advance the write sequence. This advances the write
+// sequence every N calls, which reduces overhead but increases time to
+// reclamation. Returns the new goal.
+extern uint64_t
+_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr);
+
+// Have the read sequences advanced to the given goal? If this returns true,
+// it is safe to reclaim any memory tagged with the goal (or earlier goal).
+extern bool
+_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal);
+
+// Called when thread attaches to interpreter
+extern void
+_Py_qsbr_attach(struct _qsbr_thread_state *qsbr);
+
+// Called when thread detaches from interpreter
+extern void
+_Py_qsbr_detach(struct _qsbr_thread_state *qsbr);
+
+// Reserves (allocates) a QSBR state and returns its index.
+extern Py_ssize_t
+_Py_qsbr_reserve(PyInterpreterState *interp);
+
+// Associates a PyThreadState with the QSBR state at the given index
+extern void
+_Py_qsbr_register(struct _PyThreadStateImpl *tstate,
+                  PyInterpreterState *interp, Py_ssize_t index);
+
+// Disassociates a PyThreadState from the QSBR state and frees the QSBR state.
+extern void
+_Py_qsbr_unregister(struct _PyThreadStateImpl *tstate);
+
+extern void
+_Py_qsbr_fini(PyInterpreterState *interp);
+
+extern void
+_Py_qsbr_after_fork(struct _PyThreadStateImpl *tstate);
+
+#ifdef __cplusplus
+}
+#endif
+#endif   /* !Py_INTERNAL_QSBR_H */
index 7a05c105d7bf1285a8402ebb0a833c4bf43064dc..be81604d653814c56917ad9cfa58d38ce1d919f0 100644 (file)
@@ -17,6 +17,7 @@ extern "C" {
 #include "pycore_pyhash.h"        // pyhash_state_INIT
 #include "pycore_pymem_init.h"    // _pymem_allocators_standard_INIT
 #include "pycore_pythread.h"      // _pythread_RUNTIME_INIT
+#include "pycore_qsbr.h"          // QSBR_INITIAL
 #include "pycore_runtime_init_generated.h"  // _Py_bytes_characters_INIT
 #include "pycore_signal.h"        // _signals_RUNTIME_INIT
 #include "pycore_tracemalloc.h"   // _tracemalloc_runtime_state_INIT
@@ -169,6 +170,10 @@ extern PyTypeObject _PyExc_MemoryError;
                 { .threshold = 10, }, \
             }, \
         }, \
+        .qsbr = { \
+            .wr_seq = QSBR_INITIAL, \
+            .rd_seq = QSBR_INITIAL, \
+        }, \
         .dtoa = _dtoa_state_INIT(&(INTERP)), \
         .dict_state = _dict_state_INIT, \
         .func_state = { \
index 7fb9ab2056704e10843f79cc42fb90ae4978aeb4..d0f980ed49ee3e236779d1a608a8c786d886bd79 100644 (file)
@@ -8,9 +8,10 @@ extern "C" {
 #  error "this header requires Py_BUILD_CORE define"
 #endif
 
+#include "pycore_brc.h"           // struct _brc_thread_state
 #include "pycore_freelist.h"      // struct _Py_freelist_state
 #include "pycore_mimalloc.h"      // struct _mimalloc_thread_state
-#include "pycore_brc.h"           // struct _brc_thread_state
+#include "pycore_qsbr.h"          // struct _qsbr_thread_state
 
 
 static inline void
@@ -27,6 +28,8 @@ typedef struct _PyThreadStateImpl {
     // semi-public fields are in PyThreadState.
     PyThreadState base;
 
+    struct _qsbr_thread_state *qsbr;  // only used by free-threaded build
+
 #ifdef Py_GIL_DISABLED
     struct _gc_thread_state gc;
     struct _mimalloc_thread_state mimalloc;
index 8252e6631c5af58fd428610463d1570ec09e1230..66c4266b2f8f97be759ef67df20889e6e3569a14 100644 (file)
@@ -458,6 +458,7 @@ PYTHON_OBJS=        \
                Python/pystate.o \
                Python/pythonrun.o \
                Python/pytime.o \
+               Python/qsbr.o \
                Python/bootstrap_hash.o \
                Python/specialize.o \
                Python/structmember.o \
@@ -1162,6 +1163,7 @@ PYTHON_HEADERS= \
                $(srcdir)/Include/internal/pycore_pystats.h \
                $(srcdir)/Include/internal/pycore_pythonrun.h \
                $(srcdir)/Include/internal/pycore_pythread.h \
+               $(srcdir)/Include/internal/pycore_qsbr.h \
                $(srcdir)/Include/internal/pycore_range.h \
                $(srcdir)/Include/internal/pycore_runtime.h \
                $(srcdir)/Include/internal/pycore_runtime_init.h \
index 958b5a5e6e240663ddbdd385ae8fae0add77dfa3..9d9c9bd76b7fff6b8d8035db9f8a359f6078e7cb 100644 (file)
@@ -645,6 +645,7 @@ PyOS_AfterFork_Child(void)
 
 #ifdef Py_GIL_DISABLED
     _Py_brc_after_fork(tstate->interp);
+    _Py_qsbr_after_fork((_PyThreadStateImpl *)tstate);
 #endif
 
     status = _PyEval_ReInitThreads(tstate);
index 49f529ebbc2f9b053191d8fb122008b789bfbbe0..00ad3e2472af04cf07f1bd008dea53b02aa166c5 100644 (file)
     <ClCompile Include="..\Python\pythonrun.c" />
     <ClCompile Include="..\Python\Python-tokenize.c" />
     <ClCompile Include="..\Python\pytime.c" />
+    <ClCompile Include="..\Python\qsbr.c" />
     <ClCompile Include="..\Python\specialize.c" />
     <ClCompile Include="..\Python\structmember.c" />
     <ClCompile Include="..\Python\suggestions.c" />
index 5b1bd7552b4cd9851cce352410802b2cf3d1a929..aea5f730607658e0624602e3a5a342fad556354c 100644 (file)
     <ClCompile Include="..\Python\pytime.c">
       <Filter>Source Files</Filter>
     </ClCompile>
+    <ClCompile Include="..\Python\qsbr.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
     <ClCompile Include="..\Objects\rangeobject.c">
       <Filter>Source Files</Filter>
     </ClCompile>
index abfafbb2a32f45c7b67a1cc6c597274273a6bfd7..c7b698f0e17a396acb2634ee825e3092da42daa1 100644 (file)
     <ClInclude Include="..\Include\internal\pycore_pystats.h" />
     <ClInclude Include="..\Include\internal\pycore_pythonrun.h" />
     <ClInclude Include="..\Include\internal\pycore_pythread.h" />
+    <ClInclude Include="..\Include\internal\pycore_qsbr.h" />
     <ClInclude Include="..\Include\internal\pycore_range.h" />
     <ClInclude Include="..\Include\internal\pycore_runtime.h" />
     <ClInclude Include="..\Include\internal\pycore_runtime_init.h" />
     <ClCompile Include="..\Python\pystrcmp.c" />
     <ClCompile Include="..\Python\pystrhex.c" />
     <ClCompile Include="..\Python\pystrtod.c" />
+    <ClCompile Include="..\Python\qsbr.c" />
     <ClCompile Include="..\Python\dtoa.c" />
     <ClCompile Include="..\Python\Python-ast.c" />
     <ClCompile Include="..\Python\Python-tokenize.c" />
index d14f5a6d7fb0fca06c303c8e7d1308c7e2bfcff3..ffe93dc787a8b8d993a13874d2a2c0dce0f9012a 100644 (file)
     <ClInclude Include="..\Include\internal\pycore_pythread.h">
       <Filter>Include\internal</Filter>
     </ClInclude>
+    <ClInclude Include="..\Include\internal\pycore_qsbr.h">
+      <Filter>Include\internal</Filter>
+    </ClInclude>
     <ClInclude Include="..\Include\internal\pycore_range.h">
       <Filter>Include\internal</Filter>
     </ClInclude>
     <ClCompile Include="..\Python\pystrtod.c">
       <Filter>Python</Filter>
     </ClCompile>
+    <ClCompile Include="..\Python\qsbr.c">
+      <Filter>Python</Filter>
+    </ClCompile>
     <ClCompile Include="..\Python\dtoa.c">
       <Filter>Python</Filter>
     </ClCompile>
index c2550f53ad6eaaa101429ed086e0e822ec63c6fd..1043966c9a82771224b028e02abc9b46f8e59aa3 100644 (file)
 #define PRE_DISPATCH_GOTO() ((void)0)
 #endif
 
+#ifdef Py_GIL_DISABLED
+#define QSBR_QUIESCENT_STATE(tstate) _Py_qsbr_quiescent_state(((_PyThreadStateImpl *)tstate)->qsbr)
+#else
+#define QSBR_QUIESCENT_STATE(tstate)
+#endif
+
 
 /* Do interpreter dispatch accounting for tracing and instrumentation */
 #define DISPATCH() \
 
 #define CHECK_EVAL_BREAKER() \
     _Py_CHECK_EMSCRIPTEN_SIGNALS_PERIODICALLY(); \
+    QSBR_QUIESCENT_STATE(tstate); \
     if (_Py_atomic_load_uintptr_relaxed(&tstate->interp->ceval.eval_breaker) & _PY_EVAL_EVENTS_MASK) { \
         if (_Py_HandlePending(tstate) != 0) { \
             GOTO_ERROR(error); \
index 24f9b7790915ab5d4f61957b75701af573b8fda9..c2ccc276449d4f36d7f456e2b7fbbe6da4d0b2b4 100644 (file)
@@ -953,6 +953,8 @@ PyInterpreterState_Delete(PyInterpreterState *interp)
         PyThread_free_lock(interp->id_mutex);
     }
 
+    _Py_qsbr_fini(interp);
+
     _PyObject_FiniState(interp);
 
     free_interpreter(interp);
@@ -1386,6 +1388,14 @@ new_threadstate(PyInterpreterState *interp, int whence)
     if (new_tstate == NULL) {
         return NULL;
     }
+#ifdef Py_GIL_DISABLED
+    Py_ssize_t qsbr_idx = _Py_qsbr_reserve(interp);
+    if (qsbr_idx < 0) {
+        PyMem_RawFree(new_tstate);
+        return NULL;
+    }
+#endif
+
     /* We serialize concurrent creation to protect global state. */
     HEAD_LOCK(runtime);
 
@@ -1420,6 +1430,12 @@ new_threadstate(PyInterpreterState *interp, int whence)
         // Must be called with lock unlocked to avoid re-entrancy deadlock.
         PyMem_RawFree(new_tstate);
     }
+
+#ifdef Py_GIL_DISABLED
+    // Must be called with lock unlocked to avoid lock ordering deadlocks.
+    _Py_qsbr_register(tstate, interp, qsbr_idx);
+#endif
+
     return (PyThreadState *)tstate;
 }
 
@@ -1611,6 +1627,10 @@ tstate_delete_common(PyThreadState *tstate)
     }
     HEAD_UNLOCK(runtime);
 
+#ifdef Py_GIL_DISABLED
+    _Py_qsbr_unregister((_PyThreadStateImpl *)tstate);
+#endif
+
     // XXX Unbind in PyThreadState_Clear(), or earlier
     // (and assert not-equal here)?
     if (tstate->_status.bound_gilstate) {
@@ -1652,6 +1672,9 @@ void
 _PyThreadState_DeleteCurrent(PyThreadState *tstate)
 {
     _Py_EnsureTstateNotNULL(tstate);
+#ifdef Py_GIL_DISABLED
+    _Py_qsbr_detach(((_PyThreadStateImpl *)tstate)->qsbr);
+#endif
     tstate_set_detached(tstate);
     tstate_delete_common(tstate);
     current_fast_clear(tstate->interp->runtime);
@@ -1873,6 +1896,10 @@ _PyThreadState_Attach(PyThreadState *tstate)
         tstate_wait_attach(tstate);
     }
 
+#ifdef Py_GIL_DISABLED
+    _Py_qsbr_attach(((_PyThreadStateImpl *)tstate)->qsbr);
+#endif
+
     // Resume previous critical section. This acquires the lock(s) from the
     // top-most critical section.
     if (tstate->critical_section != 0) {
@@ -1893,6 +1920,9 @@ detach_thread(PyThreadState *tstate, int detached_state)
     if (tstate->critical_section != 0) {
         _PyCriticalSection_SuspendAll(tstate);
     }
+#ifdef Py_GIL_DISABLED
+    _Py_qsbr_detach(((_PyThreadStateImpl *)tstate)->qsbr);
+#endif
     tstate_deactivate(tstate);
     tstate_set_detached(tstate);
     current_fast_clear(&_PyRuntime);
diff --git a/Python/qsbr.c b/Python/qsbr.c
new file mode 100644 (file)
index 0000000..7f7ae03
--- /dev/null
@@ -0,0 +1,286 @@
+/*
+ * Implementation of safe memory reclamation scheme using
+ * quiescent states.
+ *
+ * This is derived from the "GUS" safe memory reclamation technique
+ * in FreeBSD written by Jeffrey Roberson. It is heavily modified. Any bugs
+ * in this code are likely due to the modifications.
+ *
+ * The original copyright is preserved below.
+ *
+ * Copyright (c) 2019,2020 Jeffrey Roberson <jeff@FreeBSD.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice unmodified, this list of conditions, and the following
+ *    disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "Python.h"
+#include "pycore_initconfig.h"      // _PyStatus_NO_MEMORY()
+#include "pycore_lock.h"            // PyMutex_Lock()
+#include "pycore_qsbr.h"
+#include "pycore_pystate.h"         // _PyThreadState_GET()
+
+
+// Wrap-around safe comparison. This is a holdover from the FreeBSD
+// implementation, which uses 32-bit sequence numbers. We currently use 64-bit
+// sequence numbers, so wrap-around is unlikely.
+#define QSBR_LT(a, b) ((int64_t)((a)-(b)) < 0)
+#define QSBR_LEQ(a, b) ((int64_t)((a)-(b)) <= 0)
+
+// Starting size of the array of qsbr thread states
+#define MIN_ARRAY_SIZE 8
+
+// For _Py_qsbr_deferred_advance(): the number of deferrals before advancing
+// the write sequence.
+#define QSBR_DEFERRED_LIMIT 10
+
+// Allocate a QSBR thread state from the freelist
+static struct _qsbr_thread_state *
+qsbr_allocate(struct _qsbr_shared *shared)
+{
+    struct _qsbr_thread_state *qsbr = shared->freelist;
+    if (qsbr == NULL) {
+        return NULL;
+    }
+    shared->freelist = qsbr->freelist_next;
+    qsbr->freelist_next = NULL;
+    qsbr->shared = shared;
+    qsbr->allocated = true;
+    return qsbr;
+}
+
+// Initialize (or reinitialize) the freelist of QSBR thread states
+static void
+initialize_new_array(struct _qsbr_shared *shared)
+{
+    for (Py_ssize_t i = 0; i != shared->size; i++) {
+        struct _qsbr_thread_state *qsbr = &shared->array[i].qsbr;
+        if (qsbr->tstate != NULL) {
+            // Update the thread state pointer to its QSBR state
+            _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)qsbr->tstate;
+            tstate->qsbr = qsbr;
+        }
+        if (!qsbr->allocated) {
+            // Push to freelist
+            qsbr->freelist_next = shared->freelist;
+            shared->freelist = qsbr;
+        }
+    }
+}
+
+// Grow the array of QSBR thread states. Returns 0 on success, -1 on failure.
+static int
+grow_thread_array(struct _qsbr_shared *shared)
+{
+    Py_ssize_t new_size = shared->size * 2;
+    if (new_size < MIN_ARRAY_SIZE) {
+        new_size = MIN_ARRAY_SIZE;
+    }
+
+    struct _qsbr_pad *array = PyMem_RawCalloc(new_size, sizeof(*array));
+    if (array == NULL) {
+        return -1;
+    }
+
+    struct _qsbr_pad *old = shared->array;
+    if (old != NULL) {
+        memcpy(array, shared->array, shared->size * sizeof(*array));
+    }
+
+    shared->array = array;
+    shared->size = new_size;
+    shared->freelist = NULL;
+    initialize_new_array(shared);
+
+    PyMem_RawFree(old);
+    return 0;
+}
+
+uint64_t
+_Py_qsbr_advance(struct _qsbr_shared *shared)
+{
+    // NOTE: with 64-bit sequence numbers, we don't have to worry too much
+    // about the wr_seq getting too far ahead of rd_seq, but if we ever use
+    // 32-bit sequence numbers, we'll need to be more careful.
+    return _Py_atomic_add_uint64(&shared->wr_seq, QSBR_INCR) + QSBR_INCR;
+}
+
+uint64_t
+_Py_qsbr_deferred_advance(struct _qsbr_thread_state *qsbr)
+{
+    if (++qsbr->deferrals < QSBR_DEFERRED_LIMIT) {
+        return _Py_qsbr_shared_current(qsbr->shared) + QSBR_INCR;
+    }
+    qsbr->deferrals = 0;
+    return _Py_qsbr_advance(qsbr->shared);
+}
+
+static uint64_t
+qsbr_poll_scan(struct _qsbr_shared *shared)
+{
+    // Synchronize with store in _Py_qsbr_attach(). We need to ensure that
+    // the reads from each thread's sequence number are not reordered to see
+    // earlier "offline" states.
+    _Py_atomic_fence_seq_cst();
+
+    // Compute the minimum sequence number of all attached threads
+    uint64_t min_seq = _Py_atomic_load_uint64(&shared->wr_seq);
+    struct _qsbr_pad *array = shared->array;
+    for (Py_ssize_t i = 0, size = shared->size; i != size; i++) {
+        struct _qsbr_thread_state *qsbr = &array[i].qsbr;
+
+        uint64_t seq = _Py_atomic_load_uint64(&qsbr->seq);
+        if (seq != QSBR_OFFLINE && QSBR_LT(seq, min_seq)) {
+            min_seq = seq;
+        }
+    }
+
+    // Update the shared read sequence
+    uint64_t rd_seq = _Py_atomic_load_uint64(&shared->rd_seq);
+    if (QSBR_LT(rd_seq, min_seq)) {
+        // It's okay if the compare-exchange failed: another thread updated it
+        (void)_Py_atomic_compare_exchange_uint64(&shared->rd_seq, &rd_seq, min_seq);
+        rd_seq = min_seq;
+    }
+
+    return rd_seq;
+}
+
+bool
+_Py_qsbr_poll(struct _qsbr_thread_state *qsbr, uint64_t goal)
+{
+    assert(_PyThreadState_GET()->state == _Py_THREAD_ATTACHED);
+
+    uint64_t rd_seq = _Py_atomic_load_uint64(&qsbr->shared->rd_seq);
+    if (QSBR_LEQ(goal, rd_seq)) {
+        return true;
+    }
+
+    rd_seq = qsbr_poll_scan(qsbr->shared);
+    return QSBR_LEQ(goal, rd_seq);
+}
+
+void
+_Py_qsbr_attach(struct _qsbr_thread_state *qsbr)
+{
+    assert(qsbr->seq == 0 && "already attached");
+
+    uint64_t seq = _Py_qsbr_shared_current(qsbr->shared);
+    _Py_atomic_store_uint64(&qsbr->seq, seq);  // needs seq_cst
+}
+
+void
+_Py_qsbr_detach(struct _qsbr_thread_state *qsbr)
+{
+    assert(qsbr->seq != 0 && "already detached");
+
+    _Py_atomic_store_uint64_release(&qsbr->seq, QSBR_OFFLINE);
+}
+
+Py_ssize_t
+_Py_qsbr_reserve(PyInterpreterState *interp)
+{
+    struct _qsbr_shared *shared = &interp->qsbr;
+
+    PyMutex_Lock(&shared->mutex);
+    // Try allocating from our internal freelist
+    struct _qsbr_thread_state *qsbr = qsbr_allocate(shared);
+
+    // If there are no free entries, we pause all threads, grow the array,
+    // and update the pointers in PyThreadState to entries in the new array.
+    if (qsbr == NULL) {
+        _PyEval_StopTheWorld(interp);
+        if (grow_thread_array(shared) == 0) {
+            qsbr = qsbr_allocate(shared);
+        }
+        _PyEval_StartTheWorld(interp);
+    }
+    PyMutex_Unlock(&shared->mutex);
+
+    if (qsbr == NULL) {
+        return -1;
+    }
+
+    // Return an index rather than the pointer because the array may be
+    // resized and the pointer invalidated.
+    return (struct _qsbr_pad *)qsbr - shared->array;
+}
+
+void
+_Py_qsbr_register(_PyThreadStateImpl *tstate, PyInterpreterState *interp,
+                  Py_ssize_t index)
+{
+    // Associate the QSBR state with the thread state
+    struct _qsbr_shared *shared = &interp->qsbr;
+
+    PyMutex_Lock(&shared->mutex);
+    struct _qsbr_thread_state *qsbr = &interp->qsbr.array[index].qsbr;
+    assert(qsbr->allocated && qsbr->tstate == NULL);
+    qsbr->tstate = (PyThreadState *)tstate;
+    tstate->qsbr = qsbr;
+    PyMutex_Unlock(&shared->mutex);
+}
+
+void
+_Py_qsbr_unregister(_PyThreadStateImpl *tstate)
+{
+    struct _qsbr_thread_state *qsbr = tstate->qsbr;
+    struct _qsbr_shared *shared = qsbr->shared;
+
+    assert(qsbr->seq == 0 && "thread state must be detached");
+
+    PyMutex_Lock(&shared->mutex);
+    assert(qsbr->allocated && qsbr->tstate == (PyThreadState *)tstate);
+    tstate->qsbr = NULL;
+    qsbr->tstate = NULL;
+    qsbr->allocated = false;
+    qsbr->freelist_next = shared->freelist;
+    shared->freelist = qsbr;
+    PyMutex_Unlock(&shared->mutex);
+}
+
+void
+_Py_qsbr_fini(PyInterpreterState *interp)
+{
+    struct _qsbr_shared *shared = &interp->qsbr;
+    PyMem_RawFree(shared->array);
+    shared->array = NULL;
+    shared->size = 0;
+    shared->freelist = NULL;
+}
+
+void
+_Py_qsbr_after_fork(_PyThreadStateImpl *tstate)
+{
+    struct _qsbr_thread_state *this_qsbr = tstate->qsbr;
+    struct _qsbr_shared *shared = this_qsbr->shared;
+
+    _PyMutex_at_fork_reinit(&shared->mutex);
+
+    for (Py_ssize_t i = 0; i != shared->size; i++) {
+        struct _qsbr_thread_state *qsbr = &shared->array[i].qsbr;
+        if (qsbr != this_qsbr && qsbr->allocated) {
+            qsbr->tstate = NULL;
+            qsbr->allocated = false;
+            qsbr->freelist_next = shared->freelist;
+            shared->freelist = qsbr;
+        }
+    }
+}