We replace _PyRuntime.tstate_current with a thread-local variable. As part of this change, we add a _Py_thread_local macro in pyport.h (only for the core runtime) to smooth over compiler differences. The main motivation is support for a per-interpreter GIL, but the change also opens up some performance improvement opportunities.
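For illustration, here is a minimal standalone sketch of the access pattern this enables (the names current_tstate, set_current_tstate, and get_current_tstate are made up for the sketch, not part of the patch): each thread gets its own slot, so reading the current thread state becomes a plain load rather than an atomic load from a field in _PyRuntime.

```c
/* Minimal sketch only -- hypothetical names, not the actual patch.
   Assumes a C11 compiler with thread support (_Thread_local). */
#include <stdio.h>

/* Stand-in for the real (much larger) struct. */
typedef struct PyThreadState { int id; } PyThreadState;

/* One slot per OS thread; only the owning thread reads or writes it,
   so no synchronization is needed. */
static _Thread_local PyThreadState *current_tstate = NULL;

static void
set_current_tstate(PyThreadState *tstate)
{
    current_tstate = tstate;   /* plain store into this thread's slot */
}

static PyThreadState *
get_current_tstate(void)
{
    return current_tstate;     /* plain load; may be NULL */
}

int
main(void)
{
    static PyThreadState dummy = { .id = 1 };
    set_current_tstate(&dummy);
    printf("current tstate: %p (id=%d)\n",
           (void *)get_current_tstate(), get_current_tstate()->id);
    return 0;
}
```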
Note that we do not provide any fallback for the thread-local variable, neither to the old tstate_current nor to thread-specific storage (PyThread_tss_*()). If that proves problematic then we can circle back. I consider it unlikely, but will run the buildbots to double-check.
Also note that this does not change any of the code related to the GILState API, which keeps its thread state in thread-specific storage. I suspect we can combine that with _Py_tss_tstate (from here). However, that can be addressed separately and is not urgent (nor critical).
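For context, the GILState code binds a thread state to the current OS thread through autoTSSkey and the thread-specific storage API. A simplified sketch of that pattern (the helper names here are hypothetical; this is not the actual pystate.c code):

```c
/* Simplified sketch of the TSS pattern used for GILState lookups;
   hypothetical helper names, not the actual pystate.c implementation. */
#include "Python.h"

static Py_tss_t autoTSSkey = Py_tss_NEEDS_INIT;

static int
bind_tstate_to_os_thread(PyThreadState *tstate)
{
    /* Creating an already-created key is a no-op. */
    if (PyThread_tss_create(&autoTSSkey) != 0) {
        return -1;
    }
    /* Associate the thread state with the current OS thread. */
    return PyThread_tss_set(&autoTSSkey, (void *)tstate);
}

static PyThreadState *
lookup_tstate_for_os_thread(void)
{
    /* Returns NULL if no thread state has been bound for this thread. */
    return (PyThreadState *)PyThread_tss_get(&autoTSSkey);
}
```

Combining this key with _Py_tss_tstate would leave a single per-thread slot, which is the possible follow-up mentioned above.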
(While this change was mostly done independently, I did take some inspiration from earlier (~2020) work by @markshannon (main...markshannon:threadstate_in_tls) and @vstinner (#23976).)
/* Variable and macro for in-line access to current thread
and interpreter state */
-static inline PyThreadState*
-_PyRuntimeState_GetThreadState(_PyRuntimeState *runtime)
-{
- return (PyThreadState*)_Py_atomic_load_relaxed(&runtime->tstate_current);
-}
+#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE)
+extern _Py_thread_local PyThreadState *_Py_tss_tstate;
+#endif
+PyAPI_DATA(PyThreadState *) _PyThreadState_GetCurrent(void);
/* Get the current Python thread state.
- Efficient macro reading directly the 'tstate_current' atomic
- variable. The macro is unsafe: it does not check for error and it can
- return NULL.
+ This function is unsafe: it does not check for error and it can return NULL.
The caller must hold the GIL.
static inline PyThreadState*
_PyThreadState_GET(void)
{
- return _PyRuntimeState_GetThreadState(&_PyRuntime);
+#if defined(HAVE_THREAD_LOCAL) && !defined(Py_BUILD_CORE_MODULE)
+ return _Py_tss_tstate;
+#else
+ return _PyThreadState_GetCurrent();
+#endif
+}
+
+static inline PyThreadState*
+_PyRuntimeState_GetThreadState(_PyRuntimeState *Py_UNUSED(runtime))
+{
+ return _PyThreadState_GET();
}
+
static inline void
_Py_EnsureFuncTstateNotNULL(const char *func, PyThreadState *tstate)
{
unsigned long main_thread;
- /* Assuming the current thread holds the GIL, this is the
- PyThreadState for the current thread. */
- _Py_atomic_address tstate_current;
/* Used for the thread state bound to the current thread. */
Py_tss_t autoTSSkey;
# define WITH_THREAD
#endif
+#ifdef WITH_THREAD
+# ifdef Py_BUILD_CORE
+# ifdef HAVE_THREAD_LOCAL
+# error "HAVE_THREAD_LOCAL is already defined"
+# endif
+# define HAVE_THREAD_LOCAL 1
+# ifdef thread_local
+# define _Py_thread_local thread_local
+# elif __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+# define _Py_thread_local _Thread_local
+# elif defined(_MSC_VER) /* AKA NT_THREADS */
+# define _Py_thread_local __declspec(thread)
+# elif defined(__GNUC__) /* includes clang */
+# define _Py_thread_local __thread
+# else
+ // fall back to the PyThread_tss_*() API, or ignore.
+# undef HAVE_THREAD_LOCAL
+# endif
+# endif
+#endif
+
 /* Check that ALT_SOABI is consistent with Py_TRACE_REFS:
    ./configure --with-trace-refs must be used to define Py_TRACE_REFS */
#if defined(ALT_SOABI) && defined(Py_TRACE_REFS)
--- /dev/null
+We've replaced our use of ``_PyRuntime.tstate_current`` with a thread-local
+variable. This is a fairly low-level implementation detail, and there
+should be no change in behavior.
For each of these functions, the GIL must be held by the current thread.
*/
+
+#ifdef HAVE_THREAD_LOCAL
+_Py_thread_local PyThreadState *_Py_tss_tstate = NULL;
+#endif
+
static inline PyThreadState *
-current_fast_get(_PyRuntimeState *runtime)
+current_fast_get(_PyRuntimeState *Py_UNUSED(runtime))
{
- return (PyThreadState*)_Py_atomic_load_relaxed(&runtime->tstate_current);
+#ifdef HAVE_THREAD_LOCAL
+ return _Py_tss_tstate;
+#else
+ // XXX Fall back to the PyThread_tss_*() API.
+# error "no supported thread-local variable storage classifier"
+#endif
}
static inline void
-current_fast_set(_PyRuntimeState *runtime, PyThreadState *tstate)
+current_fast_set(_PyRuntimeState *Py_UNUSED(runtime), PyThreadState *tstate)
{
assert(tstate != NULL);
- _Py_atomic_store_relaxed(&runtime->tstate_current, (uintptr_t)tstate);
+#ifdef HAVE_THREAD_LOCAL
+ _Py_tss_tstate = tstate;
+#else
+ // XXX Fall back to the PyThread_tss_*() API.
+# error "no supported thread-local variable storage classifier"
+#endif
}
static inline void
-current_fast_clear(_PyRuntimeState *runtime)
+current_fast_clear(_PyRuntimeState *Py_UNUSED(runtime))
{
- _Py_atomic_store_relaxed(&runtime->tstate_current, (uintptr_t)NULL);
+#ifdef HAVE_THREAD_LOCAL
+ _Py_tss_tstate = NULL;
+#else
+ // XXX Fall back to the PyThread_tss_*() API.
+# error "no supported thread-local variable storage classifier"
+#endif
}
#define tstate_verify_not_active(tstate) \
_Py_FatalErrorFormat(__func__, "tstate %p is still current", tstate); \
}
+PyThreadState *
+_PyThreadState_GetCurrent(void)
+{
+ return current_fast_get(&_PyRuntime);
+}
+
//------------------------------------------------
// the thread state bound to the current OS thread