From: Hai Zhu <35182391+cocolato@users.noreply.github.com>
Date: Thu, 8 Jan 2026 19:38:21 +0000 (+0800)
Subject: gh-143421: Move `JitOptContext` from stack allocation to per-thread heap allocation...
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=aeb34035633b24afb27d3888fdd12abdf2bdc339;p=thirdparty%2FPython%2Fcpython.git

gh-143421: Move `JitOptContext` from stack allocation to per-thread heap allocation (GH-143536)

* move JitOptContext to _PyThreadStateImpl

* make _PyUOpInstruction buffer a part of _PyThreadStateImpl

Co-authored-by: Kumar Aditya
---
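The numbers behind the change: JitOptContext embeds a locals_and_stack array of
MAX_ABSTRACT_INTERP_SIZE (4096) JitOptRef entries plus a ty_arena of TY_ARENA_SIZE
JitOptSymbol slots, so the previous stack-allocated instance consumed at least
32 KiB of C stack per optimization pass on a 64-bit build (4096 * 8 bytes for
locals_and_stack alone, before counting the symbol arena and the abstract-frame
array). Embedding the context in _PyThreadStateImpl gives every thread one
persistent copy inside its already heap-allocated state, and once the tracer state
is reached through the thread state anyway, code_buffer can become a fixed
UOP_MAX_TRACE_LENGTH array instead of a lazily _PyObject_VirtualAlloc'ed block,
dropping the matching _PyObject_VirtualFree in tstate_delete_common(). The sketch
below shows only the allocation pattern; Ref, Context, and ThreadState are
simplified stand-ins, not the real CPython declarations:

    #include <stdint.h>
    #include <stdlib.h>

    /* Simplified stand-ins: the real JitOptContext also carries the symbol
       arena, the abstract-frame array, and assorted flags. */
    typedef struct { uintptr_t bits; } Ref;

    typedef struct {
        Ref locals_and_stack[4096];      /* mirrors MAX_ABSTRACT_INTERP_SIZE */
    } Context;                           /* >= 32 KiB with 8-byte pointers */

    typedef struct {
        /* ... other per-thread fields ... */
        Context opt_context;             /* lives inside the heap-allocated thread state */
    } ThreadState;

    /* Old pattern: a fresh Context on the C stack for every optimization pass. */
    static void optimize_stack_allocated(void)
    {
        Context context;                 /* large C-stack frame */
        Context *ctx = &context;
        (void)ctx;                       /* ... run the abstract interpreter ... */
    }

    /* New pattern: reuse the per-thread instance. */
    static void optimize_per_thread(ThreadState *ts)
    {
        Context *ctx = &ts->opt_context;
        (void)ctx;                       /* ... run the abstract interpreter ... */
    }

    int main(void)
    {
        ThreadState *ts = malloc(sizeof(*ts));   /* thread states are heap objects */
        if (ts == NULL) {
            return 1;
        }
        optimize_stack_allocated();
        optimize_per_thread(ts);
        free(ts);
        return 0;
    }
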
diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index 6a0fc1a59e79..d1d22c77507c 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -12,6 +12,7 @@ extern "C" {
 #include "pycore_uop.h"           // _PyUOpInstruction
 #include "pycore_uop_ids.h"
 #include "pycore_stackref.h"      // _PyStackRef
+#include "pycore_optimizer_types.h"
 
 #include <stdbool.h>
 
@@ -84,7 +85,7 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp);
 #define JIT_CLEANUP_THRESHOLD 1000
 
 int _Py_uop_analyze_and_optimize(
-    PyFunctionObject *func,
+    _PyThreadStateImpl *tstate,
     _PyUOpInstruction *trace,
     int trace_len, int curr_stackentries,
     _PyBloomFilter *dependencies);
@@ -112,86 +113,6 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst)
     return inst->error_target;
 }
 
-// Holds locals, stack, locals, stack ... co_consts (in that order)
-#define MAX_ABSTRACT_INTERP_SIZE 4096
-
-#define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5)
-
-// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
-#define MAX_ABSTRACT_FRAME_DEPTH (16)
-
-// The maximum number of side exits that we can take before requiring forward
-// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
-// is the "maximum amount of polymorphism" that an isolated trace tree can
-// handle before rejoining the rest of the program.
-#define MAX_CHAIN_DEPTH 4
-
-/* Symbols */
-/* See explanation in optimizer_symbols.c */
-
-
-typedef enum _JitSymType {
-    JIT_SYM_UNKNOWN_TAG = 1,
-    JIT_SYM_NULL_TAG = 2,
-    JIT_SYM_NON_NULL_TAG = 3,
-    JIT_SYM_BOTTOM_TAG = 4,
-    JIT_SYM_TYPE_VERSION_TAG = 5,
-    JIT_SYM_KNOWN_CLASS_TAG = 6,
-    JIT_SYM_KNOWN_VALUE_TAG = 7,
-    JIT_SYM_TUPLE_TAG = 8,
-    JIT_SYM_TRUTHINESS_TAG = 9,
-    JIT_SYM_COMPACT_INT = 10,
-} JitSymType;
-
-typedef struct _jit_opt_known_class {
-    uint8_t tag;
-    uint32_t version;
-    PyTypeObject *type;
-} JitOptKnownClass;
-
-typedef struct _jit_opt_known_version {
-    uint8_t tag;
-    uint32_t version;
-} JitOptKnownVersion;
-
-typedef struct _jit_opt_known_value {
-    uint8_t tag;
-    PyObject *value;
-} JitOptKnownValue;
-
-#define MAX_SYMBOLIC_TUPLE_SIZE 7
-
-typedef struct _jit_opt_tuple {
-    uint8_t tag;
-    uint8_t length;
-    uint16_t items[MAX_SYMBOLIC_TUPLE_SIZE];
-} JitOptTuple;
-
-typedef struct {
-    uint8_t tag;
-    bool invert;
-    uint16_t value;
-} JitOptTruthiness;
-
-typedef struct {
-    uint8_t tag;
-} JitOptCompactInt;
-
-typedef union _jit_opt_symbol {
-    uint8_t tag;
-    JitOptKnownClass cls;
-    JitOptKnownValue value;
-    JitOptKnownVersion version;
-    JitOptTuple tuple;
-    JitOptTruthiness truthiness;
-    JitOptCompactInt compact;
-} JitOptSymbol;
-
-
-// This mimics the _PyStackRef API
-typedef union {
-    uintptr_t bits;
-} JitOptRef;
 
 #define REF_IS_BORROWED 1
 
@@ -238,48 +159,6 @@ PyJitRef_IsBorrowed(JitOptRef ref)
     return (ref.bits & REF_IS_BORROWED) == REF_IS_BORROWED;
 }
 
-struct _Py_UOpsAbstractFrame {
-    bool globals_watched;
-    // The version number of the globals dicts, once checked. 0 if unchecked.
-    uint32_t globals_checked_version;
-    // Max stacklen
-    int stack_len;
-    int locals_len;
-    PyFunctionObject *func;
-    PyCodeObject *code;
-
-    JitOptRef *stack_pointer;
-    JitOptRef *stack;
-    JitOptRef *locals;
-};
-
-typedef struct _Py_UOpsAbstractFrame _Py_UOpsAbstractFrame;
-
-typedef struct ty_arena {
-    int ty_curr_number;
-    int ty_max_number;
-    JitOptSymbol arena[TY_ARENA_SIZE];
-} ty_arena;
-
-typedef struct _JitOptContext {
-    char done;
-    char out_of_space;
-    bool contradiction;
-    // Has the builtins dict been watched?
-    bool builtins_watched;
-    // The current "executing" frame.
-    _Py_UOpsAbstractFrame *frame;
-    _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
-    int curr_frame_depth;
-
-    // Arena for the symbolic types.
-    ty_arena t_arena;
-
-    JitOptRef *n_consumed;
-    JitOptRef *limit;
-    JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
-} JitOptContext;
-
 extern bool _Py_uop_sym_is_null(JitOptRef sym);
 extern bool _Py_uop_sym_is_not_null(JitOptRef sym);
 extern bool _Py_uop_sym_is_const(JitOptContext *ctx, JitOptRef sym);
diff --git a/Include/internal/pycore_optimizer_types.h b/Include/internal/pycore_optimizer_types.h
new file mode 100644
index 000000000000..de8e50921e33
--- /dev/null
+++ b/Include/internal/pycore_optimizer_types.h
@@ -0,0 +1,137 @@
+#ifndef Py_INTERNAL_OPTIMIZER_TYPES_H
+#define Py_INTERNAL_OPTIMIZER_TYPES_H
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef Py_BUILD_CORE
+#  error "this header requires Py_BUILD_CORE define"
+#endif
+
+#include "pycore_uop.h"       // UOP_MAX_TRACE_LENGTH
+
+// Holds locals, stack, locals, stack ... co_consts (in that order)
+#define MAX_ABSTRACT_INTERP_SIZE 4096
+
+#define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5)
+
+// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
+#define MAX_ABSTRACT_FRAME_DEPTH (16)
+
+// The maximum number of side exits that we can take before requiring forward
+// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
+// is the "maximum amount of polymorphism" that an isolated trace tree can
+// handle before rejoining the rest of the program.
+#define MAX_CHAIN_DEPTH 4
+
+/* Symbols */
+/* See explanation in optimizer_symbols.c */
+
+
+typedef enum _JitSymType {
+    JIT_SYM_UNKNOWN_TAG = 1,
+    JIT_SYM_NULL_TAG = 2,
+    JIT_SYM_NON_NULL_TAG = 3,
+    JIT_SYM_BOTTOM_TAG = 4,
+    JIT_SYM_TYPE_VERSION_TAG = 5,
+    JIT_SYM_KNOWN_CLASS_TAG = 6,
+    JIT_SYM_KNOWN_VALUE_TAG = 7,
+    JIT_SYM_TUPLE_TAG = 8,
+    JIT_SYM_TRUTHINESS_TAG = 9,
+    JIT_SYM_COMPACT_INT = 10,
+} JitSymType;
+
+typedef struct _jit_opt_known_class {
+    uint8_t tag;
+    uint32_t version;
+    PyTypeObject *type;
+} JitOptKnownClass;
+
+typedef struct _jit_opt_known_version {
+    uint8_t tag;
+    uint32_t version;
+} JitOptKnownVersion;
+
+typedef struct _jit_opt_known_value {
+    uint8_t tag;
+    PyObject *value;
+} JitOptKnownValue;
+
+#define MAX_SYMBOLIC_TUPLE_SIZE 7
+
+typedef struct _jit_opt_tuple {
+    uint8_t tag;
+    uint8_t length;
+    uint16_t items[MAX_SYMBOLIC_TUPLE_SIZE];
+} JitOptTuple;
+
+typedef struct {
+    uint8_t tag;
+    bool invert;
+    uint16_t value;
+} JitOptTruthiness;
+
+typedef struct {
+    uint8_t tag;
+} JitOptCompactInt;
+
+typedef union _jit_opt_symbol {
+    uint8_t tag;
+    JitOptKnownClass cls;
+    JitOptKnownValue value;
+    JitOptKnownVersion version;
+    JitOptTuple tuple;
+    JitOptTruthiness truthiness;
+    JitOptCompactInt compact;
+} JitOptSymbol;
+
+// This mimics the _PyStackRef API
+typedef union {
+    uintptr_t bits;
+} JitOptRef;
+
+typedef struct _Py_UOpsAbstractFrame {
+    bool globals_watched;
+    // The version number of the globals dicts, once checked. 0 if unchecked.
+    uint32_t globals_checked_version;
+    // Max stacklen
+    int stack_len;
+    int locals_len;
+    PyFunctionObject *func;
+    PyCodeObject *code;
+
+    JitOptRef *stack_pointer;
+    JitOptRef *stack;
+    JitOptRef *locals;
+} _Py_UOpsAbstractFrame;
+
+typedef struct ty_arena {
+    int ty_curr_number;
+    int ty_max_number;
+    JitOptSymbol arena[TY_ARENA_SIZE];
+} ty_arena;
+
+typedef struct _JitOptContext {
+    char done;
+    char out_of_space;
+    bool contradiction;
+    // Has the builtins dict been watched?
+    bool builtins_watched;
+    // The current "executing" frame.
+    _Py_UOpsAbstractFrame *frame;
+    _Py_UOpsAbstractFrame frames[MAX_ABSTRACT_FRAME_DEPTH];
+    int curr_frame_depth;
+
+    // Arena for the symbolic types.
+    ty_arena t_arena;
+
+    JitOptRef *n_consumed;
+    JitOptRef *limit;
+    JitOptRef locals_and_stack[MAX_ABSTRACT_INTERP_SIZE];
+} JitOptContext;
+
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* !Py_INTERNAL_OPTIMIZER_TYPES_H */
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index d8f4bfef98af..81cabb4dca47 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -12,6 +12,7 @@ extern "C" {
 #include "pycore_freelist_state.h"      // struct _Py_freelists
 #include "pycore_interpframe_structs.h" // _PyInterpreterFrame
 #include "pycore_mimalloc.h"            // struct _mimalloc_thread_state
+#include "pycore_optimizer_types.h"     // JitOptContext
 #include "pycore_qsbr.h"                // struct qsbr
 #include "pycore_uop.h"                 // struct _PyUOpInstruction
 #include "pycore_structs.h"
@@ -52,10 +53,11 @@ typedef struct _PyJitTracerTranslatorState {
 } _PyJitTracerTranslatorState;
 
 typedef struct _PyJitTracerState {
-    _PyUOpInstruction *code_buffer;
     _PyJitTracerInitialState initial_state;
     _PyJitTracerPreviousState prev_state;
     _PyJitTracerTranslatorState translator_state;
+    JitOptContext opt_context;
+    _PyUOpInstruction code_buffer[UOP_MAX_TRACE_LENGTH];
 } _PyJitTracerState;
 
 #endif
diff --git a/Python/optimizer.c b/Python/optimizer.c
index d32fae2e489a..73617f6ca264 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -1025,13 +1025,6 @@ _PyJit_TryInitializeTracing(
     if (oparg > 0xFFFF) {
         return 0;
     }
-    if (_tstate->jit_tracer_state.code_buffer == NULL) {
-        _tstate->jit_tracer_state.code_buffer = (_PyUOpInstruction *)_PyObject_VirtualAlloc(UOP_BUFFER_SIZE);
-        if (_tstate->jit_tracer_state.code_buffer == NULL) {
-            // Don't error, just go to next instruction.
-            return 0;
-        }
-    }
     PyObject *func = PyStackRef_AsPyObjectBorrow(frame->f_funcobj);
     if (func == NULL) {
         return 0;
@@ -1484,8 +1477,8 @@ uop_optimize(
     OPT_STAT_INC(traces_created);
     if (!is_noopt) {
         length = _Py_uop_analyze_and_optimize(
-            _tstate->jit_tracer_state.initial_state.func,
-            buffer,length,
+            _tstate,
+            buffer, length,
             curr_stackentries, dependencies);
         if (length <= 0) {
             return length;
diff --git a/Python/optimizer_analysis.c b/Python/optimizer_analysis.c
index 29a088e43c2a..56d4f9945d69 100644
--- a/Python/optimizer_analysis.c
+++ b/Python/optimizer_analysis.c
@@ -18,6 +18,7 @@
 #include "pycore_opcode_metadata.h"
 #include "pycore_opcode_utils.h"
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
+#include "pycore_tstate.h"        // _PyThreadStateImpl
 #include "pycore_uop_metadata.h"
 #include "pycore_long.h"
 #include "pycore_interpframe.h"   // _PyFrame_GetCode
@@ -334,7 +335,7 @@ _Py_opt_assert_within_stack_bounds(
 /* >0 (length) for success, 0 for not ready, clears all possible errors. */
 static int
 optimize_uops(
-    PyFunctionObject *func,
+    _PyThreadStateImpl *tstate,
     _PyUOpInstruction *trace,
     int trace_len,
     int curr_stacklen,
@@ -342,9 +343,9 @@ optimize_uops(
 )
 {
     assert(!PyErr_Occurred());
+    PyFunctionObject *func = tstate->jit_tracer_state.initial_state.func;
 
-    JitOptContext context;
-    JitOptContext *ctx = &context;
+    JitOptContext *ctx = &tstate->jit_tracer_state.opt_context;
     uint32_t opcode = UINT16_MAX;
 
     // Make sure that watchers are set up
@@ -574,7 +575,7 @@ remove_unneeded_uops(_PyUOpInstruction *buffer, int buffer_size)
 // > 0 - length of optimized trace
 int
 _Py_uop_analyze_and_optimize(
-    PyFunctionObject *func,
+    _PyThreadStateImpl *tstate,
     _PyUOpInstruction *buffer,
     int length,
     int curr_stacklen,
@@ -584,7 +585,7 @@ _Py_uop_analyze_and_optimize(
     OPT_STAT_INC(optimizer_attempts);
 
     length = optimize_uops(
-        func, buffer,
+        tstate, buffer,
         length, curr_stacklen, dependencies);
 
     if (length == 0) {
diff --git a/Python/pystate.c b/Python/pystate.c
index 23853f697924..74507efa5b4c 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -1553,7 +1553,6 @@ init_threadstate(_PyThreadStateImpl *_tstate,
     init_policy(&_tstate->policy.jit.side_exit_initial_backoff,
                 "PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF",
                 SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF);
-    _tstate->jit_tracer_state.code_buffer = NULL;
 #endif
 
     tstate->delete_later = NULL;
@@ -1868,14 +1867,6 @@ tstate_delete_common(PyThreadState *tstate, int release_gil)
     assert(tstate_impl->refcounts.values == NULL);
 #endif
 
-#if _Py_TIER2
-    _PyThreadStateImpl *_tstate = (_PyThreadStateImpl *)tstate;
-    if (_tstate->jit_tracer_state.code_buffer != NULL) {
-        _PyObject_VirtualFree(_tstate->jit_tracer_state.code_buffer, UOP_BUFFER_SIZE);
-        _tstate->jit_tracer_state.code_buffer = NULL;
-    }
-#endif
-
     HEAD_UNLOCK(runtime);
 
     // XXX Unbind in PyThreadState_Clear(), or earlier