From 7e2bc1d53befeb98616d7f9df34a51d1b3eb02ba Mon Sep 17 00:00:00 2001 From: Ken Jin <28750310+Fidget-Spinner@users.noreply.github.com> Date: Sat, 8 Nov 2025 16:49:31 +0000 Subject: [PATCH] Some fixups --- Include/internal/pycore_optimizer.h | 4 +--- Python/optimizer.c | 6 ++++-- Python/optimizer_bytecodes.c | 15 ++++++++++++++- Python/optimizer_cases.c.h | 14 +++++++++++++- 4 files changed, 32 insertions(+), 7 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index eea5608621e9..e4219ebc86ae 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -88,8 +88,6 @@ PyAPI_FUNC(void) _Py_Executors_InvalidateCold(PyInterpreterState *interp); // This value is arbitrary and was not optimized. #define JIT_CLEANUP_THRESHOLD 1000 -#define TRACE_STACK_SIZE 5 - int _Py_uop_analyze_and_optimize( PyFunctionObject *func, _PyUOpInstruction *trace, int trace_len, int curr_stackentries, @@ -125,7 +123,7 @@ static inline uint16_t uop_get_error_target(const _PyUOpInstruction *inst) #define TY_ARENA_SIZE (UOP_MAX_TRACE_LENGTH * 5) // Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH()) -#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2) +#define MAX_ABSTRACT_FRAME_DEPTH (16) // The maximum number of side exits that we can take before requiring forward // progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this diff --git a/Python/optimizer.c b/Python/optimizer.c index e73753ca8305..5a7de32f4607 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -616,7 +616,7 @@ _PyJit_translate_single_bytecode_to_trace( bool needs_guard_ip = OPCODE_HAS_NEEDS_GUARD_IP(opcode); if (has_dynamic_jump_taken && !needs_guard_ip) { - DPRINTF(2, "Unsupported: dynamic jump taken\n"); + DPRINTF(2, "Unsupported: dynamic jump taken %s\n", _PyOpcode_OpName[opcode]); goto unsupported; } DPRINTF(2, "%p %d: %s(%d) %d %d\n", old_code, target, _PyOpcode_OpName[opcode], oparg, needs_guard_ip, old_stack_level); @@ -749,6 +749,8 @@ _PyJit_translate_single_bytecode_to_trace( if ((next_instr != _tstate->jit_state.initial_state.close_loop_instr) && (next_instr != _tstate->jit_state.initial_state.start_instr) && _tstate->jit_state.prev_state.code_curr_size > 5 && + // For side exits, we don't want to terminate them early. + _tstate->jit_state.initial_state.exit == NULL && // These are coroutines, and we want to unroll those usually. opcode != JUMP_BACKWARD_NO_INTERRUPT) { // We encountered a JUMP_BACKWARD but not to the top of our own loop. @@ -867,7 +869,7 @@ _PyJit_translate_single_bytecode_to_trace( if (frame->owner < FRAME_OWNED_BY_INTERPRETER) { // Don't add nested code objects to the dependency. // It causes endless re-traces. - if (new_func != NULL && !(new_code->co_flags & CO_NESTED)) { + if (new_func != NULL && !Py_IsNone((PyObject*)new_func) && !(new_code->co_flags & CO_NESTED)) { operand = (uintptr_t)new_func; DPRINTF(2, "Adding %p func to op\n", (void *)operand); _Py_BloomFilter_Add(dependencies, new_func); diff --git a/Python/optimizer_bytecodes.c b/Python/optimizer_bytecodes.c index a8b58dbaa1e6..eca5c0b69696 100644 --- a/Python/optimizer_bytecodes.c +++ b/Python/optimizer_bytecodes.c @@ -770,7 +770,7 @@ dummy_func(void) { } op(_CREATE_INIT_FRAME, (init, self, args[oparg] -- init_frame)) { - _Py_UOpsAbstractFrame *old_frame = ctx->frame; + ctx->frame->stack_pointer = stack_pointer - oparg - 2; _Py_UOpsAbstractFrame *shim = frame_new(ctx, (PyCodeObject *)&_Py_InitCleanup, 0, NULL, 0); if (shim == NULL) { break; @@ -799,6 +799,13 @@ dummy_func(void) { } _Py_BloomFilter_Add(dependencies, returning_code); int returning_stacklevel = this_instr->operand1; + if (ctx->curr_frame_depth >= 2) { + PyCodeObject *expected_code = ctx->frames[ctx->curr_frame_depth - 2].code; + if (expected_code == returning_code) { + assert((this_instr + 1)->opcode == _GUARD_IP_RETURN_VALUE); + REPLACE_OP((this_instr + 1), _NOP, 0, 0); + } + } if (frame_pop(ctx, returning_code, returning_stacklevel)) { break; } @@ -898,6 +905,12 @@ dummy_func(void) { _Py_BloomFilter_Add(dependencies, co); ctx->frame->func = func; } + // Fixed calls don't need IP guards. + if ((this_instr-1)->opcode == _SAVE_RETURN_OFFSET || + (this_instr-1)->opcode == _CREATE_INIT_FRAME) { + assert((this_instr+1)->opcode == _GUARD_IP__PUSH_FRAME); + REPLACE_OP(this_instr+1, _NOP, 0, 0); + } } op(_UNPACK_SEQUENCE, (seq -- values[oparg], top[0])) { diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 879758e3e4fb..765303cdeb99 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -1124,6 +1124,13 @@ } _Py_BloomFilter_Add(dependencies, returning_code); int returning_stacklevel = this_instr->operand1; + if (ctx->curr_frame_depth >= 2) { + PyCodeObject *expected_code = ctx->frames[ctx->curr_frame_depth - 2].code; + if (expected_code == returning_code) { + assert((this_instr + 1)->opcode == _GUARD_IP_RETURN_VALUE); + REPLACE_OP((this_instr + 1), _NOP, 0, 0); + } + } if (frame_pop(ctx, returning_code, returning_stacklevel)) { break; } @@ -2639,6 +2646,11 @@ _Py_BloomFilter_Add(dependencies, co); ctx->frame->func = func; } + if ((this_instr-1)->opcode == _SAVE_RETURN_OFFSET || + (this_instr-1)->opcode == _CREATE_INIT_FRAME) { + assert((this_instr+1)->opcode == _GUARD_IP__PUSH_FRAME); + REPLACE_OP(this_instr+1, _NOP, 0, 0); + } break; } @@ -2776,7 +2788,7 @@ JitOptRef init_frame; args = &stack_pointer[-oparg]; self = stack_pointer[-1 - oparg]; - _Py_UOpsAbstractFrame *old_frame = ctx->frame; + ctx->frame->stack_pointer = stack_pointer - oparg - 2; _Py_UOpsAbstractFrame *shim = frame_new(ctx, (PyCodeObject *)&_Py_InitCleanup, 0, NULL, 0); if (shim == NULL) { break; -- 2.47.3