typedef struct {
uint8_t opcode;
uint8_t oparg;
- uint8_t valid;
- uint8_t linked;
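+ // valid and linked are packed into bitfields alongside chain_depth, so all
+ // three occupy the same two bytes as the two uint8_t flags they replace: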
+ uint16_t valid:1;
+ uint16_t linked:1;
+ uint16_t chain_depth:14; // Must be big enough for MAX_CHAIN_DEPTH - 1.
int index; // Index of ENTER_EXECUTOR (if code isn't NULL, below).
_PyBloomFilter bloom;
_PyExecutorLinkListNode links;
typedef int (*_Py_optimize_func)(
_PyOptimizerObject* self, struct _PyInterpreterFrame *frame,
_Py_CODEUNIT *instr, _PyExecutorObject **exec_ptr,
- int curr_stackentries);
+ int curr_stackentries, bool progress_needed);
struct _PyOptimizerObject {
PyObject_HEAD
// Need extras for root frame and for overflow frame (see TRACE_STACK_PUSH())
#define MAX_ABSTRACT_FRAME_DEPTH (TRACE_STACK_SIZE + 2)
+// The maximum number of side exits that we can take before requiring forward
+// progress (and inserting a new ENTER_EXECUTOR instruction). In practice, this
+// is the "maximum amount of polymorphism" that an isolated trace tree can
+// handle before rejoining the rest of the program.
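+// For example, with a MAX_CHAIN_DEPTH of 4, the first executor in a chain
+// must make progress, the next three (reached via one, two, or three side
+// exits) need not, and a fourth side exit starts a new chain that must make
+// progress again.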
+#define MAX_CHAIN_DEPTH 4
+
typedef struct _Py_UopsSymbol _Py_UopsSymbol;
struct _Py_UOpsAbstractFrame {
PyAPI_FUNC(PyObject *) _Py_uop_symbols_test(PyObject *self, PyObject *ignored);
-PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr);
+PyAPI_FUNC(int) _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, _PyStackRef *stack_pointer, _PyExecutorObject **exec_ptr, int chain_depth);
static inline int is_terminator(const _PyUOpInstruction *uop)
{
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec,
- int Py_UNUSED(stack_entries))
+ int Py_UNUSED(stack_entries),
+ bool Py_UNUSED(progress_needed))
{
// This may be called if the optimizer is reset
return 0;
int
_PyOptimizer_Optimize(
_PyInterpreterFrame *frame, _Py_CODEUNIT *start,
- _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr)
+ _PyStackRef *stack_pointer, _PyExecutorObject **executor_ptr, int chain_depth)
{
+ // The first executor in a chain and the MAX_CHAIN_DEPTH'th executor *must*
+ // make progress in order to avoid infinite loops or excessively long
+ // side-exit chains. We can only insert the executor into the bytecode if
+ // this is true, so that a deopt can't infinitely re-enter the executor:
+ chain_depth %= MAX_CHAIN_DEPTH;
+ bool progress_needed = chain_depth == 0;
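+ // Depths wrap around, so even an arbitrarily long chain is broken up by a
+ // progress-making executor every MAX_CHAIN_DEPTH links.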
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(PyCode_Check(code));
PyInterpreterState *interp = _PyInterpreterState_GET();
- if (!has_space_for_executor(code, start)) {
+ if (progress_needed && !has_space_for_executor(code, start)) {
return 0;
}
_PyOptimizerObject *opt = interp->optimizer;
- int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)));
+ int err = opt->optimize(opt, frame, start, executor_ptr, (int)(stack_pointer - _PyFrame_Stackbase(frame)), progress_needed);
if (err <= 0) {
return err;
}
assert(*executor_ptr != NULL);
- int index = get_index_for_executor(code, start);
- if (index < 0) {
- /* Out of memory. Don't raise and assume that the
- * error will show up elsewhere.
- *
- * If an optimizer has already produced an executor,
- * it might get confused by the executor disappearing,
- * but there is not much we can do about that here. */
- Py_DECREF(*executor_ptr);
- return 0;
+ if (progress_needed) {
+ int index = get_index_for_executor(code, start);
+ if (index < 0) {
+ /* Out of memory. Don't raise and assume that the
+ * error will show up elsewhere.
+ *
+ * If an optimizer has already produced an executor,
+ * it might get confused by the executor disappearing,
+ * but there is not much we can do about that here. */
+ Py_DECREF(*executor_ptr);
+ return 0;
+ }
+ insert_executor(code, start, index, *executor_ptr);
}
- insert_executor(code, start, index, *executor_ptr);
+ else {
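+ // This executor is never inserted into the bytecode: it is reachable only
+ // through the side exits of the previous executor in the chain, so it has
+ // no ENTER_EXECUTOR (and thus no code object or index):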
+ (*executor_ptr)->vm_data.code = NULL;
+ }
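+ // Record the (wrapped) depth so that an executor created from one of this
+ // executor's side exits knows its own position in the chain: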
+ (*executor_ptr)->vm_data.chain_depth = chain_depth;
assert((*executor_ptr)->vm_data.valid);
return 1;
}
_Py_CODEUNIT *instr,
_PyUOpInstruction *trace,
int buffer_size,
- _PyBloomFilter *dependencies)
+ _PyBloomFilter *dependencies, bool progress_needed)
{
- bool progress_needed = true;
+ bool first = true;
PyCodeObject *code = _PyFrame_GetCode(frame);
PyFunctionObject *func = (PyFunctionObject *)frame->f_funcobj;
assert(PyFunction_Check(func));
uint32_t opcode = instr->op.code;
uint32_t oparg = instr->op.arg;
- if (!progress_needed && instr == initial_instr) {
+ if (!first && instr == initial_instr) {
// We have looped around to the start:
RESERVE(1);
ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0);
DPRINTF(2, "%d: %s(%d)\n", target, _PyOpcode_OpName[opcode], oparg);
- if (opcode == ENTER_EXECUTOR) {
- assert(oparg < 256);
- _PyExecutorObject *executor = code->co_executors->executors[oparg];
- opcode = executor->vm_data.opcode;
- DPRINTF(2, " * ENTER_EXECUTOR -> %s\n", _PyOpcode_OpName[opcode]);
- oparg = executor->vm_data.oparg;
- }
-
if (opcode == EXTENDED_ARG) {
instr++;
opcode = instr->op.code;
goto done;
}
}
+ if (opcode == ENTER_EXECUTOR) {
+ // We have a couple of options here. We *could* peek "underneath"
+ // this executor and continue tracing, which could give us a longer,
+ // more optimizable trace (at the expense of lots of duplicated
+ // tier two code). Instead, we choose to just end here and stitch to
+ // the other trace, which allows side-exit traces to rejoin the
+ // "main" trace periodically (and also helps protect us against
+ // pathological behavior where the amount of tier two code explodes
+ // for a medium-length, branchy code path). This seems to work
+ // better in practice, but in the future we could be smarter about
+ // what we do here:
+ goto done;
+ }
assert(opcode != ENTER_EXECUTOR && opcode != EXTENDED_ARG);
RESERVE_RAW(2, "_CHECK_VALIDITY_AND_SET_IP");
ADD_TO_TRACE(_CHECK_VALIDITY_AND_SET_IP, 0, (uintptr_t)instr, target);
/* Special case the first instruction,
* so that we can guarantee forward progress */
- if (progress_needed) {
+ if (first && progress_needed) {
+ assert(first);
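+ // The unspecialized form of an instruction has no deopt or exit, so the
+ // trace is guaranteed to get past it before any side exit can be taken: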
if (OPCODE_HAS_EXIT(opcode) || OPCODE_HAS_DEOPT(opcode)) {
opcode = _PyOpcode_Deopt[opcode];
}
}
top:
// Jump here after _PUSH_FRAME or likely branches.
- progress_needed = false;
+ first = false;
} // End for (;;)
done:
}
assert(code == initial_code);
// Skip short traces where we can't even translate a single instruction:
- if (progress_needed) {
+ if (first) {
OPT_STAT_INC(trace_too_short);
DPRINTF(2,
"No trace for %s (%s:%d) at byte offset %d (no progress)\n",
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr,
- int curr_stackentries)
+ int curr_stackentries,
+ bool progress_needed)
{
_PyBloomFilter dependencies;
_Py_BloomFilter_Init(&dependencies);
_PyUOpInstruction buffer[UOP_MAX_TRACE_LENGTH];
OPT_STAT_INC(attempts);
- int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies);
+ int length = translate_bytecode_to_trace(frame, instr, buffer, UOP_MAX_TRACE_LENGTH, &dependencies, progress_needed);
if (length <= 0) {
// Error or nothing translated
return length;
_PyInterpreterFrame *frame,
_Py_CODEUNIT *instr,
_PyExecutorObject **exec_ptr,
- int Py_UNUSED(curr_stackentries)
+ int Py_UNUSED(curr_stackentries),
+ bool Py_UNUSED(progress_needed)
)
{
PyCodeObject *code = _PyFrame_GetCode(frame);