gh-149584: Fix excessive overhead in the Tachyon profiler regarding the cache behavio...

author Pablo Galindo Salgado <Pablogsal@gmail.com>

Wed, 20 May 2026 11:32:08 +0000 (04:32 -0700)

committer GitHub <noreply@github.com>

Wed, 20 May 2026 11:32:08 +0000 (04:32 -0700)
author Pablo Galindo Salgado <Pablogsal@gmail.com>
Wed, 20 May 2026 11:32:08 +0000 (04:32 -0700)
committer GitHub <noreply@github.com>
Wed, 20 May 2026 11:32:08 +0000 (04:32 -0700)
diff --git a/Lib/profiling/sampling/sample.py b/Lib/profiling/sampling/sample.py

index b9e7e2625d09e478f02d25612e970d0a19884c34..2d379e1e16a35e349c06963d50685c622bddd9a4 100644 (file)
--- a/Lib/profiling/sampling/sample.py
+++ b/Lib/profiling/sampling/sample.py
@@ -327,6 +327,33 @@ class SampleProfiler:
          print(f"    Hits:             {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
          print(f"    Misses:           {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")
  
+        batched_attempts = stats.get('batched_read_attempts', 0)
+        batched_successes = stats.get('batched_read_successes', 0)
+        batched_misses = stats.get('batched_read_misses', 0)
+        segments_requested = stats.get('batched_read_segments_requested', 0)
+        segments_completed = stats.get('batched_read_segments_completed', 0)
+        if batched_attempts > 0:
+            batched_success_rate = stats.get('batched_read_success_rate', 0.0)
+            batched_miss_rate = 100.0 - batched_success_rate
+            segment_completion_rate = stats.get(
+                'batched_read_segment_completion_rate', 0.0
+            )
+
+            print(f"  {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
+            print(f"    Attempts:         {batched_attempts:n}")
+            print(
+                f"    Successes:        {batched_successes:n} "
+                f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
+            )
+            print(
+                f"    Misses:           {batched_misses:n} "
+                f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
+            )
+            print(
+                f"    Segments read:    {segments_completed:n}/{segments_requested:n} "
+                f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
+            )
+
          # Memory operations
          memory_reads = stats.get('memory_reads', 0)
          memory_bytes = stats.get('memory_bytes_read', 0)
diff --git a/Lib/test/test_external_inspection.py b/Lib/test/test_external_inspection.py

index a29e6cdbbf6c78545c1d0cff0a693ca07611360d..6b1529aa173f01c25bf7515a05d41744dbcd4568 100644 (file)
--- a/Lib/test/test_external_inspection.py
+++ b/Lib/test/test_external_inspection.py
@@ -3767,6 +3767,13 @@ recurse({depth})
              "frames_read_from_cache",
              "frames_read_from_memory",
              "frame_cache_hit_rate",
+            "batched_read_attempts",
+            "batched_read_successes",
+            "batched_read_misses",
+            "batched_read_segments_requested",
+            "batched_read_segments_completed",
+            "batched_read_success_rate",
+            "batched_read_segment_completion_rate",
          ]
          for key in expected_keys:
              self.assertIn(key, stats)
diff --git a/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst

new file mode 100644 (file)

index 0000000..6734250
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst
@@ -0,0 +1,4 @@
+Fix excessive overhead in the Tachyon profiler when inspecting a remote
+process by avoiding repeated remote page-cache scans, batching predicted
+remote reads, and reusing cached profiler result objects. Patch by Pablo
+Galindo and Maurycy Pawłowski-Wieroński.
diff --git a/Modules/_remote_debugging/_remote_debugging.h b/Modules/_remote_debugging/_remote_debugging.h

index 7369cd1514c296d4c7f2e515ffd506a2fae50b92..d91ce54a18c813a2fe21639b973faa107fcfebfc 100644 (file)
--- a/Modules/_remote_debugging/_remote_debugging.h
+++ b/Modules/_remote_debugging/_remote_debugging.h
@@ -30,6 +30,7 @@ extern "C" {
  #include "internal/pycore_llist.h"          // struct llist_node
  #include "internal/pycore_long.h"           // _PyLong_GetZero
  #include "internal/pycore_pyerrors.h"       // _PyErr_FormatFromCause
+#include "internal/pycore_pyhash.h"        // _Py_HashPointerRaw
  #include "internal/pycore_stackref.h"       // Py_TAG_BITS
  #include "../../Python/remote_debug.h"
  
@@ -215,6 +216,8 @@ typedef struct {
      PyObject *file_name;
      int first_lineno;
      PyObject *linetable;  // bytes
+    PyObject *last_frame_info;
+    ptrdiff_t last_addrq;
      uintptr_t addr_code_adaptive;
  } CachedCodeMetadata;
  
@@ -224,11 +227,41 @@ typedef struct {
  
  typedef struct {
      uint64_t thread_id;                      // 0 = empty slot
+    uintptr_t thread_state_addr;             // set by frame_cache_store(); reset to 0 when the slot is invalidated
      uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
      Py_ssize_t num_addrs;
+    PyObject *thread_id_obj;                 // owned reference, NULL if empty
      PyObject *frame_list;                    // owned reference, NULL if empty
  } FrameCacheEntry;
  
+#define INTERPRETER_THREAD_CACHE_SIZE 32
+#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0
+#  error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
+#endif
+
+// The two per-interpreter L2 caches below are split into per-field tables so
+// that a writer rebinding one slot cannot leave stale data in a field owned by
+// the other when the slot is reused across interpreters.
+typedef struct {
+    uintptr_t interpreter_addr;
+    uintptr_t thread_state_addr;
+} InterpreterTstateCacheEntry;
+typedef struct {
+    uintptr_t interpreter_addr;
+    uint64_t code_object_generation;
+} InterpreterGenerationCacheEntry;
+
+// Carries already-read thread state and/or frame buffers across helpers so the
+// downstream callee can skip a remote read. Address fields are caller-supplied
+// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
+// successfully populated them.
+typedef struct {
+    const char *tstate;       // prefetched thread state bytes, or NULL
+    uintptr_t tstate_addr;    // remote address the thread state is read from
+    const char *frame;        // prefetched frame bytes, or NULL
+    uintptr_t frame_addr;     // remote address the frame is read from
+} RemoteReadPrefetch;
+
  /* Statistics for profiling performance analysis */
  typedef struct {
      uint64_t total_samples;                  // Total number of get_stack_trace calls
@@ -242,14 +275,44 @@ typedef struct {
      uint64_t code_object_cache_hits;         // Code object cache hits
      uint64_t code_object_cache_misses;       // Code object cache misses
      uint64_t stale_cache_invalidations;      // Times stale entries were cleared
+    uint64_t batched_read_attempts;          // Batched remote-read attempts
+    uint64_t batched_read_successes;         // Attempts that read all requested segments
+    uint64_t batched_read_misses;            // Attempts that fell back or partially read
+    uint64_t batched_read_segments_requested; // Segments requested by batched reads
+    uint64_t batched_read_segments_completed; // Segments completed by batched reads
  } UnwinderStats;
  
+#if defined(__GNUC__) || defined(__clang__)
+#  define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)
+#else
+#  define REMOTE_DEBUG_UNLIKELY(value) (value)
+#endif
+
  /* Stats tracking macros - no-op when stats collection is disabled */
  #define STATS_INC(unwinder, field) \
-    do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
+    do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)
  
  #define STATS_ADD(unwinder, field, val) \
-    do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
+    do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)
+
+/* Record the outcome of one batched remote read. When stats collection is
+ * enabled this counts the attempt, accumulates the requested and completed
+ * segment totals, and classifies the attempt as a success only if
+ * completed == requested (otherwise as a miss).
+ * `requested` and `completed` are each evaluated more than once, so pass
+ * side-effect-free expressions. Expands to a no-op when
+ * HAVE_PROCESS_VM_READV is not enabled. */
+#if HAVE_PROCESS_VM_READV
+#  define STATS_BATCHED_READ(unwinder, requested, completed) \
+    do { \
+        if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
+            (unwinder)->stats.batched_read_attempts++; \
+            (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
+            (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
+            if ((completed) == (requested)) { \
+                (unwinder)->stats.batched_read_successes++; \
+            } \
+            else { \
+                (unwinder)->stats.batched_read_misses++; \
+            } \
+        } \
+    } while(0)
+#else
+#  define STATS_BATCHED_READ(unwinder, requested, completed) ((void)0)
+#endif
  
  typedef struct {
      PyTypeObject *RemoteDebugging_Type;
@@ -290,7 +353,6 @@ typedef struct {
      struct _Py_AsyncioModuleDebugOffsets async_debug_offsets;
      uintptr_t interpreter_addr;
      uintptr_t tstate_addr;
-    uint64_t code_object_generation;
      _Py_hashtable_t *code_object_cache;
      int debug;
      int only_active_thread;
@@ -302,9 +364,17 @@ typedef struct {
      int cache_frames;
      int collect_stats;  // whether to collect statistics
      uint32_t stale_invalidation_counter;  // counter for throttling frame_cache_invalidate_stale
+    // L1 single-entry shortcuts over cached_tstates[] and cached_generations[]:
+    // most workloads sample one interpreter, so check these pairs before
+    // hashing into the tables below.
+    uintptr_t cached_tstate_interpreter_addr;
+    uintptr_t cached_tstate_addr;
+    uintptr_t cached_generation_interpreter_addr;
+    uint64_t cached_code_object_generation;
      RemoteDebuggingState *cached_state;
      FrameCacheEntry *frame_cache;  // preallocated array of FRAME_CACHE_MAX_THREADS entries
      UnwinderStats stats;  // statistics for performance analysis
+    InterpreterTstateCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
+    InterpreterGenerationCacheEntry cached_generations[INTERPRETER_THREAD_CACHE_SIZE];
  #ifdef Py_GIL_DISABLED
      uint32_t tlbc_generation;
      _Py_hashtable_t *tlbc_cache;
@@ -361,11 +431,13 @@ typedef struct {
  typedef struct {
      /* Inputs */
      uintptr_t frame_addr;           // Starting frame address
+    uintptr_t thread_state_addr;    // Owning thread state address
      uintptr_t base_frame_addr;      // Sentinel at bottom (for validation)
      uintptr_t gc_frame;             // GC frame address (0 if not tracking)
      uintptr_t last_profiled_frame;  // Last cached frame (0 if no cache)
      StackChunkList *chunks;         // Pre-copied stack chunks
      int skip_first_frame;           // Skip frame_addr itself (continue from its caller)
+    RemoteReadPrefetch prefetch;     // Optional already-read thread/frame buffers
  
      /* Outputs */
      PyObject *frame_info;           // List to append FrameInfo objects
@@ -548,6 +620,7 @@ extern int process_frame_chain(
  extern int frame_cache_init(RemoteUnwinderObject *unwinder);
  extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
  extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
+extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
  extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
  extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
  extern int frame_cache_lookup_and_extend(
@@ -566,6 +639,7 @@ extern int frame_cache_store(
      PyObject *frame_list,
      const uintptr_t *addrs,
      Py_ssize_t num_addrs,
+    uintptr_t thread_state_addr,
      uintptr_t base_frame_addr,
      uintptr_t last_frame_visited);
  
@@ -605,7 +679,8 @@ extern PyObject* unwind_stack_for_thread(
      uintptr_t *current_tstate,
      uintptr_t gil_holder_tstate,
      uintptr_t gc_frame,
-    uintptr_t main_thread_tstate
+    uintptr_t main_thread_tstate,
+    const RemoteReadPrefetch *prefetch
  );
  
  /* Thread stopping functions (for blocking mode) */
diff --git a/Modules/_remote_debugging/clinic/module.c.h b/Modules/_remote_debugging/clinic/module.c.h

index d56622fb82ab56757396282506580ce8854109ad..78b1d3e8d80962edb1934ceb189176bb489cdeee 100644 (file)
--- a/Modules/_remote_debugging/clinic/module.c.h
+++ b/Modules/_remote_debugging/clinic/module.c.h
@@ -411,8 +411,15 @@ PyDoc_STRVAR(_remote_debugging_RemoteUnwinder_get_stats__doc__,
  "        - code_object_cache_hits: Code object cache hits\n"
  "        - code_object_cache_misses: Code object cache misses\n"
  "        - stale_cache_invalidations: Times stale cache entries were cleared\n"
+"        - batched_read_attempts: Batched remote-read attempts\n"
+"        - batched_read_successes: Attempts that read all requested segments\n"
+"        - batched_read_misses: Attempts that fell back or partially read\n"
+"        - batched_read_segments_requested: Segments requested by batched reads\n"
+"        - batched_read_segments_completed: Segments completed by batched reads\n"
  "        - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
  "        - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
+"        - batched_read_success_rate: Percentage of batched reads that completed all segments\n"
+"        - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n"
  "\n"
  "Raises:\n"
  "    RuntimeError: If stats collection was not enabled (stats=False)");
@@ -1540,4 +1547,4 @@ skip_optional_kwonly:
  exit:
      return return_value;
  }
-/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/
diff --git a/Modules/_remote_debugging/code_objects.c b/Modules/_remote_debugging/code_objects.c

index 97c6ba772e88f1d5f4feafa671f6624dd0cdd9fc..3af58f2b3c379ec8dab4335e94e97e858fb4a858 100644 (file)
--- a/Modules/_remote_debugging/code_objects.c
+++ b/Modules/_remote_debugging/code_objects.c
@@ -405,6 +405,8 @@ parse_code_object(RemoteUnwinderObject *unwinder,
          meta->func_name = func;
          meta->file_name = file;
          meta->linetable = linetable;
+        meta->last_frame_info = NULL;
+        meta->last_addrq = -1;
          meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
          meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;
  
@@ -482,6 +484,12 @@ done_tlbc:
      addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
  #endif
      ;  // Empty statement to avoid C23 extension warning
+
+    if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
+        *result = Py_NewRef(meta->last_frame_info);
+        return 0;
+    }
+
      LocationInfo info = {0};
      bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
                                PyBytes_GET_SIZE(meta->linetable),
@@ -529,6 +537,11 @@ done_tlbc:
          goto error;
      }
  
+    if (!unwinder->opcodes) {
+        Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
+        meta->last_addrq = addrq;
+    }
+
      *result = tuple;
      return 0;
  
diff --git a/Modules/_remote_debugging/frame_cache.c b/Modules/_remote_debugging/frame_cache.c

index b6566d7cff7b5431c0f663ffe33a10456f724453..19fc406bca9ac965e4a49af16a7ebcec8fdc0159 100644 (file)
--- a/Modules/_remote_debugging/frame_cache.c
+++ b/Modules/_remote_debugging/frame_cache.c
@@ -30,6 +30,7 @@ frame_cache_cleanup(RemoteUnwinderObject *unwinder)
          return;
      }
      for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
+        Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
          Py_CLEAR(unwinder->frame_cache[i].frame_list);
      }
      PyMem_Free(unwinder->frame_cache);
@@ -53,6 +54,21 @@ frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id)
      return NULL;
  }
  
+// Find the frame cache entry whose stored thread state address equals
+// tstate_addr, using a linear scan over all FRAME_CACHE_MAX_THREADS slots.
+// Returns a pointer into unwinder->frame_cache, or NULL if the cache is not
+// allocated, tstate_addr is 0, or no slot matches.
+FrameCacheEntry *
+frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
+{
+    // frame_cache_invalidate_stale() resets thread_state_addr to 0 on cleared
+    // slots, so 0 must never be treated as a match.
+    if (!unwinder->frame_cache || tstate_addr == 0) {
+        return NULL;
+    }
+    for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
+        if (unwinder->frame_cache[i].thread_state_addr == tstate_addr) {
+            assert(unwinder->frame_cache[i].num_addrs <= FRAME_CACHE_MAX_FRAMES);
+            return &unwinder->frame_cache[i];
+        }
+    }
+    return NULL;
+}
+
  // Allocate a cache slot for a thread
  // Returns NULL if cache is full (graceful degradation)
  static FrameCacheEntry *
@@ -127,8 +143,10 @@ frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result)
          }
          if (!found) {
              // Clear this entry
+            Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
              Py_CLEAR(unwinder->frame_cache[i].frame_list);
              unwinder->frame_cache[i].thread_id = 0;
+            unwinder->frame_cache[i].thread_state_addr = 0;
              unwinder->frame_cache[i].num_addrs = 0;
              STATS_INC(unwinder, stale_cache_invalidations);
          }
@@ -216,6 +234,7 @@ frame_cache_store(
      PyObject *frame_list,
      const uintptr_t *addrs,
      Py_ssize_t num_addrs,
+    uintptr_t thread_state_addr,
      uintptr_t base_frame_addr,
      uintptr_t last_frame_visited)
  {
@@ -257,6 +276,13 @@ frame_cache_store(
          return -1;
      }
      entry->thread_id = thread_id;
+    entry->thread_state_addr = thread_state_addr;
+    if (entry->thread_id_obj == NULL) {
+        entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
+        if (entry->thread_id_obj == NULL) {
+            return -1;
+        }
+    }
      memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
      entry->num_addrs = num_addrs;
      assert(entry->num_addrs == num_addrs);
diff --git a/Modules/_remote_debugging/frames.c b/Modules/_remote_debugging/frames.c

index bbdfce3f7201d9d275f5f9c57dd25e1c0cd31814..8d8019396b3e31abbf8929972101adc2b073324f 100644 (file)
--- a/Modules/_remote_debugging/frames.c
+++ b/Modules/_remote_debugging/frames.c
@@ -186,30 +186,16 @@ is_frame_valid(
      return 1;
  }
  
-int
-parse_frame_object(
+static int
+parse_frame_buffer(
      RemoteUnwinderObject *unwinder,
      PyObject** result,
-    uintptr_t address,
+    const char *frame,
      uintptr_t* address_of_code_object,
      uintptr_t* previous_frame
  ) {
-    char frame[SIZEOF_INTERP_FRAME];
      *address_of_code_object = 0;
  
-    Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
-        &unwinder->handle,
-        address,
-        SIZEOF_INTERP_FRAME,
-        frame
-    );
-    if (bytes_read < 0) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
-        return -1;
-    }
-    STATS_INC(unwinder, memory_reads);
-    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
-
      *previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
      uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
      int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
@@ -237,6 +223,31 @@ parse_frame_object(
      return parse_code_object(unwinder, result, &code_ctx);
  }
  
+// Read one interpreter frame (SIZEOF_INTERP_FRAME bytes) from the remote
+// process at `address` and hand the local copy to parse_frame_buffer().
+// Returns -1 with an exception cause set if the remote read fails; otherwise
+// returns the result of parse_frame_buffer(), to which `result`,
+// `address_of_code_object` and `previous_frame` are forwarded unchanged.
+int
+parse_frame_object(
+    RemoteUnwinderObject *unwinder,
+    PyObject** result,
+    uintptr_t address,
+    uintptr_t* address_of_code_object,
+    uintptr_t* previous_frame
+) {
+    char frame[SIZEOF_INTERP_FRAME];
+    Py_ssize_t bytes_read = _Py_RemoteDebug_ReadRemoteMemory(
+        &unwinder->handle,
+        address,
+        SIZEOF_INTERP_FRAME,
+        frame
+    );
+    if (bytes_read < 0) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
+        return -1;
+    }
+    // Only successful reads are counted in the memory statistics.
+    STATS_INC(unwinder, memory_reads);
+    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
+
+    return parse_frame_buffer(unwinder, result, frame, address_of_code_object, previous_frame);
+}
+
  int
  parse_frame_from_chunks(
      RemoteUnwinderObject *unwinder,
@@ -312,15 +323,32 @@ process_frame_chain(
          }
          assert(frame_count <= MAX_FRAMES);
  
-        if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
+        if (ctx->chunks && ctx->chunks->count > 0) {
+            if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
+                goto parsed_frame;
+            }
              PyErr_Clear();
+        }
+        {
              uintptr_t address_of_code_object = 0;
-            if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
+            int parse_result;
+            if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) {
+                parse_result = parse_frame_buffer(
+                    unwinder, &frame, ctx->prefetch.frame,
+                    &address_of_code_object, &next_frame_addr);
+            }
+            else {
+                parse_result = parse_frame_object(
+                    unwinder, &frame, frame_addr,
+                    &address_of_code_object, &next_frame_addr);
+            }
+            if (parse_result < 0) {
                  set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
                  return -1;
              }
          }
  
+parsed_frame:
          // Skip first frame if requested (used for cache miss continuation)
          if (ctx->skip_first_frame && frame_count == 1) {
              Py_XDECREF(frame);
@@ -501,41 +529,37 @@ try_full_cache_hit(
      PyObject *current_frame = NULL;
      uintptr_t code_object_addr = 0;
      uintptr_t previous_frame = 0;
-    int parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
+    int parse_result;
+    if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) {
+        parse_result = parse_frame_buffer(unwinder, &current_frame,
+                                          ctx->prefetch.frame,
                                            &code_object_addr, &previous_frame);
+    }
+    else {
+        parse_result = parse_frame_object(unwinder, &current_frame, ctx->frame_addr,
+                                          &code_object_addr, &previous_frame);
+    }
      if (parse_result < 0) {
          return -1;
      }
  
-    Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
-    PyObject *parent_slice = NULL;
-    if (cached_size > 1) {
-        parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size);
-        if (!parent_slice) {
-            Py_XDECREF(current_frame);
-            return -1;
-        }
-    }
-
      if (current_frame != NULL) {
          if (PyList_Append(ctx->frame_info, current_frame) < 0) {
              Py_DECREF(current_frame);
-            Py_XDECREF(parent_slice);
              return -1;
          }
          Py_DECREF(current_frame);
          STATS_ADD(unwinder, frames_read_from_memory, 1);
      }
  
-    if (parent_slice) {
-        Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info);
-        int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice);
-        Py_DECREF(parent_slice);
-        if (result < 0) {
+    Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
+    for (Py_ssize_t i = 1; i < cached_size; i++) {
+        PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i);
+        if (PyList_Append(ctx->frame_info, cached_frame) < 0) {
              return -1;
          }
-        STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1);
      }
+    STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0);
  
      STATS_INC(unwinder, frame_cache_hits);
      return 1;
@@ -606,7 +630,8 @@ collect_frames_with_cache(
      }
  
      if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
-                          ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
+                          ctx->thread_state_addr, ctx->base_frame_addr,
+                          ctx->last_frame_visited) < 0) {
          return -1;
      }
  
diff --git a/Modules/_remote_debugging/module.c b/Modules/_remote_debugging/module.c

index efdd2e1a2d7b7a6fa2f8f5443605d614c47f8643..ae2f7e7f31ba77991533366f3e010aaec213bf0f 100644 (file)
--- a/Modules/_remote_debugging/module.c
+++ b/Modules/_remote_debugging/module.c
@@ -166,6 +166,7 @@ cached_code_metadata_destroy(void *ptr)
      Py_DECREF(meta->func_name);
      Py_DECREF(meta->file_name);
      Py_DECREF(meta->linetable);
+    Py_XDECREF(meta->last_frame_info);
      PyMem_RawFree(meta);
  }
  
@@ -360,6 +361,10 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
      self->cache_frames = cache_frames;
      self->collect_stats = stats;
      self->stale_invalidation_counter = 0;
+    self->cached_tstate_interpreter_addr = 0;
+    self->cached_tstate_addr = 0;
+    memset(self->cached_tstates, 0, sizeof(self->cached_tstates));
+    memset(self->cached_generations, 0, sizeof(self->cached_generations));
      self->debug = debug;
      self->only_active_thread = only_active_thread;
      self->mode = mode;
@@ -473,6 +478,172 @@ _remote_debugging_RemoteUnwinder___init___impl(RemoteUnwinderObject *self,
      return 0;
  }
  
+// Map a remote interpreter address to a slot index in the range
+// [0, INTERPRETER_THREAD_CACHE_SIZE) for the per-interpreter cache tables.
+static inline size_t
+interpreter_thread_cache_index(uintptr_t interpreter_addr)
+{
+    // Direct-mapped table indexed by the remote interpreter address. Each entry
+    // stores the full address and verifies it on lookup, so hash collisions
+    // degrade to misses and cannot return a value from the wrong interpreter.
+    // INTERPRETER_THREAD_CACHE_SIZE is checked to be a power of two where it
+    // is defined, so masking with (size - 1) keeps the index in range.
+    return (size_t)_Py_HashPointerRaw((const void *)interpreter_addr)
+        & (INTERPRETER_THREAD_CACHE_SIZE - 1);
+}
+
+// Return the thread state address cached for interpreter_addr, or 0 when
+// nothing is cached for it (or interpreter_addr itself is 0).
+// The single-entry fields on `self` are checked first, then the
+// cached_tstates[] table; a table hit is copied into the single-entry fields
+// so the next lookup for the same interpreter skips the hash.
+static inline uintptr_t
+get_cached_tstate_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr)
+{
+    if (interpreter_addr == 0) {
+        return 0;
+    }
+
+    if (self->cached_tstate_interpreter_addr == interpreter_addr) {
+        return self->cached_tstate_addr;
+    }
+
+    InterpreterTstateCacheEntry *entry =
+        &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)];
+    // The slot may belong to a different interpreter that maps to the same
+    // index, so the stored address must match before the value is trusted.
+    if (entry->interpreter_addr == interpreter_addr) {
+        self->cached_tstate_interpreter_addr = interpreter_addr;
+        self->cached_tstate_addr = entry->thread_state_addr;
+        return entry->thread_state_addr;
+    }
+    return 0;
+}
+
+// Remember thread_state_addr as the cached thread state for interpreter_addr.
+// Writes both the single-entry fields on `self` and the cached_tstates[] slot
+// for this interpreter. Does nothing if either address is 0.
+static inline void
+set_cached_tstate_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr,
+    uintptr_t thread_state_addr)
+{
+    if (interpreter_addr == 0 || thread_state_addr == 0) {
+        return;
+    }
+
+    self->cached_tstate_interpreter_addr = interpreter_addr;
+    self->cached_tstate_addr = thread_state_addr;
+
+    // The slot is overwritten unconditionally, replacing whatever interpreter
+    // previously occupied this index.
+    InterpreterTstateCacheEntry *entry =
+        &self->cached_tstates[interpreter_thread_cache_index(interpreter_addr)];
+    entry->interpreter_addr = interpreter_addr;
+    entry->thread_state_addr = thread_state_addr;
+}
+
+// Compare the code_object_generation found in an already-read interpreter
+// state buffer with the value cached for interpreter_addr, and clear
+// self->code_object_cache when it differs or when the table slot was bound to
+// a different interpreter. The single-entry fields on `self` are consulted
+// first; otherwise the cached_generations[] slot is checked and both the slot
+// and the single-entry fields are updated.
+// On Py_GIL_DISABLED builds, self->tlbc_cache is also cleared when the
+// buffer's tlbc_generation differs from self->tlbc_generation.
+static void
+refresh_generation_caches_from_interp_state(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr,
+    const char *interp_state_buffer)
+{
+    uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
+            self->debug_offsets.interpreter_state.code_object_generation);
+
+    if (self->cached_generation_interpreter_addr == interpreter_addr) {
+        // NOTE(review): this path updates only the single-entry field, not the
+        // cached_generations[] slot. If the single entry is later rebound to
+        // another interpreter and this one is seen again, the slot still holds
+        // the older generation and triggers one extra cache clear. That looks
+        // safe but redundant - confirm whether a write-through is wanted.
+        if (code_object_generation != self->cached_code_object_generation) {
+            self->cached_code_object_generation = code_object_generation;
+            _Py_hashtable_clear(self->code_object_cache);
+        }
+    }
+    else {
+        InterpreterGenerationCacheEntry *entry =
+            &self->cached_generations[interpreter_thread_cache_index(interpreter_addr)];
+        // A slot rebound from another interpreter must be treated as changed:
+        // the code_object_cache is global, so even if the new generation
+        // numerically matches what the previous occupant had, stale entries
+        // from that occupant could still be served.
+        int changed = entry->interpreter_addr != interpreter_addr
+                   || entry->code_object_generation != code_object_generation;
+        entry->interpreter_addr = interpreter_addr;
+        entry->code_object_generation = code_object_generation;
+        if (changed) {
+            _Py_hashtable_clear(self->code_object_cache);
+        }
+        self->cached_generation_interpreter_addr = interpreter_addr;
+        self->cached_code_object_generation = code_object_generation;
+    }
+
+#ifdef Py_GIL_DISABLED
+    // The TLBC generation is tracked in a single field on `self`, not per
+    // interpreter.
+    uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
+                                                  self->debug_offsets.interpreter_state.tlbc_generation);
+    if (current_tlbc_generation != self->tlbc_generation) {
+        self->tlbc_generation = current_tlbc_generation;
+        _Py_hashtable_clear(self->tlbc_cache);
+    }
+#endif
+}
+
+// Read INTERP_STATE_BUFFER_SIZE bytes of interpreter state from the remote
+// process at interpreter_addr and pass them to
+// refresh_generation_caches_from_interp_state().
+// Returns 0 on success, or -1 with an exception cause set if the remote read
+// fails.
+static int
+refresh_generation_caches_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr)
+{
+    char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
+    if (_Py_RemoteDebug_ReadRemoteMemory(
+            &self->handle,
+            interpreter_addr,
+            INTERP_STATE_BUFFER_SIZE,
+            interp_state_buffer) < 0) {
+        set_exception_cause(self, PyExc_RuntimeError,
+                            "Failed to read interpreter state buffer");
+        return -1;
+    }
+    refresh_generation_caches_from_interp_state(self, interpreter_addr, interp_state_buffer);
+    return 0;
+}
+
+// Read the interpreter state at interpreter_addr into interp_state_buffer and,
+// when prefetch->tstate_addr is non-zero, try to fetch the thread state (plus
+// the frame at prefetch->frame_addr, if that is non-zero) in the same batched
+// remote read.
+//
+// Inputs:  prefetch->tstate_addr and prefetch->frame_addr.
+// Outputs: prefetch->tstate / prefetch->frame are reset to NULL and then
+//          pointed at tstate_buffer / frame_buffer only when the batched read
+//          is judged to have completed that segment.
+//
+// Returns 0 if the batched read delivered at least the interpreter state.
+// Otherwise returns the result of a plain _Py_RemoteDebug_ReadRemoteMemory()
+// of the interpreter state alone; the visible caller treats < 0 as failure.
+static int
+read_interp_state_and_maybe_thread_frame(
+    RemoteUnwinderObject *unwinder,
+    uintptr_t interpreter_addr,
+    char *interp_state_buffer,
+    char *tstate_buffer,
+    char *frame_buffer,
+    RemoteReadPrefetch *prefetch)
+{
+    prefetch->tstate = NULL;
+    prefetch->frame = NULL;
+    if (prefetch->tstate_addr != 0) {
+        // NOTE(review): the segment length comes from the remote debug offsets
+        // while the visible caller sizes its buffer with SIZEOF_THREAD_STATE -
+        // confirm thread_state.size is validated against that bound elsewhere.
+        size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size;
+        _Py_RemoteReadSegment segments[3] = {
+            {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE},
+            {prefetch->tstate_addr, tstate_buffer, tstate_size},
+            {prefetch->frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
+        };
+        // The frame segment is only requested when a frame address is known.
+        int nsegs = prefetch->frame_addr != 0 ? 3 : 2;
+        Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
+            &unwinder->handle, segments, nsegs);
+        // Derive how many leading segments completed from the byte count.
+        // Assumes the helper returns the total bytes read with segments
+        // filled in order - TODO confirm against its definition.
+        int completed = 0;
+        if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) {
+            completed = 1;
+            Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE
+                + (Py_ssize_t)tstate_size;
+            if (nread >= with_tstate) {
+                completed = 2;
+            }
+            if (nsegs == 3
+                    && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) {
+                completed = 3;
+            }
+        }
+        STATS_BATCHED_READ(unwinder, nsegs, completed);
+        if (completed >= 1) {
+            // A partial batch is still usable: publish only the buffers whose
+            // segments completed and leave the rest NULL.
+            if (completed >= 2) {
+                prefetch->tstate = tstate_buffer;
+            }
+            if (completed >= 3) {
+                prefetch->frame = frame_buffer;
+            }
+            return 0;
+        }
+        // NOTE(review): if the batched helper can leave a Python exception set
+        // on failure, it is not cleared before the fallback read - confirm.
+    }
+    // Fallback: no prefetch target, or the batched read did not even deliver
+    // the interpreter state.
+    return _Py_RemoteDebug_ReadRemoteMemory(
+        &unwinder->handle,
+        interpreter_addr,
+        INTERP_STATE_BUFFER_SIZE,
+        interp_state_buffer);
+}
+
  /*[clinic input]
  @permit_long_docstring_body
  @critical_section
@@ -537,15 +708,32 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
      while (current_interpreter != 0) {
          // Read interpreter state to get the interpreter ID
          char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
-        if (_Py_RemoteDebug_PagedReadRemoteMemory(
-                &self->handle,
+        char prefetched_tstate[SIZEOF_THREAD_STATE];
+        char prefetched_frame[SIZEOF_INTERP_FRAME];
+        RemoteReadPrefetch prefetch = {0};
+        if (self->cache_frames) {
+            prefetch.tstate_addr = get_cached_tstate_for_interpreter(
+                self, current_interpreter);
+        }
+        if (prefetch.tstate_addr != 0) {
+            FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr);
+            if (entry && entry->num_addrs > 0) {
+                prefetch.frame_addr = entry->addrs[0];
+            }
+        }
+
+        if (read_interp_state_and_maybe_thread_frame(
+                self,
                  current_interpreter,
-                INTERP_STATE_BUFFER_SIZE,
-                interp_state_buffer) < 0) {
+                interp_state_buffer,
+                prefetched_tstate,
+                prefetched_frame,
+                &prefetch) < 0) {
              set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer");
              Py_CLEAR(result);
              goto exit;
          }
+        refresh_generation_caches_from_interp_state(self, current_interpreter, interp_state_buffer);
  
          uintptr_t gc_frame = 0;
          if (self->gc) {
@@ -557,25 +745,6 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
          int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer,
                  self->debug_offsets.interpreter_state.id);
  
-        // Get code object generation from buffer
-        uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
-                self->debug_offsets.interpreter_state.code_object_generation);
-
-        if (code_object_generation != self->code_object_generation) {
-            self->code_object_generation = code_object_generation;
-            _Py_hashtable_clear(self->code_object_cache);
-        }
-
-#ifdef Py_GIL_DISABLED
-        // Check TLBC generation and invalidate cache if needed
-        uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
-                                                      self->debug_offsets.interpreter_state.tlbc_generation);
-        if (current_tlbc_generation != self->tlbc_generation) {
-            self->tlbc_generation = current_tlbc_generation;
-            _Py_hashtable_clear(self->tlbc_cache);
-        }
-#endif
-
          // Create a list to hold threads for this interpreter
          PyObject *interpreter_threads = PyList_New(0);
          if (!interpreter_threads) {
@@ -611,6 +780,9 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
              // Target specific thread (only process first interpreter)
              current_tstate = self->tstate_addr;
          }
+        if (current_tstate != 0 && self->cache_frames) {
+            set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate);
+        }
  
          // Acquire main thread state information
          uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
@@ -621,7 +793,8 @@ _remote_debugging_RemoteUnwinder_get_stack_trace_impl(RemoteUnwinderObject *self
              PyObject* frame_info = unwind_stack_for_thread(self, &current_tstate,
                                                             gil_holder_tstate,
                                                             gc_frame,
-                                                           main_thread_tstate);
+                                                           main_thread_tstate,
+                                                           &prefetch);
              if (!frame_info) {
                  // Check if this was an intentional skip due to mode-based filtering
                  if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL ||
@@ -771,6 +944,9 @@ _remote_debugging_RemoteUnwinder_get_all_awaited_by_impl(RemoteUnwinderObject *s
      if (ensure_async_debug_offsets(self) < 0) {
          return NULL;
      }
+    if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
+        return NULL;
+    }
  
      PyObject *result = PyList_New(0);
      if (result == NULL) {
@@ -860,6 +1036,9 @@ _remote_debugging_RemoteUnwinder_get_async_stack_trace_impl(RemoteUnwinderObject
      if (ensure_async_debug_offsets(self) < 0) {
          return NULL;
      }
+    if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
+        return NULL;
+    }
  
      PyObject *result = PyList_New(0);
      if (result == NULL) {
@@ -904,8 +1083,15 @@ Returns:
          - code_object_cache_hits: Code object cache hits
          - code_object_cache_misses: Code object cache misses
          - stale_cache_invalidations: Times stale cache entries were cleared
+        - batched_read_attempts: Batched remote-read attempts
+        - batched_read_successes: Attempts that read all requested segments
+        - batched_read_misses: Attempts that fell back or partially read
+        - batched_read_segments_requested: Segments requested by batched reads
+        - batched_read_segments_completed: Segments completed by batched reads
          - frame_cache_hit_rate: Percentage of samples that hit the cache
          - code_object_cache_hit_rate: Percentage of code object lookups that hit cache
+        - batched_read_success_rate: Percentage of batched reads that completed all segments
+        - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads
  
  Raises:
      RuntimeError: If stats collection was not enabled (stats=False)
@@ -913,7 +1099,7 @@ Raises:
  
  static PyObject *
  _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
-/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
+/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/
  {
      if (!self->collect_stats) {
          PyErr_SetString(PyExc_RuntimeError,
@@ -948,9 +1134,24 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
      ADD_STAT(code_object_cache_hits);
      ADD_STAT(code_object_cache_misses);
      ADD_STAT(stale_cache_invalidations);
+    ADD_STAT(batched_read_attempts);
+    ADD_STAT(batched_read_successes);
+    ADD_STAT(batched_read_misses);
+    ADD_STAT(batched_read_segments_requested);
+    ADD_STAT(batched_read_segments_completed);
  
  #undef ADD_STAT
  
+#define ADD_DERIVED_STAT(name, value) do { \
+    PyObject *val = PyFloat_FromDouble(value); \
+    if (!val || PyDict_SetItemString(result, name, val) < 0) { \
+        Py_XDECREF(val); \
+        Py_DECREF(result); \
+        return NULL; \
+    } \
+    Py_DECREF(val); \
+} while(0)
+
      // Calculate and add derived statistics
      // Hit rate is calculated as (hits + partial_hits) / total_cache_lookups
      double frame_cache_hit_rate = 0.0;
@@ -959,26 +1160,33 @@ _remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
          frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
                                 / (double)total_cache_lookups;
      }
-    PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate);
-    if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) {
-        Py_XDECREF(hit_rate);
-        Py_DECREF(result);
-        return NULL;
-    }
-    Py_DECREF(hit_rate);
+    ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate);
  
      double code_object_hit_rate = 0.0;
      uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
      if (total_code_lookups > 0) {
          code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups;
      }
-    PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate);
-    if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) {
-        Py_XDECREF(code_hit_rate);
-        Py_DECREF(result);
-        return NULL;
+    ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate);
+
+    double batched_read_success_rate = 0.0;
+    if (self->stats.batched_read_attempts > 0) {
+        batched_read_success_rate =
+            100.0 * (double)self->stats.batched_read_successes
+            / (double)self->stats.batched_read_attempts;
      }
-    Py_DECREF(code_hit_rate);
+    ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate);
+
+    double batched_read_segment_completion_rate = 0.0;
+    if (self->stats.batched_read_segments_requested > 0) {
+        batched_read_segment_completion_rate =
+            100.0 * (double)self->stats.batched_read_segments_completed
+            / (double)self->stats.batched_read_segments_requested;
+    }
+    ADD_DERIVED_STAT("batched_read_segment_completion_rate",
+                     batched_read_segment_completion_rate);
+
+#undef ADD_DERIVED_STAT
  
      return result;
  }
diff --git a/Modules/_remote_debugging/threads.c b/Modules/_remote_debugging/threads.c

index 4daa5e5f92bcd9764c8f173694da1833f277a3d8..7284f43042061a0b4f4f18bc7afaa3d036cfc60c 100644 (file)
--- a/Modules/_remote_debugging/threads.c
+++ b/Modules/_remote_debugging/threads.c
@@ -289,28 +289,110 @@ typedef struct {
      unsigned int :24;
  } _thread_status;
  
+static int  // 0 once the batched read covers the thread state; otherwise the single-read fallback's result.
+read_thread_state_and_maybe_frame(  // Reads a thread state and, when a frame address is predicted, that frame too.
+    RemoteUnwinderObject *unwinder,
+    uintptr_t tstate_addr,
+    size_t tstate_size,
+    char *tstate_buffer,  // out: tstate_size bytes of thread state
+    uintptr_t predicted_frame_addr,  // 0 disables the batched attempt
+    char *frame_buffer,  // out: SIZEOF_INTERP_FRAME bytes; valid only when *frame_read is nonzero
+    int *frame_read)  // out: 1 if the batched read also filled frame_buffer, else 0
+{
+    *frame_read = 0;  // Default for every path that does not complete both segments.
+    if (predicted_frame_addr != 0) {
+        _Py_RemoteReadSegment segments[2] = {  // Thread state first; the byte-count checks below assume this order.
+            {tstate_addr, tstate_buffer, tstate_size},
+            {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
+        };
+        Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
+            &unwinder->handle, segments, 2);
+        int completed = 0;  // Count of leading segments that were fully read.
+        if (nread >= (Py_ssize_t)tstate_size) {  // Also false when nread is negative.
+            completed = 1;
+            if (nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) {  // Exact total: both segments arrived.
+                completed = 2;
+            }
+        }
+        STATS_BATCHED_READ(unwinder, 2, completed);  // NOTE(review): macro defined elsewhere; presumably records requested vs. completed segments.
+        if (completed >= 1) {  // Thread state is in place, so no fallback read is needed.
+            *frame_read = completed == 2;
+            return 0;
+        }
+    }
+    return _Py_RemoteDebug_ReadRemoteMemory(  // Fallback: no prediction, or the batched read did not cover the thread state.
+        &unwinder->handle, tstate_addr, tstate_size, tstate_buffer);
+}
+
  PyObject*
  unwind_stack_for_thread(
      RemoteUnwinderObject *unwinder,
      uintptr_t *current_tstate,
      uintptr_t gil_holder_tstate,
      uintptr_t gc_frame,
-    uintptr_t main_thread_tstate
+    uintptr_t main_thread_tstate,
+    const RemoteReadPrefetch *prefetch
  ) {
      PyObject *frame_info = NULL;
      PyObject *thread_id = NULL;
      PyObject *result = NULL;
      StackChunkList chunks = {0};
  
-    char ts[SIZEOF_THREAD_STATE];
-    int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
-        &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts);
-    if (bytes_read < 0) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
-        goto error;
+    char local_ts[SIZEOF_THREAD_STATE];
+    char local_prefetched_frame[SIZEOF_INTERP_FRAME];
+    const char *ts;
+    RemoteReadPrefetch ctx_prefetch = {0};
+    if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) {
+        ts = prefetch->tstate;
+        if (prefetch->frame) {
+            ctx_prefetch.frame = prefetch->frame;
+            ctx_prefetch.frame_addr = prefetch->frame_addr;
+        }
+    }
+    else if (unwinder->cache_frames) {
+        uintptr_t predicted_frame_addr = 0;
+        int have_prefetched_frame = 0;
+        FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate);
+        if (entry && entry->num_addrs > 0) {
+            predicted_frame_addr = entry->addrs[0];
+        }
+
+        int rc = read_thread_state_and_maybe_frame(
+            unwinder,
+            *current_tstate,
+            (size_t)unwinder->debug_offsets.thread_state.size,
+            local_ts,
+            predicted_frame_addr,
+            local_prefetched_frame,
+            &have_prefetched_frame);
+        if (rc < 0) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
+            goto error;
+        }
+        ts = local_ts;
+        if (have_prefetched_frame) {
+            ctx_prefetch.frame = local_prefetched_frame;
+            ctx_prefetch.frame_addr = predicted_frame_addr;
+        }
+    }
+    else {
+        int rc = _Py_RemoteDebug_ReadRemoteMemory(
+            &unwinder->handle,
+            *current_tstate,
+            (size_t)unwinder->debug_offsets.thread_state.size,
+            local_ts);
+        if (rc < 0) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
+            goto error;
+        }
+        ts = local_ts;
      }
      STATS_INC(unwinder, memory_reads);
      STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
+    if (ctx_prefetch.frame) {
+        STATS_INC(unwinder, memory_reads);
+        STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
+    }
  
      long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
  
@@ -432,9 +514,11 @@ unwind_stack_for_thread(
      uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
      FrameWalkContext ctx = {
          .frame_addr = frame_addr,
+        .thread_state_addr = *current_tstate,
          .base_frame_addr = base_frame_addr,
          .gc_frame = gc_frame,
          .chunks = &chunks,
+        .prefetch = ctx_prefetch,
          .frame_info = frame_info,
          .frame_addrs = addrs,
          .num_addrs = 0,
@@ -469,10 +553,18 @@ unwind_stack_for_thread(
  
      *current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
  
-    thread_id = PyLong_FromLongLong(tid);
+    if (unwinder->cache_frames) {
+        FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid);
+        if (entry && entry->thread_id_obj) {
+            thread_id = Py_NewRef(entry->thread_id_obj);
+        }
+    }
      if (thread_id == NULL) {
-        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
-        goto error;
+        thread_id = PyLong_FromLongLong(tid);
+        if (thread_id == NULL) {
+            set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
+            goto error;
+        }
      }
  
      RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder);
diff --git a/Python/remote_debug.h b/Python/remote_debug.h

index 6c089a834dcd40d0a511395212f029813c8bc7f7..7b2c4f3bcb8077a8377fb35f47ec4538c53f208c 100644 (file)
--- a/Python/remote_debug.h
+++ b/Python/remote_debug.h
@@ -147,6 +147,7 @@ typedef struct {
      int memfd;
  #endif
      page_cache_entry_t pages[MAX_PAGES];
+    int page_cache_count;
      Py_ssize_t page_size;
  } proc_handle_t;
  
@@ -185,14 +186,16 @@ _Py_RemoteDebug_FreePageCache(proc_handle_t *handle)
          handle->pages[i].data = NULL;
          handle->pages[i].valid = 0;
      }
+    handle->page_cache_count = 0;
  }
  
  UNUSED static void
  _Py_RemoteDebug_ClearCache(proc_handle_t *handle)
  {
-    for (int i = 0; i < MAX_PAGES; i++) {
+    for (int i = 0; i < handle->page_cache_count; i++) {
          handle->pages[i].valid = 0;
      }
+    handle->page_cache_count = 0;
  }
  
  #if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
@@ -222,6 +225,7 @@ _Py_RemoteDebug_InitProcHandle(proc_handle_t *handle, pid_t pid) {
      handle->memfd = -1;
  #endif
      handle->page_size = get_page_size();
+    handle->page_cache_count = 0;
      for (int i = 0; i < MAX_PAGES; i++) {
          handle->pages[i].data = NULL;
          handle->pages[i].valid = 0;
@@ -1287,8 +1291,9 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
          return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
      }
  
-    // Search for valid cached page
-    for (int i = 0; i < MAX_PAGES; i++) {
+    // Search only the pages used since the last clear. The cache is cleared
+    // between profiler samples, so entries are packed at the front.
+    for (int i = 0; i < handle->page_cache_count; i++) {
          page_cache_entry_t *entry = &handle->pages[i];
          if (entry->valid && entry->page_addr == page_base) {
              memcpy(out, entry->data + offset_in_page, size);
@@ -1296,33 +1301,31 @@ _Py_RemoteDebug_PagedReadRemoteMemory(proc_handle_t *handle,
          }
      }
  
-    // Find reusable slot
-    for (int i = 0; i < MAX_PAGES; i++) {
-        page_cache_entry_t *entry = &handle->pages[i];
-        if (!entry->valid) {
+    if (handle->page_cache_count < MAX_PAGES) {
+        page_cache_entry_t *entry = &handle->pages[handle->page_cache_count];
+        if (entry->data == NULL) {
+            entry->data = PyMem_RawMalloc(page_size);
              if (entry->data == NULL) {
-                entry->data = PyMem_RawMalloc(page_size);
-                if (entry->data == NULL) {
-                    PyErr_NoMemory();
-                    _set_debug_exception_cause(PyExc_MemoryError,
-                        "Cannot allocate %zu bytes for page cache entry "
-                        "during read from PID %d at address 0x%lx",
-                        page_size, handle->pid, addr);
-                    return -1;
-                }
-            }
-
-            if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
-                // Try to just copy the exact amount as a fallback
-                PyErr_Clear();
-                goto fallback;
+                PyErr_NoMemory();
+                _set_debug_exception_cause(PyExc_MemoryError,
+                    "Cannot allocate %zu bytes for page cache entry "
+                    "during read from PID %d at address 0x%lx",
+                    page_size, handle->pid, addr);
+                return -1;
              }
+        }
  
-            entry->page_addr = page_base;
-            entry->valid = 1;
-            memcpy(out, entry->data + offset_in_page, size);
-            return 0;
+        if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
+            // Try to just copy the exact amount as a fallback
+            PyErr_Clear();
+            goto fallback;
          }
+
+        entry->page_addr = page_base;
+        entry->valid = 1;
+        handle->page_cache_count++;
+        memcpy(out, entry->data + offset_in_page, size);
+        return 0;
      }
  
  fallback:
@@ -1330,6 +1333,49 @@ fallback:
      return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
  }
  
+typedef struct {  // One region to transfer in a batched remote read.
+    uintptr_t remote_addr;  // Source address in the target process.
+    void *local_buf;  // Destination buffer in this process.
+    size_t size;  // Bytes to transfer; used as the length of both the local and remote iovec.
+} _Py_RemoteReadSegment;
+
+#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4
+
+// Batched read of multiple remote regions in a single syscall when supported.
+// Returns total bytes read (>= 0) on success, -1 if batched reads are
+// unavailable or the syscall failed. Callers compare the return value against
+// cumulative segment sizes to determine which segments were fully populated.
+UNUSED static Py_ssize_t  // Total bytes read, or -1; this function itself sets no Python exception.
+_Py_RemoteDebug_BatchedReadRemoteMemory(
+    proc_handle_t *handle,
+    const _Py_RemoteReadSegment *segments,  // Read in array order, one iovec pair per segment.
+    int nsegs)  // Must be in 1.._PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS, otherwise -1 is returned.
+{
+#if defined(__linux__) && HAVE_PROCESS_VM_READV
+    if (handle->memfd == -1  // NOTE(review): presumably means the memfd-based read path is not active; confirm.
+        && nsegs > 0
+        && nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) {
+        struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];  // Stack arrays sized for the maximum checked above.
+        struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
+        for (int i = 0; i < nsegs; i++) {  // Pair each local buffer with its remote range, same length on both sides.
+            local[i].iov_base = segments[i].local_buf;
+            local[i].iov_len = segments[i].size;
+            remote[i].iov_base = (void *)segments[i].remote_addr;
+            remote[i].iov_len = segments[i].size;
+        }
+        ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0);  // One syscall for all segments.
+        if (nread >= 0) {  // May be less than the requested total; callers compare against cumulative sizes.
+            return (Py_ssize_t)nread;
+        }
+    }
+#else
+    (void)handle;  // Batching is compiled out here; the casts mark the parameters as used.
+    (void)segments;
+    (void)nsegs;
+#endif
+    return -1;  // Unsupported platform, rejected arguments, or syscall failure.
+}
+
  UNUSED static int
  _Py_RemoteDebug_ReadDebugOffsets(
      proc_handle_t *handle,
diff --git a/Tools/inspection/benchmark_external_inspection.py b/Tools/inspection/benchmark_external_inspection.py

index fee3435496da0bde0eb5f710511c466e8fd914a5..8e367422a961da26facf9508ed45dd593a107d88 100644 (file)
--- a/Tools/inspection/benchmark_external_inspection.py
+++ b/Tools/inspection/benchmark_external_inspection.py
@@ -151,6 +151,45 @@ while True:
      time.sleep(0.05)
  '''
  
+ASYNC_CODE = '''\
+import asyncio
+import contextlib
+import math
+
+def compute_slice(seed):
+    result = 0.0
+    for i in range(2000):
+        result += math.sin(seed + i) * math.sqrt(i + 1)
+    return result
+
+async def leaf_task(seed):
+    total = 0.0
+    while True:
+        total += compute_slice(seed)
+        await asyncio.sleep(0)
+
+async def parent_task(seed):
+    child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}")
+    try:
+        while True:
+            compute_slice(seed)
+            await asyncio.sleep(0.001)
+    finally:
+        child.cancel()
+        with contextlib.suppress(asyncio.CancelledError):
+            await child
+
+async def main():
+    tasks = [
+        asyncio.create_task(parent_task(i), name=f"parent-{i}")
+        for i in range(8)
+    ]
+    await asyncio.gather(*tasks)
+
+if __name__ == "__main__":
+    asyncio.run(main())
+'''
+
  CODE_EXAMPLES = {
      "basic": {
          "code": CODE,
@@ -164,10 +203,29 @@ CODE_EXAMPLES = {
          "code": CODE_WITH_TONS_OF_THREADS,
          "description": "Tons of threads doing mixed CPU/IO work",
      },
+    "asyncio": {
+        "code": ASYNC_CODE,
+        "description": "Asyncio tasks with active and awaited coroutine chains",
+    },
+}
+
+OPERATIONS = {  # Keys are the --operation choices; each maps to an unwinder method and a display label.
+    "stack_trace": {
+        "method": "get_stack_trace",  # attribute name looked up on the unwinder via getattr()
+        "label": "get_stack_trace()",  # text shown in the benchmark banner and results header
+    },
+    "async_stack_trace": {
+        "method": "get_async_stack_trace",
+        "label": "get_async_stack_trace()",
+    },
+    "all_awaited_by": {
+        "method": "get_all_awaited_by",
+        "label": "get_all_awaited_by()",
+    },
  }
  
  
-def benchmark(unwinder, duration_seconds=10, blocking=False):
+def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"):
      """Benchmark mode - measure raw sampling speed for specified duration"""
      sample_count = 0
      fail_count = 0
@@ -175,11 +233,14 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
      start_time = time.perf_counter()
      end_time = start_time + duration_seconds
      total_attempts = 0
+    operation_info = OPERATIONS[operation]
+    operation_method = getattr(unwinder, operation_info["method"])
  
      colors = get_colors(can_colorize())
  
      print(
-        f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}"
+        f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed "
+        f"for {duration_seconds} seconds...{colors.RESET}"
      )
  
      try:
@@ -190,8 +251,8 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
                  if blocking:
                      unwinder.pause_threads()
                  try:
-                    stack_trace = unwinder.get_stack_trace()
-                    if stack_trace:
+                    sample = operation_method()
+                    if sample:
                          sample_count += 1
                  finally:
                      if blocking:
@@ -239,6 +300,7 @@ def benchmark(unwinder, duration_seconds=10, blocking=False):
              (sample_count / total_attempts) * 100 if total_attempts > 0 else 0
          ),
          "total_work_time": total_work_time,
+        "operation": operation_info["label"],
          "avg_work_time_us": (
              (total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0
          ),
@@ -252,7 +314,7 @@ def print_benchmark_results(results):
      colors = get_colors(can_colorize())
  
      print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
-    print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}")
+    print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}")
      print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
  
      # Basic statistics
@@ -329,6 +391,8 @@ Examples:
    %(prog)s -d 60                     # Run basic benchmark for 60 seconds
    %(prog)s --code deep_static        # Run deep static call stack benchmark
    %(prog)s --code deep_static -d 30  # Run deep static benchmark for 30 seconds
+  %(prog)s --operation async_stack_trace
+  %(prog)s --operation all_awaited_by
  
  Available code examples:
  {examples_desc}
@@ -348,8 +412,15 @@ Available code examples:
          "--code",
          "-c",
          choices=list(CODE_EXAMPLES.keys()),
-        default="basic",
-        help="Code example to benchmark (default: basic)",
+        default=None,
+        help="Code example to benchmark (default: basic, or asyncio for async operations)",
+    )
+
+    parser.add_argument(
+        "--operation",
+        choices=list(OPERATIONS.keys()),
+        default="stack_trace",
+        help="Remote unwinder operation to benchmark (default: stack_trace)",
      )
  
      parser.add_argument(
@@ -365,7 +436,10 @@ Available code examples:
          help="Stop all threads before sampling for consistent snapshots",
      )
  
-    return parser.parse_args()
+    args = parser.parse_args()
+    if args.code is None:
+        args.code = "asyncio" if args.operation != "stack_trace" else "basic"
+    return args
  
  
  def create_target_process(temp_file, code_example="basic"):
@@ -420,6 +494,9 @@ def main():
      print(
          f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds"
      )
+    print(
+        f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}"
+    )
      print(
          f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}"
      )
@@ -451,7 +528,12 @@ def main():
                      unwinder = _remote_debugging.RemoteUnwinder(
                          process.pid, cache_frames=True, **kwargs
                      )
-                    results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking)
+                    results = benchmark(
+                        unwinder,
+                        duration_seconds=args.duration,
+                        blocking=args.blocking,
+                        operation=args.operation,
+                    )
                  finally:
                      cleanup_process(process, temp_file_path)
author	Pablo Galindo Salgado <Pablogsal@gmail.com>
	Wed, 20 May 2026 11:32:08 +0000 (04:32 -0700)
committer	GitHub <noreply@github.com>
	Wed, 20 May 2026 11:32:08 +0000 (04:32 -0700)
Lib/profiling/sampling/sample.py		patch \| blob \| blame \| history
Lib/test/test_external_inspection.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2026-05-10-19-26-50.gh-issue-149584.x7Qm9A.rst	[new file with mode: 0644]	patch \| blob
Modules/_remote_debugging/_remote_debugging.h		patch \| blob \| blame \| history
Modules/_remote_debugging/clinic/module.c.h		patch \| blob \| blame \| history
Modules/_remote_debugging/code_objects.c		patch \| blob \| blame \| history
Modules/_remote_debugging/frame_cache.c		patch \| blob \| blame \| history
Modules/_remote_debugging/frames.c		patch \| blob \| blame \| history
Modules/_remote_debugging/module.c		patch \| blob \| blame \| history
Modules/_remote_debugging/threads.c		patch \| blob \| blame \| history
Python/remote_debug.h		patch \| blob \| blame \| history
Tools/inspection/benchmark_external_inspection.py		patch \| blob \| blame \| history