print(f" Hits: {code_hits:n} ({ANSIColors.GREEN}{fmt(code_hits_pct)}%{ANSIColors.RESET})")
print(f" Misses: {code_misses:n} ({ANSIColors.RED}{fmt(code_misses_pct)}%{ANSIColors.RESET})")
+ batched_attempts = stats.get('batched_read_attempts', 0)
+ batched_successes = stats.get('batched_read_successes', 0)
+ batched_misses = stats.get('batched_read_misses', 0)
+ segments_requested = stats.get('batched_read_segments_requested', 0)
+ segments_completed = stats.get('batched_read_segments_completed', 0)
+ if batched_attempts > 0:
+ batched_success_rate = stats.get('batched_read_success_rate', 0.0)
+ batched_miss_rate = 100.0 - batched_success_rate
+ segment_completion_rate = stats.get(
+ 'batched_read_segment_completion_rate', 0.0
+ )
+
+ print(f" {ANSIColors.CYAN}Batched Reads:{ANSIColors.RESET}")
+ print(f" Attempts: {batched_attempts:n}")
+ print(
+ f" Successes: {batched_successes:n} "
+ f"({ANSIColors.GREEN}{fmt(batched_success_rate)}%{ANSIColors.RESET})"
+ )
+ print(
+ f" Misses: {batched_misses:n} "
+ f"({ANSIColors.RED}{fmt(batched_miss_rate)}%{ANSIColors.RESET})"
+ )
+ print(
+ f" Segments read: {segments_completed:n}/{segments_requested:n} "
+ f"({ANSIColors.GREEN}{fmt(segment_completion_rate)}%{ANSIColors.RESET})"
+ )
+
# Memory operations
memory_reads = stats.get('memory_reads', 0)
memory_bytes = stats.get('memory_bytes_read', 0)
"frames_read_from_cache",
"frames_read_from_memory",
"frame_cache_hit_rate",
+ "batched_read_attempts",
+ "batched_read_successes",
+ "batched_read_misses",
+ "batched_read_segments_requested",
+ "batched_read_segments_completed",
+ "batched_read_success_rate",
+ "batched_read_segment_completion_rate",
]
for key in expected_keys:
self.assertIn(key, stats)
--- /dev/null
+Fix excessive overhead in the Tachyon profiler when inspecting a remote
+process by avoiding repeated remote page-cache scans, batching predicted
+remote reads, and reusing cached profiler result objects. Patch by Pablo
+Galindo and Maurycy Pawłowski-Wieroński.
#include "internal/pycore_llist.h" // struct llist_node
#include "internal/pycore_long.h" // _PyLong_GetZero
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause
+#include "internal/pycore_pyhash.h" // _Py_HashPointerRaw
#include "internal/pycore_stackref.h" // Py_TAG_BITS
#include "../../Python/remote_debug.h"
PyObject *file_name;
int first_lineno;
PyObject *linetable; // bytes
+ PyObject *last_frame_info;
+ ptrdiff_t last_addrq;
uintptr_t addr_code_adaptive;
} CachedCodeMetadata;
typedef struct {
uint64_t thread_id; // 0 = empty slot
+ uintptr_t thread_state_addr;
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs;
+ PyObject *thread_id_obj; // owned reference, NULL if empty
PyObject *frame_list; // owned reference, NULL if empty
} FrameCacheEntry;
+#define INTERPRETER_THREAD_CACHE_SIZE 32  /* number of slots in each direct-mapped per-interpreter table */
+#if (INTERPRETER_THREAD_CACHE_SIZE & (INTERPRETER_THREAD_CACHE_SIZE - 1)) != 0  /* slot index is computed by masking with SIZE - 1 */
+# error "INTERPRETER_THREAD_CACHE_SIZE must be a power of two"
+#endif
+
+// The two per-interpreter L2 caches below are split into per-field tables so
+// that a writer rebinding one slot cannot leave stale data in a field owned by
+// the other when the slot is reused across interpreters.
+typedef struct {
+ uintptr_t interpreter_addr;  // Remote interpreter this slot is bound to; 0 after the init-time memset
+ uintptr_t thread_state_addr;  // Thread state address last recorded for that interpreter
+} InterpreterTstateCacheEntry;
+typedef struct {
+ uintptr_t interpreter_addr;  // Remote interpreter this slot is bound to; 0 after the init-time memset
+ uint64_t code_object_generation;  // code_object_generation last read from that interpreter's state
+} InterpreterGenerationCacheEntry;
+
+// Carries already-read thread state and/or frame buffers across helpers so the
+// downstream callee can skip a remote read. Address fields are caller-supplied
+// inputs; buffer pointers (tstate, frame) are NULL unless a prior batched read
+// successfully populated them.
+typedef struct {
+ const char *tstate;  // Local copy of the thread state at tstate_addr, or NULL if it was not prefetched
+ uintptr_t tstate_addr;  // Input: remote thread state address to prefetch (0 = do not attempt a batched read)
+ const char *frame;  // Local copy of the frame at frame_addr, or NULL if it was not prefetched
+ uintptr_t frame_addr;  // Input: remote frame address to prefetch alongside the thread state (0 = none)
+} RemoteReadPrefetch;
+
/* Statistics for profiling performance analysis */
typedef struct {
uint64_t total_samples; // Total number of get_stack_trace calls
uint64_t code_object_cache_hits; // Code object cache hits
uint64_t code_object_cache_misses; // Code object cache misses
uint64_t stale_cache_invalidations; // Times stale entries were cleared
+ uint64_t batched_read_attempts; // Batched remote-read attempts
+ uint64_t batched_read_successes; // Attempts that read all requested segments
+ uint64_t batched_read_misses; // Attempts that fell back or partially read
+ uint64_t batched_read_segments_requested; // Segments requested by batched reads
+ uint64_t batched_read_segments_completed; // Segments completed by batched reads
} UnwinderStats;
+#if defined(__GNUC__) || defined(__clang__)  /* branch hint is only emitted for GCC-compatible compilers */
+# define REMOTE_DEBUG_UNLIKELY(value) __builtin_expect(!!(value), 0)  /* normalise to 0/1 and mark the true case as unexpected */
+#else
+# define REMOTE_DEBUG_UNLIKELY(value) (value)  /* fallback: plain condition, no hint */
+#endif
+
/* Stats tracking macros - no-op when stats collection is disabled */
#define STATS_INC(unwinder, field) \
- do { if ((unwinder)->collect_stats) (unwinder)->stats.field++; } while(0)
+ do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field++; } while(0)
#define STATS_ADD(unwinder, field, val) \
- do { if ((unwinder)->collect_stats) (unwinder)->stats.field += (val); } while(0)
+ do { if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) (unwinder)->stats.field += (val); } while(0)
+/* Record the outcome of one batched remote read. When stats collection is
+ * enabled it counts one attempt, adds the requested and completed segment
+ * counts, and counts a success when both are equal or a miss otherwise.
+ * The requested and completed arguments are each expanded more than once, so
+ * pass side-effect-free expressions. Expands to a no-op when
+ * HAVE_PROCESS_VM_READV is false.
+ */
+#if HAVE_PROCESS_VM_READV
+# define STATS_BATCHED_READ(unwinder, requested, completed) \
+ do { \
+ if (REMOTE_DEBUG_UNLIKELY((unwinder)->collect_stats)) { \
+ (unwinder)->stats.batched_read_attempts++; \
+ (unwinder)->stats.batched_read_segments_requested += (uint64_t)(requested); \
+ (unwinder)->stats.batched_read_segments_completed += (uint64_t)(completed); \
+ if ((completed) == (requested)) { \
+ (unwinder)->stats.batched_read_successes++; \
+ } \
+ else { \
+ (unwinder)->stats.batched_read_misses++; \
+ } \
+ } \
+ } while(0)
+#else
+# define STATS_BATCHED_READ(unwinder, requested, completed) ((void)0)
+#endif
typedef struct {
PyTypeObject *RemoteDebugging_Type;
struct _Py_AsyncioModuleDebugOffsets async_debug_offsets;
uintptr_t interpreter_addr;
uintptr_t tstate_addr;
- uint64_t code_object_generation;
_Py_hashtable_t *code_object_cache;
int debug;
int only_active_thread;
int cache_frames;
int collect_stats; // whether to collect statistics
uint32_t stale_invalidation_counter; // counter for throttling frame_cache_invalidate_stale
+ // L1 single-entry shortcuts over cached_tstates[] and cached_generations[]:
+ // most workloads sample one interpreter, so check these pairs before hashing
+ // into the tables below.
+ uintptr_t cached_tstate_interpreter_addr;
+ uintptr_t cached_tstate_addr;
+ uintptr_t cached_generation_interpreter_addr;
+ uint64_t cached_code_object_generation;
RemoteDebuggingState *cached_state;
FrameCacheEntry *frame_cache; // preallocated array of FRAME_CACHE_MAX_THREADS entries
UnwinderStats stats; // statistics for performance analysis
+ InterpreterTstateCacheEntry cached_tstates[INTERPRETER_THREAD_CACHE_SIZE];
+ InterpreterGenerationCacheEntry cached_generations[INTERPRETER_THREAD_CACHE_SIZE];
#ifdef Py_GIL_DISABLED
uint32_t tlbc_generation;
_Py_hashtable_t *tlbc_cache;
typedef struct {
/* Inputs */
uintptr_t frame_addr; // Starting frame address
+ uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
+ RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
/* Outputs */
PyObject *frame_info; // List to append FrameInfo objects
extern int frame_cache_init(RemoteUnwinderObject *unwinder);
extern void frame_cache_cleanup(RemoteUnwinderObject *unwinder);
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
+extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
+ uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);
uintptr_t *current_tstate,
uintptr_t gil_holder_tstate,
uintptr_t gc_frame,
- uintptr_t main_thread_tstate
+ uintptr_t main_thread_tstate,
+ const RemoteReadPrefetch *prefetch
);
/* Thread stopping functions (for blocking mode) */
" - code_object_cache_hits: Code object cache hits\n"
" - code_object_cache_misses: Code object cache misses\n"
" - stale_cache_invalidations: Times stale cache entries were cleared\n"
+" - batched_read_attempts: Batched remote-read attempts\n"
+" - batched_read_successes: Attempts that read all requested segments\n"
+" - batched_read_misses: Attempts that fell back or partially read\n"
+" - batched_read_segments_requested: Segments requested by batched reads\n"
+" - batched_read_segments_completed: Segments completed by batched reads\n"
" - frame_cache_hit_rate: Percentage of samples that hit the cache\n"
" - code_object_cache_hit_rate: Percentage of code object lookups that hit cache\n"
+" - batched_read_success_rate: Percentage of batched reads that completed all segments\n"
+" - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads\n"
"\n"
"Raises:\n"
" RuntimeError: If stats collection was not enabled (stats=False)");
exit:
return return_value;
}
-/*[clinic end generated code: output=5e2a29746a0c5d65 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=884914b100e9c90c input=a9049054013a1b77]*/
meta->func_name = func;
meta->file_name = file;
meta->linetable = linetable;
+ meta->last_frame_info = NULL;
+ meta->last_addrq = -1;
meta->first_lineno = GET_MEMBER(int, code_object, unwinder->debug_offsets.code_object.firstlineno);
meta->addr_code_adaptive = real_address + (uintptr_t)unwinder->debug_offsets.code_object.co_code_adaptive;
addrq = (uint16_t *)ip - (uint16_t *)meta->addr_code_adaptive;
#endif
; // Empty statement to avoid C23 extension warning
+
+ if (!unwinder->opcodes && meta->last_frame_info != NULL && meta->last_addrq == addrq) {
+ *result = Py_NewRef(meta->last_frame_info);
+ return 0;
+ }
+
LocationInfo info = {0};
bool ok = parse_linetable(addrq, PyBytes_AS_STRING(meta->linetable),
PyBytes_GET_SIZE(meta->linetable),
goto error;
}
+ if (!unwinder->opcodes) {
+ Py_XSETREF(meta->last_frame_info, Py_NewRef(tuple));
+ meta->last_addrq = addrq;
+ }
+
*result = tuple;
return 0;
return;
}
for (int i = 0; i < FRAME_CACHE_MAX_THREADS; i++) {
+ Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
}
PyMem_Free(unwinder->frame_cache);
return NULL;
}
+// Locate the frame cache slot recorded for a remote thread state address.
+// Returns a pointer into unwinder->frame_cache, or NULL when the cache is not
+// allocated, tstate_addr is 0, or no slot carries that address.
+FrameCacheEntry *
+frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr)
+{
+    FrameCacheEntry *slots = unwinder->frame_cache;
+    if (slots == NULL || tstate_addr == 0) {
+        return NULL;
+    }
+
+    FrameCacheEntry *const end = slots + FRAME_CACHE_MAX_THREADS;
+    for (FrameCacheEntry *slot = slots; slot != end; slot++) {
+        if (slot->thread_state_addr != tstate_addr) {
+            continue;
+        }
+        assert(slot->num_addrs <= FRAME_CACHE_MAX_FRAMES);
+        return slot;
+    }
+    return NULL;
+}
+
// Allocate a cache slot for a thread
// Returns NULL if cache is full (graceful degradation)
static FrameCacheEntry *
}
if (!found) {
// Clear this entry
+ Py_CLEAR(unwinder->frame_cache[i].thread_id_obj);
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
+ unwinder->frame_cache[i].thread_state_addr = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
PyObject *frame_list,
const uintptr_t *addrs,
Py_ssize_t num_addrs,
+ uintptr_t thread_state_addr,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
return -1;
}
entry->thread_id = thread_id;
+ entry->thread_state_addr = thread_state_addr;
+ if (entry->thread_id_obj == NULL) {
+ entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
+ if (entry->thread_id_obj == NULL) {
+ return -1;
+ }
+ }
memcpy(entry->addrs, addrs, num_addrs * sizeof(uintptr_t));
entry->num_addrs = num_addrs;
assert(entry->num_addrs == num_addrs);
return 1;
}
-int
-parse_frame_object(
+static int
+parse_frame_buffer(
RemoteUnwinderObject *unwinder,
PyObject** result,
- uintptr_t address,
+ const char *frame,
uintptr_t* address_of_code_object,
uintptr_t* previous_frame
) {
- char frame[SIZEOF_INTERP_FRAME];
*address_of_code_object = 0;
- Py_ssize_t bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
- &unwinder->handle,
- address,
- SIZEOF_INTERP_FRAME,
- frame
- );
- if (bytes_read < 0) {
- set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
- return -1;
- }
- STATS_INC(unwinder, memory_reads);
- STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
-
*previous_frame = GET_MEMBER(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.previous);
uintptr_t code_object = GET_MEMBER_NO_TAG(uintptr_t, frame, unwinder->debug_offsets.interpreter_frame.executable);
int frame_valid = is_frame_valid(unwinder, (uintptr_t)frame, code_object);
return parse_code_object(unwinder, result, &code_ctx);
}
+// Read one interpreter frame (SIZEOF_INTERP_FRAME bytes) from the remote
+// process at `address` and hand the local copy to parse_frame_buffer(), which
+// fills *result, *address_of_code_object and *previous_frame.
+// Returns -1 with a RuntimeError cause set if the remote read fails;
+// otherwise returns whatever parse_frame_buffer() returns.
+int
+parse_frame_object(
+    RemoteUnwinderObject *unwinder,
+    PyObject** result,
+    uintptr_t address,
+    uintptr_t* address_of_code_object,
+    uintptr_t* previous_frame
+) {
+    char frame_buf[SIZEOF_INTERP_FRAME];
+
+    if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle, address,
+                                         SIZEOF_INTERP_FRAME, frame_buf) < 0) {
+        set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read interpreter frame");
+        return -1;
+    }
+
+    // Only successful reads are counted.
+    STATS_INC(unwinder, memory_reads);
+    STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
+
+    return parse_frame_buffer(unwinder, result, frame_buf,
+                              address_of_code_object, previous_frame);
+}
+
int
parse_frame_from_chunks(
RemoteUnwinderObject *unwinder,
}
assert(frame_count <= MAX_FRAMES);
- if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) < 0) {
+ if (ctx->chunks && ctx->chunks->count > 0) {
+ if (parse_frame_from_chunks(unwinder, &frame, frame_addr, &next_frame_addr, &stackpointer, ctx->chunks) == 0) {
+ goto parsed_frame;
+ }
PyErr_Clear();
+ }
+ {
uintptr_t address_of_code_object = 0;
- if (parse_frame_object(unwinder, &frame, frame_addr, &address_of_code_object, &next_frame_addr) < 0) {
+ int parse_result;
+ if (ctx->prefetch.frame && ctx->prefetch.frame_addr == frame_addr) {
+ parse_result = parse_frame_buffer(
+ unwinder, &frame, ctx->prefetch.frame,
+ &address_of_code_object, &next_frame_addr);
+ }
+ else {
+ parse_result = parse_frame_object(
+ unwinder, &frame, frame_addr,
+ &address_of_code_object, &next_frame_addr);
+ }
+ if (parse_result < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to parse frame object in chain");
return -1;
}
}
+parsed_frame:
// Skip first frame if requested (used for cache miss continuation)
if (ctx->skip_first_frame && frame_count == 1) {
Py_XDECREF(frame);
PyObject *current_frame = NULL;
uintptr_t code_object_addr = 0;
uintptr_t previous_frame = 0;
- int parse_result = parse_frame_object(unwinder, ¤t_frame, ctx->frame_addr,
+ int parse_result;
+ if (ctx->prefetch.frame && ctx->prefetch.frame_addr == ctx->frame_addr) {
+ parse_result = parse_frame_buffer(unwinder, ¤t_frame,
+ ctx->prefetch.frame,
&code_object_addr, &previous_frame);
+ }
+ else {
+ parse_result = parse_frame_object(unwinder, ¤t_frame, ctx->frame_addr,
+ &code_object_addr, &previous_frame);
+ }
if (parse_result < 0) {
return -1;
}
- Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
- PyObject *parent_slice = NULL;
- if (cached_size > 1) {
- parent_slice = PyList_GetSlice(entry->frame_list, 1, cached_size);
- if (!parent_slice) {
- Py_XDECREF(current_frame);
- return -1;
- }
- }
-
if (current_frame != NULL) {
if (PyList_Append(ctx->frame_info, current_frame) < 0) {
Py_DECREF(current_frame);
- Py_XDECREF(parent_slice);
return -1;
}
Py_DECREF(current_frame);
STATS_ADD(unwinder, frames_read_from_memory, 1);
}
- if (parent_slice) {
- Py_ssize_t cur_size = PyList_GET_SIZE(ctx->frame_info);
- int result = PyList_SetSlice(ctx->frame_info, cur_size, cur_size, parent_slice);
- Py_DECREF(parent_slice);
- if (result < 0) {
+ Py_ssize_t cached_size = PyList_GET_SIZE(entry->frame_list);
+ for (Py_ssize_t i = 1; i < cached_size; i++) {
+ PyObject *cached_frame = PyList_GET_ITEM(entry->frame_list, i);
+ if (PyList_Append(ctx->frame_info, cached_frame) < 0) {
return -1;
}
- STATS_ADD(unwinder, frames_read_from_cache, cached_size - 1);
}
+ STATS_ADD(unwinder, frames_read_from_cache, cached_size > 1 ? cached_size - 1 : 0);
STATS_INC(unwinder, frame_cache_hits);
return 1;
}
if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
- ctx->base_frame_addr, ctx->last_frame_visited) < 0) {
+ ctx->thread_state_addr, ctx->base_frame_addr,
+ ctx->last_frame_visited) < 0) {
return -1;
}
Py_DECREF(meta->func_name);
Py_DECREF(meta->file_name);
Py_DECREF(meta->linetable);
+ Py_XDECREF(meta->last_frame_info);
PyMem_RawFree(meta);
}
self->cache_frames = cache_frames;
self->collect_stats = stats;
self->stale_invalidation_counter = 0;
+ self->cached_tstate_interpreter_addr = 0;
+ self->cached_tstate_addr = 0;
+ memset(self->cached_tstates, 0, sizeof(self->cached_tstates));
+ memset(self->cached_generations, 0, sizeof(self->cached_generations));
self->debug = debug;
self->only_active_thread = only_active_thread;
self->mode = mode;
return 0;
}
+// Map a remote interpreter address to its slot in the direct-mapped
+// per-interpreter tables. Entries keep the full address and compare it on
+// lookup, so two addresses that land on the same slot produce a miss rather
+// than a value belonging to the wrong interpreter.
+static inline size_t
+interpreter_thread_cache_index(uintptr_t interpreter_addr)
+{
+    const size_t slot_mask = INTERPRETER_THREAD_CACHE_SIZE - 1;
+    size_t hashed = (size_t)_Py_HashPointerRaw((const void *)interpreter_addr);
+    return hashed & slot_mask;
+}
+
+// Return the thread state address previously recorded for interpreter_addr,
+// or 0 when nothing is recorded (or interpreter_addr itself is 0).
+// The single-entry shortcut on `self` is consulted first; on a table hit the
+// shortcut is rebound to this interpreter so the next lookup skips hashing.
+static inline uintptr_t
+get_cached_tstate_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr)
+{
+    if (interpreter_addr == 0) {
+        return 0;
+    }
+
+    // Shortcut: same interpreter as the last lookup or store.
+    if (interpreter_addr == self->cached_tstate_interpreter_addr) {
+        return self->cached_tstate_addr;
+    }
+
+    size_t slot = interpreter_thread_cache_index(interpreter_addr);
+    const InterpreterTstateCacheEntry *hit = &self->cached_tstates[slot];
+    if (hit->interpreter_addr != interpreter_addr) {
+        // Empty slot or one bound to a different interpreter.
+        return 0;
+    }
+
+    uintptr_t tstate_addr = hit->thread_state_addr;
+    self->cached_tstate_interpreter_addr = interpreter_addr;
+    self->cached_tstate_addr = tstate_addr;
+    return tstate_addr;
+}
+
+// Record thread_state_addr as the thread state to prefetch for
+// interpreter_addr, updating both the single-entry shortcut and the
+// direct-mapped table slot. A call with 0 for either address is ignored, so
+// neither cache level ever stores an empty binding.
+static inline void
+set_cached_tstate_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr,
+    uintptr_t thread_state_addr)
+{
+    if (interpreter_addr != 0 && thread_state_addr != 0) {
+        size_t slot = interpreter_thread_cache_index(interpreter_addr);
+        self->cached_tstates[slot].interpreter_addr = interpreter_addr;
+        self->cached_tstates[slot].thread_state_addr = thread_state_addr;
+
+        self->cached_tstate_interpreter_addr = interpreter_addr;
+        self->cached_tstate_addr = thread_state_addr;
+    }
+}
+
+// Reconcile the generation counters found in an already-fetched interpreter
+// state buffer with the values last observed for that interpreter, clearing
+// the caches that depend on them when they differ.
+//
+//   self                 unwinder whose caches are refreshed
+//   interpreter_addr     remote address the buffer was read from
+//   interp_state_buffer  local copy of the remote interpreter state; must
+//                        cover the code_object_generation offset (and the
+//                        tlbc_generation offset on free-threaded builds)
+static void
+refresh_generation_caches_from_interp_state(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr,
+    const char *interp_state_buffer)
+{
+    uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
+        self->debug_offsets.interpreter_state.code_object_generation);
+
+    if (self->cached_generation_interpreter_addr == interpreter_addr) {
+        // Shortcut hit: the table is only touched when the generation moved.
+        if (code_object_generation != self->cached_code_object_generation) {
+            self->cached_code_object_generation = code_object_generation;
+            // Write the new generation through to this interpreter's table
+            // slot. Otherwise the slot keeps the old value and, once the
+            // shortcut has been rebound to another interpreter, the next
+            // visit here would see a mismatch and clear the cache again for
+            // a change that was already handled. A slot currently owned by
+            // a different interpreter is left alone.
+            InterpreterGenerationCacheEntry *entry =
+                &self->cached_generations[interpreter_thread_cache_index(interpreter_addr)];
+            if (entry->interpreter_addr == interpreter_addr) {
+                entry->code_object_generation = code_object_generation;
+            }
+            _Py_hashtable_clear(self->code_object_cache);
+        }
+    }
+    else {
+        InterpreterGenerationCacheEntry *entry =
+            &self->cached_generations[interpreter_thread_cache_index(interpreter_addr)];
+        // A slot rebound from another interpreter must be treated as changed:
+        // the code_object_cache is global, so even if the new generation
+        // numerically matches what the previous occupant had, stale entries
+        // from that occupant could still be served.
+        int changed = entry->interpreter_addr != interpreter_addr
+            || entry->code_object_generation != code_object_generation;
+        entry->interpreter_addr = interpreter_addr;
+        entry->code_object_generation = code_object_generation;
+        if (changed) {
+            _Py_hashtable_clear(self->code_object_cache);
+        }
+        // Rebind the shortcut so repeated samples of this interpreter skip
+        // the table lookup.
+        self->cached_generation_interpreter_addr = interpreter_addr;
+        self->cached_code_object_generation = code_object_generation;
+    }
+
+#ifdef Py_GIL_DISABLED
+    // The TLBC generation is tracked as a single value on the unwinder.
+    uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
+        self->debug_offsets.interpreter_state.tlbc_generation);
+    if (current_tlbc_generation != self->tlbc_generation) {
+        self->tlbc_generation = current_tlbc_generation;
+        _Py_hashtable_clear(self->tlbc_cache);
+    }
+#endif
+}
+
+// Fetch the interpreter state at interpreter_addr from the remote process and
+// refresh the generation-dependent caches from it.
+// Returns 0 on success, or -1 with a RuntimeError cause set when the remote
+// read fails (the caches are left untouched in that case).
+static int
+refresh_generation_caches_for_interpreter(
+    RemoteUnwinderObject *self,
+    uintptr_t interpreter_addr)
+{
+    char interp_state[INTERP_STATE_BUFFER_SIZE];
+
+    if (_Py_RemoteDebug_ReadRemoteMemory(&self->handle, interpreter_addr,
+                                         INTERP_STATE_BUFFER_SIZE,
+                                         interp_state) >= 0) {
+        refresh_generation_caches_from_interp_state(self, interpreter_addr, interp_state);
+        return 0;
+    }
+
+    set_exception_cause(self, PyExc_RuntimeError,
+                        "Failed to read interpreter state buffer");
+    return -1;
+}
+/* Read the interpreter state and, when prefetch->tstate_addr is non-zero,
+ * try to fetch the predicted thread state (and frame, if prefetch->frame_addr
+ * is non-zero) in the same batched remote read.
+ *
+ * interp_state_buffer receives INTERP_STATE_BUFFER_SIZE bytes, tstate_buffer
+ * receives debug_offsets.thread_state.size bytes and frame_buffer receives
+ * SIZEOF_INTERP_FRAME bytes; the caller owns all three buffers.
+ *
+ * On return prefetch->tstate and prefetch->frame point at the corresponding
+ * buffer only if that segment was read, and are NULL otherwise. Returns 0
+ * once the interpreter state has been read by the batched call; otherwise
+ * returns the result of a plain read of the interpreter state alone.
+ * NOTE(review): the segment accounting assumes the batched read reports the
+ * byte total of a whole-segment prefix; confirm against
+ * _Py_RemoteDebug_BatchedReadRemoteMemory.
+ */
+static int
+read_interp_state_and_maybe_thread_frame(
+ RemoteUnwinderObject *unwinder,
+ uintptr_t interpreter_addr,
+ char *interp_state_buffer,
+ char *tstate_buffer,
+ char *frame_buffer,
+ RemoteReadPrefetch *prefetch)
+{
+ prefetch->tstate = NULL;  // outputs start cleared; the address fields are inputs and are kept
+ prefetch->frame = NULL;
+ if (prefetch->tstate_addr != 0) {
+ size_t tstate_size = (size_t)unwinder->debug_offsets.thread_state.size;
+ _Py_RemoteReadSegment segments[3] = {
+ {interpreter_addr, interp_state_buffer, INTERP_STATE_BUFFER_SIZE},
+ {prefetch->tstate_addr, tstate_buffer, tstate_size},
+ {prefetch->frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},
+ };
+ int nsegs = prefetch->frame_addr != 0 ? 3 : 2;  // the frame segment is only submitted when an address was predicted
+ Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(
+ &unwinder->handle, segments, nsegs);
+ int completed = 0;  // number of leading segments covered by nread
+ if (nread >= (Py_ssize_t)INTERP_STATE_BUFFER_SIZE) {
+ completed = 1;
+ Py_ssize_t with_tstate = (Py_ssize_t)INTERP_STATE_BUFFER_SIZE
+ + (Py_ssize_t)tstate_size;
+ if (nread >= with_tstate) {
+ completed = 2;
+ }
+ if (nsegs == 3
+ && nread == with_tstate + (Py_ssize_t)SIZEOF_INTERP_FRAME) {
+ completed = 3;
+ }
+ }
+ STATS_BATCHED_READ(unwinder, nsegs, completed);
+ if (completed >= 1) {  // interpreter state is in place, so no fallback read is needed
+ if (completed >= 2) {
+ prefetch->tstate = tstate_buffer;
+ }
+ if (completed >= 3) {
+ prefetch->frame = frame_buffer;
+ }
+ return 0;
+ }
+ }
+ return _Py_RemoteDebug_ReadRemoteMemory(  // no prediction, or the batched read did not cover the interpreter state
+ &unwinder->handle,
+ interpreter_addr,
+ INTERP_STATE_BUFFER_SIZE,
+ interp_state_buffer);
+}
+
/*[clinic input]
@permit_long_docstring_body
@critical_section
while (current_interpreter != 0) {
// Read interpreter state to get the interpreter ID
char interp_state_buffer[INTERP_STATE_BUFFER_SIZE];
- if (_Py_RemoteDebug_PagedReadRemoteMemory(
- &self->handle,
+ char prefetched_tstate[SIZEOF_THREAD_STATE];
+ char prefetched_frame[SIZEOF_INTERP_FRAME];
+ RemoteReadPrefetch prefetch = {0};
+ if (self->cache_frames) {
+ prefetch.tstate_addr = get_cached_tstate_for_interpreter(
+ self, current_interpreter);
+ }
+ if (prefetch.tstate_addr != 0) {
+ FrameCacheEntry *entry = frame_cache_find_by_tstate(self, prefetch.tstate_addr);
+ if (entry && entry->num_addrs > 0) {
+ prefetch.frame_addr = entry->addrs[0];
+ }
+ }
+
+ if (read_interp_state_and_maybe_thread_frame(
+ self,
current_interpreter,
- INTERP_STATE_BUFFER_SIZE,
- interp_state_buffer) < 0) {
+ interp_state_buffer,
+ prefetched_tstate,
+ prefetched_frame,
+ &prefetch) < 0) {
set_exception_cause(self, PyExc_RuntimeError, "Failed to read interpreter state buffer");
Py_CLEAR(result);
goto exit;
}
+ refresh_generation_caches_from_interp_state(self, current_interpreter, interp_state_buffer);
uintptr_t gc_frame = 0;
if (self->gc) {
int64_t interpreter_id = GET_MEMBER(int64_t, interp_state_buffer,
self->debug_offsets.interpreter_state.id);
- // Get code object generation from buffer
- uint64_t code_object_generation = GET_MEMBER(uint64_t, interp_state_buffer,
- self->debug_offsets.interpreter_state.code_object_generation);
-
- if (code_object_generation != self->code_object_generation) {
- self->code_object_generation = code_object_generation;
- _Py_hashtable_clear(self->code_object_cache);
- }
-
-#ifdef Py_GIL_DISABLED
- // Check TLBC generation and invalidate cache if needed
- uint32_t current_tlbc_generation = GET_MEMBER(uint32_t, interp_state_buffer,
- self->debug_offsets.interpreter_state.tlbc_generation);
- if (current_tlbc_generation != self->tlbc_generation) {
- self->tlbc_generation = current_tlbc_generation;
- _Py_hashtable_clear(self->tlbc_cache);
- }
-#endif
-
// Create a list to hold threads for this interpreter
PyObject *interpreter_threads = PyList_New(0);
if (!interpreter_threads) {
// Target specific thread (only process first interpreter)
current_tstate = self->tstate_addr;
}
+ if (current_tstate != 0 && self->cache_frames) {
+ set_cached_tstate_for_interpreter(self, current_interpreter, current_tstate);
+ }
// Acquire main thread state information
uintptr_t main_thread_tstate = GET_MEMBER(uintptr_t, interp_state_buffer,
PyObject* frame_info = unwind_stack_for_thread(self, ¤t_tstate,
gil_holder_tstate,
gc_frame,
- main_thread_tstate);
+ main_thread_tstate,
+ &prefetch);
if (!frame_info) {
// Check if this was an intentional skip due to mode-based filtering
if ((self->mode == PROFILING_MODE_CPU || self->mode == PROFILING_MODE_GIL ||
if (ensure_async_debug_offsets(self) < 0) {
return NULL;
}
+ if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
+ return NULL;
+ }
PyObject *result = PyList_New(0);
if (result == NULL) {
if (ensure_async_debug_offsets(self) < 0) {
return NULL;
}
+ if (refresh_generation_caches_for_interpreter(self, self->interpreter_addr) < 0) {
+ return NULL;
+ }
PyObject *result = PyList_New(0);
if (result == NULL) {
- code_object_cache_hits: Code object cache hits
- code_object_cache_misses: Code object cache misses
- stale_cache_invalidations: Times stale cache entries were cleared
+ - batched_read_attempts: Batched remote-read attempts
+ - batched_read_successes: Attempts that read all requested segments
+ - batched_read_misses: Attempts that fell back or partially read
+ - batched_read_segments_requested: Segments requested by batched reads
+ - batched_read_segments_completed: Segments completed by batched reads
- frame_cache_hit_rate: Percentage of samples that hit the cache
- code_object_cache_hit_rate: Percentage of code object lookups that hit cache
+ - batched_read_success_rate: Percentage of batched reads that completed all segments
+ - batched_read_segment_completion_rate: Percentage of requested segments read by batched reads
Raises:
RuntimeError: If stats collection was not enabled (stats=False)
static PyObject *
_remote_debugging_RemoteUnwinder_get_stats_impl(RemoteUnwinderObject *self)
-/*[clinic end generated code: output=21e36477122be2a0 input=75fef4134c12a8c9]*/
+/*[clinic end generated code: output=21e36477122be2a0 input=0392d62b278e9c35]*/
{
if (!self->collect_stats) {
PyErr_SetString(PyExc_RuntimeError,
ADD_STAT(code_object_cache_hits);
ADD_STAT(code_object_cache_misses);
ADD_STAT(stale_cache_invalidations);
+ ADD_STAT(batched_read_attempts);
+ ADD_STAT(batched_read_successes);
+ ADD_STAT(batched_read_misses);
+ ADD_STAT(batched_read_segments_requested);
+ ADD_STAT(batched_read_segments_completed);
#undef ADD_STAT
+#define ADD_DERIVED_STAT(name, value) do { \
+ PyObject *val = PyFloat_FromDouble(value); \
+ if (!val || PyDict_SetItemString(result, name, val) < 0) { \
+ Py_XDECREF(val); \
+ Py_DECREF(result); \
+ return NULL; \
+ } \
+ Py_DECREF(val); \
+} while(0)
+
// Calculate and add derived statistics
// Hit rate is calculated as (hits + partial_hits) / total_cache_lookups
double frame_cache_hit_rate = 0.0;
frame_cache_hit_rate = 100.0 * (double)(self->stats.frame_cache_hits + self->stats.frame_cache_partial_hits)
/ (double)total_cache_lookups;
}
- PyObject *hit_rate = PyFloat_FromDouble(frame_cache_hit_rate);
- if (!hit_rate || PyDict_SetItemString(result, "frame_cache_hit_rate", hit_rate) < 0) {
- Py_XDECREF(hit_rate);
- Py_DECREF(result);
- return NULL;
- }
- Py_DECREF(hit_rate);
+ ADD_DERIVED_STAT("frame_cache_hit_rate", frame_cache_hit_rate);
double code_object_hit_rate = 0.0;
uint64_t total_code_lookups = self->stats.code_object_cache_hits + self->stats.code_object_cache_misses;
if (total_code_lookups > 0) {
code_object_hit_rate = 100.0 * (double)self->stats.code_object_cache_hits / (double)total_code_lookups;
}
- PyObject *code_hit_rate = PyFloat_FromDouble(code_object_hit_rate);
- if (!code_hit_rate || PyDict_SetItemString(result, "code_object_cache_hit_rate", code_hit_rate) < 0) {
- Py_XDECREF(code_hit_rate);
- Py_DECREF(result);
- return NULL;
+ ADD_DERIVED_STAT("code_object_cache_hit_rate", code_object_hit_rate);
+
+ double batched_read_success_rate = 0.0;
+ if (self->stats.batched_read_attempts > 0) {
+ batched_read_success_rate =
+ 100.0 * (double)self->stats.batched_read_successes
+ / (double)self->stats.batched_read_attempts;
}
- Py_DECREF(code_hit_rate);
+ ADD_DERIVED_STAT("batched_read_success_rate", batched_read_success_rate);
+
+ double batched_read_segment_completion_rate = 0.0;
+ if (self->stats.batched_read_segments_requested > 0) {
+ batched_read_segment_completion_rate =
+ 100.0 * (double)self->stats.batched_read_segments_completed
+ / (double)self->stats.batched_read_segments_requested;
+ }
+ ADD_DERIVED_STAT("batched_read_segment_completion_rate",
+ batched_read_segment_completion_rate);
+
+#undef ADD_DERIVED_STAT
return result;
}
unsigned int :24;
} _thread_status;
+static int  // Read a thread state and, when a frame address is predicted, that frame too in one batched call.
+read_thread_state_and_maybe_frame(  // Returns 0 if the batch filled tstate_buffer; otherwise the result of the plain single read (caller treats < 0 as failure).
+ RemoteUnwinderObject *unwinder,  // supplies the remote process handle and receives the batched-read stats
+ uintptr_t tstate_addr,  // remote address of the thread state
+ size_t tstate_size,  // number of bytes to read for the thread state
+ char *tstate_buffer,  // local destination for the thread state
+ uintptr_t predicted_frame_addr,  // remote address of a frame to read speculatively; 0 skips the batched path
+ char *frame_buffer,  // local destination for the frame; SIZEOF_INTERP_FRAME bytes are requested into it
+ int *frame_read)  // out: 1 only when frame_buffer was completely filled by the batch, else 0
+{
+ *frame_read = 0;  // default; stays 0 on every path that does not fully read the frame
+ if (predicted_frame_addr != 0) {
+ _Py_RemoteReadSegment segments[2] = {
+ {tstate_addr, tstate_buffer, tstate_size},  // segment 0: the thread state the caller needs
+ {predicted_frame_addr, frame_buffer, SIZEOF_INTERP_FRAME},  // segment 1: the speculative frame
+ };
+ Py_ssize_t nread = _Py_RemoteDebug_BatchedReadRemoteMemory(  // total bytes copied, or -1 if the batch could not run
+ &unwinder->handle, segments, 2);
+ int completed = 0;  // how many leading segments were fully populated
+ if (nread >= (Py_ssize_t)tstate_size) {  // byte count covers all of segment 0
+ completed = 1;
+ if (nread == (Py_ssize_t)(tstate_size + SIZEOF_INTERP_FRAME)) {  // byte count covers both segments exactly
+ completed = 2;
+ }
+ }
+ STATS_BATCHED_READ(unwinder, 2, completed);  // NOTE(review): presumably records 2 segments requested / `completed` filled - macro is defined elsewhere
+ if (completed >= 1) {  // thread state is usable even when the frame was not fully read
+ *frame_read = completed == 2;
+ return 0;
+ }
+ }
+ return _Py_RemoteDebug_ReadRemoteMemory(  // no prediction, or the batch did not fill the thread state: read it on its own
+ &unwinder->handle, tstate_addr, tstate_size, tstate_buffer);
+}
+
PyObject*
unwind_stack_for_thread(
RemoteUnwinderObject *unwinder,
uintptr_t *current_tstate,
uintptr_t gil_holder_tstate,
uintptr_t gc_frame,
- uintptr_t main_thread_tstate
+ uintptr_t main_thread_tstate,
+ const RemoteReadPrefetch *prefetch
) {
PyObject *frame_info = NULL;
PyObject *thread_id = NULL;
PyObject *result = NULL;
StackChunkList chunks = {0};
- char ts[SIZEOF_THREAD_STATE];
- int bytes_read = _Py_RemoteDebug_PagedReadRemoteMemory(
- &unwinder->handle, *current_tstate, (size_t)unwinder->debug_offsets.thread_state.size, ts);
- if (bytes_read < 0) {
- set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
- goto error;
+ char local_ts[SIZEOF_THREAD_STATE];
+ char local_prefetched_frame[SIZEOF_INTERP_FRAME];
+ const char *ts;
+ RemoteReadPrefetch ctx_prefetch = {0};
+ if (prefetch->tstate && prefetch->tstate_addr == *current_tstate) {
+ ts = prefetch->tstate;
+ if (prefetch->frame) {
+ ctx_prefetch.frame = prefetch->frame;
+ ctx_prefetch.frame_addr = prefetch->frame_addr;
+ }
+ }
+ else if (unwinder->cache_frames) {
+ uintptr_t predicted_frame_addr = 0;
+ int have_prefetched_frame = 0;
+ FrameCacheEntry *entry = frame_cache_find_by_tstate(unwinder, *current_tstate);
+ if (entry && entry->num_addrs > 0) {
+ predicted_frame_addr = entry->addrs[0];
+ }
+
+ int rc = read_thread_state_and_maybe_frame(
+ unwinder,
+ *current_tstate,
+ (size_t)unwinder->debug_offsets.thread_state.size,
+ local_ts,
+ predicted_frame_addr,
+ local_prefetched_frame,
+ &have_prefetched_frame);
+ if (rc < 0) {
+ set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
+ goto error;
+ }
+ ts = local_ts;
+ if (have_prefetched_frame) {
+ ctx_prefetch.frame = local_prefetched_frame;
+ ctx_prefetch.frame_addr = predicted_frame_addr;
+ }
+ }
+ else {
+ int rc = _Py_RemoteDebug_ReadRemoteMemory(
+ &unwinder->handle,
+ *current_tstate,
+ (size_t)unwinder->debug_offsets.thread_state.size,
+ local_ts);
+ if (rc < 0) {
+ set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to read thread state");
+ goto error;
+ }
+ ts = local_ts;
}
STATS_INC(unwinder, memory_reads);
STATS_ADD(unwinder, memory_bytes_read, unwinder->debug_offsets.thread_state.size);
+ if (ctx_prefetch.frame) {
+ STATS_INC(unwinder, memory_reads);
+ STATS_ADD(unwinder, memory_bytes_read, SIZEOF_INTERP_FRAME);
+ }
long tid = GET_MEMBER(long, ts, unwinder->debug_offsets.thread_state.native_thread_id);
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
FrameWalkContext ctx = {
.frame_addr = frame_addr,
+ .thread_state_addr = *current_tstate,
.base_frame_addr = base_frame_addr,
.gc_frame = gc_frame,
.chunks = &chunks,
+ .prefetch = ctx_prefetch,
.frame_info = frame_info,
.frame_addrs = addrs,
.num_addrs = 0,
*current_tstate = GET_MEMBER(uintptr_t, ts, unwinder->debug_offsets.thread_state.next);
- thread_id = PyLong_FromLongLong(tid);
+ if (unwinder->cache_frames) {
+ FrameCacheEntry *entry = frame_cache_find(unwinder, (uint64_t)tid);
+ if (entry && entry->thread_id_obj) {
+ thread_id = Py_NewRef(entry->thread_id_obj);
+ }
+ }
if (thread_id == NULL) {
- set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
- goto error;
+ thread_id = PyLong_FromLongLong(tid);
+ if (thread_id == NULL) {
+ set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to create thread ID");
+ goto error;
+ }
}
RemoteDebuggingState *state = RemoteDebugging_GetStateFromObject((PyObject*)unwinder);
int memfd;
#endif
page_cache_entry_t pages[MAX_PAGES];
+ int page_cache_count;
Py_ssize_t page_size;
} proc_handle_t;
handle->pages[i].data = NULL;
handle->pages[i].valid = 0;
}
+ handle->page_cache_count = 0;
}
UNUSED static void
_Py_RemoteDebug_ClearCache(proc_handle_t *handle)
{
- for (int i = 0; i < MAX_PAGES; i++) {
+ for (int i = 0; i < handle->page_cache_count; i++) {  // only the first page_cache_count slots are filled by the paged read, so only they need invalidating
handle->pages[i].valid = 0;
}
+ handle->page_cache_count = 0;  // the next cached page is stored in slot 0 again; page buffers stay allocated for reuse
}
#if defined(__APPLE__) && defined(TARGET_OS_OSX) && TARGET_OS_OSX
handle->memfd = -1;
#endif
handle->page_size = get_page_size();
+ handle->page_cache_count = 0;
for (int i = 0; i < MAX_PAGES; i++) {
handle->pages[i].data = NULL;
handle->pages[i].valid = 0;
return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
}
- // Search for valid cached page
- for (int i = 0; i < MAX_PAGES; i++) {
+ // Search only the pages used since the last clear. The cache is cleared
+ // between profiler samples, so entries are packed at the front.
+ for (int i = 0; i < handle->page_cache_count; i++) {
page_cache_entry_t *entry = &handle->pages[i];
if (entry->valid && entry->page_addr == page_base) {
memcpy(out, entry->data + offset_in_page, size);
}
}
- // Find reusable slot
- for (int i = 0; i < MAX_PAGES; i++) {
- page_cache_entry_t *entry = &handle->pages[i];
- if (!entry->valid) {
+ if (handle->page_cache_count < MAX_PAGES) {
+ page_cache_entry_t *entry = &handle->pages[handle->page_cache_count];
+ if (entry->data == NULL) {
+ entry->data = PyMem_RawMalloc(page_size);
if (entry->data == NULL) {
- entry->data = PyMem_RawMalloc(page_size);
- if (entry->data == NULL) {
- PyErr_NoMemory();
- _set_debug_exception_cause(PyExc_MemoryError,
- "Cannot allocate %zu bytes for page cache entry "
- "during read from PID %d at address 0x%lx",
- page_size, handle->pid, addr);
- return -1;
- }
- }
-
- if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
- // Try to just copy the exact amount as a fallback
- PyErr_Clear();
- goto fallback;
+ PyErr_NoMemory();
+ _set_debug_exception_cause(PyExc_MemoryError,
+ "Cannot allocate %zu bytes for page cache entry "
+ "during read from PID %d at address 0x%lx",
+ page_size, handle->pid, addr);
+ return -1;
}
+ }
- entry->page_addr = page_base;
- entry->valid = 1;
- memcpy(out, entry->data + offset_in_page, size);
- return 0;
+ if (_Py_RemoteDebug_ReadRemoteMemory(handle, page_base, page_size, entry->data) < 0) {
+ // Try to just copy the exact amount as a fallback
+ PyErr_Clear();
+ goto fallback;
}
+
+ entry->page_addr = page_base;
+ entry->valid = 1;
+ handle->page_cache_count++;
+ memcpy(out, entry->data + offset_in_page, size);
+ return 0;
}
fallback:
return _Py_RemoteDebug_ReadRemoteMemory(handle, addr, size, out);
}
+typedef struct {  // One remote-to-local copy request passed to the batched remote read.
+ uintptr_t remote_addr;  // address in the target process to read from
+ void *local_buf;  // local destination; must have room for `size` bytes
+ size_t size;  // byte count, used as the length of both the remote and the local iovec
+} _Py_RemoteReadSegment;
+
+#define _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS 4  // capacity of the on-stack iovec arrays in the batched read; larger batches are rejected
+
+// Batched read of multiple remote regions in a single process_vm_readv() call when supported.
+// Returns total bytes read (>= 0) on success, or -1 without setting a Python exception when batching is
+// unavailable (build without Linux process_vm_readv, handle->memfd != -1, nsegs outside 1..MAX) or the syscall
+// failed. Callers compare the result against cumulative segment sizes to see which segments were fully populated.
+UNUSED static Py_ssize_t
+_Py_RemoteDebug_BatchedReadRemoteMemory(
+ proc_handle_t *handle,  // target process; only pid and memfd are consulted here
+ const _Py_RemoteReadSegment *segments,  // array of nsegs read requests, processed in order
+ int nsegs)  // number of entries in segments
+{
+#if defined(__linux__) && HAVE_PROCESS_VM_READV
+ if (handle->memfd == -1  // NOTE(review): presumably memfd != -1 means reads already go through a /proc/<pid>/mem fallback - confirm
+ && nsegs > 0
+ && nsegs <= _PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS) {  // must fit the fixed-size arrays below
+ struct iovec local[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
+ struct iovec remote[_PY_REMOTE_DEBUG_MAX_BATCHED_SEGMENTS];
+ for (int i = 0; i < nsegs; i++) {  // mirror each segment into a local/remote iovec pair of equal length
+ local[i].iov_base = segments[i].local_buf;
+ local[i].iov_len = segments[i].size;
+ remote[i].iov_base = (void *)segments[i].remote_addr;
+ remote[i].iov_len = segments[i].size;
+ }
+ ssize_t nread = process_vm_readv(handle->pid, local, nsegs, remote, nsegs, 0);  // one syscall for all segments; may return a short count
+ if (nread >= 0) {
+ return (Py_ssize_t)nread;
+ }
+ }
+#else
+ (void)handle;  // parameters are unused on builds without batched reads
+ (void)segments;
+ (void)nsegs;
+#endif
+ return -1;  // unsupported, rejected, or failed: the caller is expected to fall back to single reads
+}
+
UNUSED static int
_Py_RemoteDebug_ReadDebugOffsets(
proc_handle_t *handle,
time.sleep(0.05)
'''
+ASYNC_CODE = '''\
+import asyncio
+import contextlib
+import math
+
+def compute_slice(seed):
+ result = 0.0
+ for i in range(2000):
+ result += math.sin(seed + i) * math.sqrt(i + 1)
+ return result
+
+async def leaf_task(seed):
+ total = 0.0
+ while True:
+ total += compute_slice(seed)
+ await asyncio.sleep(0)
+
+async def parent_task(seed):
+ child = asyncio.create_task(leaf_task(seed + 1000), name=f"leaf-{seed}")
+ try:
+ while True:
+ compute_slice(seed)
+ await asyncio.sleep(0.001)
+ finally:
+ child.cancel()
+ with contextlib.suppress(asyncio.CancelledError):
+ await child
+
+async def main():
+ tasks = [
+ asyncio.create_task(parent_task(i), name=f"parent-{i}")
+ for i in range(8)
+ ]
+ await asyncio.gather(*tasks)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+'''  # Source text of the "asyncio" code example: 8 parent tasks, each spawning one leaf task that computes and yields via sleep(0) forever.
+
CODE_EXAMPLES = {
"basic": {
"code": CODE,
"code": CODE_WITH_TONS_OF_THREADS,
"description": "Tons of threads doing mixed CPU/IO work",
},
+ "asyncio": {
+ "code": ASYNC_CODE,
+ "description": "Asyncio tasks with active and awaited coroutine chains",
+ },
+}
+
+OPERATIONS = {  # maps each --operation choice to the RemoteUnwinder method to benchmark and its display label
+ "stack_trace": {
+ "method": "get_stack_trace",  # attribute name resolved on the unwinder with getattr() in benchmark()
+ "label": "get_stack_trace()",  # human-readable name printed in the banner and the results header
+ },
+ "async_stack_trace": {
+ "method": "get_async_stack_trace",
+ "label": "get_async_stack_trace()",
+ },
+ "all_awaited_by": {
+ "method": "get_all_awaited_by",
+ "label": "get_all_awaited_by()",
+ },
}
-def benchmark(unwinder, duration_seconds=10, blocking=False):
+def benchmark(unwinder, duration_seconds=10, blocking=False, operation="stack_trace"):
"""Benchmark mode - measure raw sampling speed for specified duration"""
sample_count = 0
fail_count = 0
start_time = time.perf_counter()
end_time = start_time + duration_seconds
total_attempts = 0
+ operation_info = OPERATIONS[operation]
+ operation_method = getattr(unwinder, operation_info["method"])
colors = get_colors(can_colorize())
print(
- f"{colors.BOLD_BLUE}Benchmarking sampling speed for {duration_seconds} seconds...{colors.RESET}"
+ f"{colors.BOLD_BLUE}Benchmarking {operation_info['label']} speed "
+ f"for {duration_seconds} seconds...{colors.RESET}"
)
try:
if blocking:
unwinder.pause_threads()
try:
- stack_trace = unwinder.get_stack_trace()
- if stack_trace:
+ sample = operation_method()
+ if sample:
sample_count += 1
finally:
if blocking:
(sample_count / total_attempts) * 100 if total_attempts > 0 else 0
),
"total_work_time": total_work_time,
+ "operation": operation_info["label"],
"avg_work_time_us": (
(total_work_time / total_attempts) * 1e6 if total_attempts > 0 else 0
),
colors = get_colors(can_colorize())
print(f"\n{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
- print(f"{colors.BOLD_GREEN}get_stack_trace() Benchmark Results{colors.RESET}")
+ print(f"{colors.BOLD_GREEN}{results['operation']} Benchmark Results{colors.RESET}")
print(f"{colors.BOLD_GREEN}{'='*60}{colors.RESET}")
# Basic statistics
%(prog)s -d 60 # Run basic benchmark for 60 seconds
%(prog)s --code deep_static # Run deep static call stack benchmark
%(prog)s --code deep_static -d 30 # Run deep static benchmark for 30 seconds
+ %(prog)s --operation async_stack_trace
+ %(prog)s --operation all_awaited_by
Available code examples:
{examples_desc}
"--code",
"-c",
choices=list(CODE_EXAMPLES.keys()),
- default="basic",
- help="Code example to benchmark (default: basic)",
+ default=None,
+ help="Code example to benchmark (default: basic, or asyncio for async operations)",
+ )
+
+ parser.add_argument(
+ "--operation",
+ choices=list(OPERATIONS.keys()),
+ default="stack_trace",
+ help="Remote unwinder operation to benchmark (default: stack_trace)",
)
parser.add_argument(
help="Stop all threads before sampling for consistent snapshots",
)
- return parser.parse_args()
+ args = parser.parse_args()
+ if args.code is None:
+ args.code = "asyncio" if args.operation != "stack_trace" else "basic"
+ return args
def create_target_process(temp_file, code_example="basic"):
print(
f"{colors.CYAN}Benchmark Duration:{colors.RESET} {colors.YELLOW}{args.duration}{colors.RESET} seconds"
)
+ print(
+ f"{colors.CYAN}Operation:{colors.RESET} {colors.GREEN}{OPERATIONS[args.operation]['label']}{colors.RESET}"
+ )
print(
f"{colors.CYAN}Blocking Mode:{colors.RESET} {colors.GREEN if args.blocking else colors.YELLOW}{'enabled' if args.blocking else 'disabled'}{colors.RESET}"
)
unwinder = _remote_debugging.RemoteUnwinder(
process.pid, cache_frames=True, **kwargs
)
- results = benchmark(unwinder, duration_seconds=args.duration, blocking=args.blocking)
+ results = benchmark(
+ unwinder,
+ duration_seconds=args.duration,
+ blocking=args.blocking,
+ operation=args.operation,
+ )
finally:
cleanup_process(process, temp_file_path)