struct _PyInterpreterFrame *base_frame;
struct _PyInterpreterFrame *last_profiled_frame;
+ uintptr_t last_profiled_frame_seq;
Py_tracefunc c_profilefunc;
Py_tracefunc c_tracefunc;
uint64_t current_frame;
uint64_t base_frame;
uint64_t last_profiled_frame;
+ uint64_t last_profiled_frame_seq;
uint64_t thread_id;
uint64_t native_thread_id;
uint64_t datastack_chunk;
.current_frame = offsetof(PyThreadState, current_frame), \
.base_frame = offsetof(PyThreadState, base_frame), \
.last_profiled_frame = offsetof(PyThreadState, last_profiled_frame), \
+ .last_profiled_frame_seq = offsetof(PyThreadState, last_profiled_frame_seq), \
.thread_id = offsetof(PyThreadState, thread_id), \
.native_thread_id = offsetof(PyThreadState, native_thread_id), \
.datastack_chunk = offsetof(PyThreadState, datastack_chunk), \
// This avoids corrupting the cache when transient frames (called and returned
// between profiler samples) update last_profiled_frame to addresses the
// profiler never saw.
+// The sequence distinguishes this anchor from a later frame that reuses the
+// same _PyInterpreterFrame address.
#define _PyThreadState_UpdateLastProfiledFrame(tstate, frame, previous) \
do { \
PyThreadState *tstate_ = (tstate); \
_PyInterpreterFrame *frame_ = (frame); \
if (tstate_->last_profiled_frame == frame_) { \
tstate_->last_profiled_frame = (previous); \
+ tstate_->last_profiled_frame_seq++; \
} \
} while (0)
### Remote Profiling Frame Cache
-The `last_profiled_frame` field in `PyThreadState` supports an optimization for
-remote profilers that sample call stacks from external processes. When a remote
-profiler reads the call stack, it writes the current frame address to this field.
-The eval loop then keeps this pointer valid by updating it to the parent frame
-whenever a frame returns (in `_PyEval_FrameClearAndPop`).
+The `last_profiled_frame` and `last_profiled_frame_seq` fields in
+`PyThreadState` support an optimization for remote profilers that sample call
+stacks from external processes. When a remote profiler reads the call stack, it
+writes the current frame address to `last_profiled_frame`. The eval loop then
+keeps this pointer valid by updating it to the parent frame whenever a frame
+returns (in `_PyEval_FrameClearAndPop`) and increments the sequence.
This creates a "high-water mark" that always points to a frame still on the stack.
On subsequent samples, the profiler can walk from `current_frame` until it reaches
-`last_profiled_frame`, knowing that frames from that point downward are unchanged
-and can be retrieved from a cache. This significantly reduces the amount of remote
-memory reads needed when call stacks are deep and stable at their base.
-
-The update in `_PyEval_FrameClearAndPop` is guarded: it only writes when
-`last_profiled_frame` is non-NULL AND matches the frame being popped. This
-prevents transient frames (called and returned between profiler samples) from
-corrupting the cache pointer, while avoiding any overhead when profiling is inactive.
+`last_profiled_frame`, then validate the pointer and sequence before using cached
+callers. This prevents a later frame that reuses the same `_PyInterpreterFrame`
+address from being mistaken for the sampled frame. The cache significantly
+reduces the amount of remote memory reads needed when call stacks are deep and
+stable at their base.
+
+The update in `_PyEval_FrameClearAndPop` is guarded: it only advances the
+pointer and sequence when `last_profiled_frame` is non-NULL AND matches the
+frame being popped. This prevents transient frames (called and returned between
+profiler samples) from corrupting the cache anchor, while avoiding any overhead
+when profiling is inactive.
### The Instruction Pointer
--- /dev/null
+Fix another way the Tachyon profiler frame cache could produce impossible
+mixed stack traces when ``_PyInterpreterFrame`` addresses are reused, by
+validating cached frame anchors with a sequence counter.
#define FRAME_CACHE_MAX_THREADS 32
#define FRAME_CACHE_MAX_FRAMES 1024
+typedef struct {
+ uintptr_t frame;
+ uintptr_t seq;
+} FrameCacheAnchor;
+
typedef struct {
uint64_t thread_id; // 0 = empty slot
uintptr_t thread_state_addr;
+ uintptr_t last_profiled_frame_seq; // sequence paired with addrs[0]
uintptr_t addrs[FRAME_CACHE_MAX_FRAMES];
Py_ssize_t num_addrs;
PyObject *thread_id_obj; // owned reference, NULL if empty
uintptr_t thread_state_addr; // Owning thread state address
uintptr_t base_frame_addr; // Sentinel at bottom (for validation)
uintptr_t gc_frame; // GC frame address (0 if not tracking)
- uintptr_t last_profiled_frame; // Last cached frame (0 if no cache)
+ FrameCacheAnchor last_profiled; // Last cached frame anchor
StackChunkList *chunks; // Pre-copied stack chunks
int skip_first_frame; // Skip frame_addr itself (continue from its caller)
RemoteReadPrefetch prefetch; // Optional already-read thread/frame buffers
extern FrameCacheEntry *frame_cache_find(RemoteUnwinderObject *unwinder, uint64_t thread_id);
extern FrameCacheEntry *frame_cache_find_by_tstate(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr);
extern int clear_last_profiled_frames(RemoteUnwinderObject *unwinder);
+extern int set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr, uintptr_t frame_addr);
extern void frame_cache_invalidate_stale(RemoteUnwinderObject *unwinder, PyObject *result);
extern int frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
- uintptr_t last_profiled_frame,
+ uintptr_t thread_state_addr,
+ FrameCacheAnchor anchor,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs);
+extern int frame_cache_anchor_matches(
+ RemoteUnwinderObject *unwinder,
+ uintptr_t thread_state_addr,
+ FrameCacheAnchor anchor);
// Returns: 1 = stored, 0 = not stored (graceful), -1 = error
// Only stores complete stacks that reach base_frame_addr
extern int frame_cache_store(
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
+ uintptr_t last_profiled_frame_seq,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited);
#define FIELD_SIZE(type, member) sizeof(((type *)0)->member)
enum {
- PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 880,
+ PY_REMOTE_DEBUG_OFFSETS_TOTAL_SIZE = 888,
PY_REMOTE_ASYNC_DEBUG_OFFSETS_TOTAL_SIZE = 104,
};
APPLY(thread_state, next, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, current_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
APPLY(thread_state, base_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
- APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
+ APPLY(thread_state, last_profiled_frame, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size); \
+ APPLY(thread_state, last_profiled_frame_seq, sizeof(uintptr_t), _Alignof(uintptr_t), buffer_size)
#define PY_REMOTE_DEBUG_INTERPRETER_STATE_FIELDS(APPLY, buffer_size) \
APPLY(interpreter_state, id, sizeof(int64_t), _Alignof(int64_t), buffer_size); \
Py_CLEAR(unwinder->frame_cache[i].frame_list);
unwinder->frame_cache[i].thread_id = 0;
unwinder->frame_cache[i].thread_state_addr = 0;
+ unwinder->frame_cache[i].last_profiled_frame_seq = 0;
unwinder->frame_cache[i].num_addrs = 0;
STATS_INC(unwinder, stale_cache_invalidations);
}
}
}
+static int
+read_last_profiled_anchor(RemoteUnwinderObject *unwinder,
+ uintptr_t thread_state_addr,
+ FrameCacheAnchor *anchor)
+{
+ uintptr_t frame_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame;
+ uintptr_t seq_offset = (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame_seq;
+
+ // These fields are adjacent in PyThreadState. Read them together when the
+ // layout allows it so validation uses a pointer and sequence from the same
+ // remote-memory read.
+ if (seq_offset == frame_offset + sizeof(uintptr_t)) {
+ uintptr_t live_anchor[2];
+ if (_Py_RemoteDebug_ReadRemoteMemory(&unwinder->handle,
+ thread_state_addr + frame_offset,
+ sizeof(live_anchor),
+ live_anchor) < 0) {
+ return -1;
+ }
+ anchor->frame = live_anchor[0];
+ anchor->seq = live_anchor[1];
+ return 0;
+ }
+
+ if (read_ptr(unwinder, thread_state_addr + frame_offset, &anchor->frame) < 0) {
+ return -1;
+ }
+ return read_ptr(unwinder, thread_state_addr + seq_offset, &anchor->seq);
+}
+
+static Py_ssize_t
+find_cached_frame_addr(const FrameCacheEntry *entry, uintptr_t frame_addr,
+ uintptr_t *real_pops)
+{
+ *real_pops = 0;
+ for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
+ if (entry->addrs[i] == frame_addr) {
+ return i;
+ }
+ if (entry->addrs[i] != 0) {
+ (*real_pops)++;
+ }
+ }
+ return -1;
+}
+
+int
+frame_cache_anchor_matches(
+ RemoteUnwinderObject *unwinder,
+ uintptr_t thread_state_addr,
+ FrameCacheAnchor anchor)
+{
+ FrameCacheAnchor live_anchor = {0, 0};
+ if (read_last_profiled_anchor(unwinder, thread_state_addr, &live_anchor) < 0) {
+ PyErr_Clear();
+ return 0;
+ }
+ return live_anchor.frame == anchor.frame && live_anchor.seq == anchor.seq;
+}
+
// Find last_profiled_frame in cache and extend frame_info with cached continuation
// If frame_addrs is provided (not NULL), also extends it with cached addresses
int
frame_cache_lookup_and_extend(
RemoteUnwinderObject *unwinder,
uint64_t thread_id,
- uintptr_t last_profiled_frame,
+ uintptr_t thread_state_addr,
+ FrameCacheAnchor anchor,
PyObject *frame_info,
uintptr_t *frame_addrs,
Py_ssize_t *num_addrs,
Py_ssize_t max_addrs)
{
- if (!unwinder->frame_cache || last_profiled_frame == 0) {
+ if (!unwinder->frame_cache || anchor.frame == 0) {
return 0;
}
if (!entry || !entry->frame_list) {
return 0;
}
+ if (entry->thread_state_addr != thread_state_addr) {
+ return 0;
+ }
assert(entry->num_addrs >= 0 && entry->num_addrs <= FRAME_CACHE_MAX_FRAMES);
- // Find the index where last_profiled_frame matches
- Py_ssize_t start_idx = -1;
- for (Py_ssize_t i = 0; i < entry->num_addrs; i++) {
- if (entry->addrs[i] == last_profiled_frame) {
- start_idx = i;
- break;
- }
- }
-
+ uintptr_t real_pops = 0;
+ Py_ssize_t start_idx = find_cached_frame_addr(entry, anchor.frame, &real_pops);
if (start_idx < 0) {
return 0; // Not found
}
assert(start_idx < entry->num_addrs);
+ // Synthetic marker frames (<native>/<GC>) are stored as addr-0 entries but
+ // never increment last_profiled_frame_seq in the target (only real frame
+ // pops do). Count the real frames before start_idx so the sequence check is
+ // not thrown off by markers sitting between the leaf and the anchor.
+ if (entry->last_profiled_frame_seq + real_pops != anchor.seq) {
+ return 0;
+ }
+
Py_ssize_t num_frames = PyList_GET_SIZE(entry->frame_list);
+ if (start_idx >= num_frames) {
+ return 0;
+ }
// Extend frame_info with frames ABOVE start_idx (not including it).
// The frame at start_idx (last_profiled_frame) was the executing frame
if (cache_start >= num_frames) {
return 0; // Nothing above last_profiled_frame to extend with
}
+ if (!frame_cache_anchor_matches(unwinder, thread_state_addr, anchor)) {
+ return 0;
+ }
PyObject *slice = PyList_GetSlice(entry->frame_list, cache_start, num_frames);
if (!slice) {
const uintptr_t *addrs,
Py_ssize_t num_addrs,
uintptr_t thread_state_addr,
+ uintptr_t last_profiled_frame_seq,
uintptr_t base_frame_addr,
uintptr_t last_frame_visited)
{
}
entry->thread_id = thread_id;
entry->thread_state_addr = thread_state_addr;
+ entry->last_profiled_frame_seq = last_profiled_frame_seq;
if (entry->thread_id_obj == NULL) {
entry->thread_id_obj = PyLong_FromUnsignedLongLong(thread_id);
if (entry->thread_id_obj == NULL) {
Py_DECREF(frame);
}
- if (ctx->last_profiled_frame != 0 && frame_addr == ctx->last_profiled_frame) {
+ if (ctx->last_profiled.frame != 0 && frame_addr == ctx->last_profiled.frame) {
ctx->stopped_at_cached_frame = 1;
break;
}
return 0;
}
-// Clear last_profiled_frame for all threads in the target process.
-// This must be called at the start of profiling to avoid stale values
-// from previous profilers causing us to stop frame walking early.
+int
+set_last_profiled_frame(RemoteUnwinderObject *unwinder, uintptr_t tstate_addr,
+ uintptr_t frame_addr)
+{
+ uintptr_t lpf_addr = tstate_addr +
+ (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame;
+ return _Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
+ sizeof(uintptr_t), &frame_addr);
+}
+
+// Clear the profiler anchor frame for all threads in the target process. The
+// sequence is intentionally preserved: a zero frame disables cache lookup, and
+// the next profiler-owned anchor should use the target's current generation.
int
clear_last_profiled_frames(RemoteUnwinderObject *unwinder)
{
uintptr_t current_interp = unwinder->interpreter_addr;
- uintptr_t zero = 0;
const size_t MAX_INTERPRETERS = 256;
size_t interp_count = 0;
size_t thread_count = 0;
while (tstate_addr != 0 && thread_count < MAX_THREADS_PER_INTERP) {
thread_count++;
- // Clear last_profiled_frame
- uintptr_t lpf_addr = tstate_addr + unwinder->debug_offsets.thread_state.last_profiled_frame;
- if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
- sizeof(uintptr_t), &zero) < 0) {
- // Non-fatal: just continue
+ uintptr_t no_frame = 0;
+ if (set_last_profiled_frame(unwinder, tstate_addr, no_frame) < 0) {
PyErr_Clear();
}
const FrameWalkContext *ctx,
uint64_t thread_id)
{
- if (!unwinder->frame_cache || ctx->last_profiled_frame == 0) {
+ if (!unwinder->frame_cache || ctx->last_profiled.frame == 0) {
return 0;
}
- if (ctx->frame_addr != ctx->last_profiled_frame) {
+ if (ctx->frame_addr != ctx->last_profiled.frame) {
return 0;
}
if (!entry || !entry->frame_list) {
return 0;
}
+ if (entry->thread_state_addr != ctx->thread_state_addr) {
+ return 0;
+ }
if (entry->num_addrs == 0 || entry->addrs[0] != ctx->frame_addr) {
return 0;
}
+ if (entry->last_profiled_frame_seq != ctx->last_profiled.seq) {
+ return 0;
+ }
PyObject *current_frame = NULL;
uintptr_t code_object_addr = 0;
if (parse_result < 0) {
return -1;
}
+ if (!frame_cache_anchor_matches(unwinder, ctx->thread_state_addr,
+ ctx->last_profiled)) {
+ Py_XDECREF(current_frame);
+ return 0;
+ }
if (current_frame != NULL) {
if (PyList_Append(ctx->frame_info, current_frame) < 0) {
assert(ctx->chunks != NULL);
+ // Cache misses copy stack chunks before walking. Frames found there are
+ // parsed from a stable snapshot, which keeps moving stacks from seeding the
+ // cache with an impossible parent chain.
if (ctx->chunks->count == 0) {
if (copy_stack_chunks(unwinder, ctx->thread_state_addr, ctx->chunks) < 0) {
- PyErr_Clear();
+ return -1;
}
}
if (ctx->stopped_at_cached_frame) {
Py_ssize_t frames_before_cache = PyList_GET_SIZE(ctx->frame_info);
- int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id, ctx->last_profiled_frame,
+ int cache_result = frame_cache_lookup_and_extend(unwinder, thread_id,
+ ctx->thread_state_addr,
+ ctx->last_profiled,
ctx->frame_info, ctx->frame_addrs, &ctx->num_addrs,
ctx->max_addrs);
if (cache_result < 0) {
// Continue walking from last_profiled_frame, skipping it (already processed)
Py_ssize_t frames_before_walk = PyList_GET_SIZE(ctx->frame_info);
FrameWalkContext continue_ctx = {
- .frame_addr = ctx->last_profiled_frame,
+ .frame_addr = ctx->last_profiled.frame,
.base_frame_addr = ctx->base_frame_addr,
.gc_frame = ctx->gc_frame,
.chunks = ctx->chunks,
STATS_ADD(unwinder, frames_read_from_cache, PyList_GET_SIZE(ctx->frame_info) - frames_before_cache);
}
} else {
- if (ctx->last_profiled_frame == 0) {
+ if (ctx->last_profiled.frame == 0) {
STATS_INC(unwinder, frame_cache_misses);
}
}
- if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs, ctx->num_addrs,
- ctx->thread_state_addr, ctx->base_frame_addr,
+ if (frame_cache_store(unwinder, thread_id, ctx->frame_info, ctx->frame_addrs,
+ ctx->num_addrs, ctx->thread_state_addr,
+ ctx->last_profiled.seq, ctx->base_frame_addr,
ctx->last_frame_visited) < 0) {
return -1;
}
return -1;
}
- // Clear stale last_profiled_frame values from previous profilers
- // This prevents us from stopping frame walking early due to stale values
+ // Clear stale profiler anchors from previous profilers. This prevents us
+ // from stopping frame walking early due to stale frame pointers.
if (cache_frames) {
clear_last_profiled_frames(self);
}
goto error;
}
- // In cache mode, copying stack chunks is more expensive than direct memory reads
+ // Cache mode skips this for full hits, but cache misses copy chunks before
+ // walking so newly stored cache entries come from a stable stack snapshot.
if (!unwinder->cache_frames) {
if (copy_stack_chunks(unwinder, *current_tstate, &chunks) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to copy stack chunks");
if (unwinder->cache_frames) {
// Use cache to avoid re-reading unchanged parent frames
- ctx.last_profiled_frame = GET_MEMBER(uintptr_t, ts,
+ ctx.last_profiled.frame = GET_MEMBER(uintptr_t, ts,
unwinder->debug_offsets.thread_state.last_profiled_frame);
+ ctx.last_profiled.seq = GET_MEMBER(uintptr_t, ts,
+ unwinder->debug_offsets.thread_state.last_profiled_frame_seq);
if (collect_frames_with_cache(unwinder, &ctx, tid) < 0) {
set_exception_cause(unwinder, PyExc_RuntimeError, "Failed to collect frames");
goto error;
}
// Update last_profiled_frame for next sample if it changed
- if (frame_addr != ctx.last_profiled_frame) {
- uintptr_t lpf_addr =
- *current_tstate + (uintptr_t)unwinder->debug_offsets.thread_state.last_profiled_frame;
- if (_Py_RemoteDebug_WriteRemoteMemory(&unwinder->handle, lpf_addr,
- sizeof(uintptr_t), &frame_addr) < 0) {
+ if (frame_addr != ctx.last_profiled.frame) {
+ if (set_last_profiled_frame(unwinder, *current_tstate, frame_addr) < 0) {
PyErr_Clear(); // Non-fatal
}
}
tstate->current_frame = &_tstate->base_frame;
// base_frame pointer for profilers to validate stack unwinding
tstate->base_frame = &_tstate->base_frame;
+ tstate->last_profiled_frame = NULL;
+ tstate->last_profiled_frame_seq = 0;
tstate->datastack_chunk = NULL;
tstate->datastack_top = NULL;
tstate->datastack_limit = NULL;