From: Sasha Levin Date: Fri, 29 Dec 2023 14:17:34 +0000 (-0500) Subject: Fixes for 6.6 X-Git-Tag: v6.1.70~18 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1f50b176c804811df2208c30f165d5777bb94083;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.6 Signed-off-by: Sasha Levin --- diff --git a/queue-6.6/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch b/queue-6.6/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch new file mode 100644 index 00000000000..7543f65afde --- /dev/null +++ b/queue-6.6/ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch @@ -0,0 +1,74 @@ +From 9c8977245a32c9a32567b935df66321b6b043bbf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Dec 2023 14:30:49 -0500 +Subject: ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg() + +From: Mathieu Desnoyers + +[ Upstream commit dec890089bf79a4954b61482715ee2d084364856 ] + +The following race can cause rb_time_read() to observe a corrupted time +stamp: + +rb_time_cmpxchg() +[...] + if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) + return false; + if (!rb_time_read_cmpxchg(&t->top, top, top2)) + return false; + +__rb_time_read() +[...] + do { + c = local_read(&t->cnt); + top = local_read(&t->top); + bottom = local_read(&t->bottom); + msb = local_read(&t->msb); + } while (c != local_read(&t->cnt)); + + *cnt = rb_time_cnt(top); + + /* If top and msb counts don't match, this interrupted a write */ + if (*cnt != rb_time_cnt(msb)) + return false; + ^ this check fails to catch that "bottom" is still not updated. + +So the old "bottom" value is returned, which is wrong. + +Fix this by checking that all three of msb, top, and bottom 2-bit cnt +values match. + +The reason to favor checking all three fields over requiring a specific +update order for both rb_time_set() and rb_time_cmpxchg() is because +checking all three fields is more robust to handle partial failures of +rb_time_cmpxchg() when interrupted by nested rb_time_set(). + +Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/ +Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com + +Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()") +Signed-off-by: Mathieu Desnoyers +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/ring_buffer.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c +index af08a1a411e3d..070566baa0ca4 100644 +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt) + + *cnt = rb_time_cnt(top); + +- /* If top and msb counts don't match, this interrupted a write */ +- if (*cnt != rb_time_cnt(msb)) ++ /* If top, msb or bottom counts don't match, this interrupted a write */ ++ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom)) + return false; + + /* The shift to msb will lose its cnt bits */ +-- +2.43.0 + diff --git a/queue-6.6/ring-buffer-fix-slowpath-of-interrupted-event.patch b/queue-6.6/ring-buffer-fix-slowpath-of-interrupted-event.patch new file mode 100644 index 00000000000..65896c39536 --- /dev/null +++ b/queue-6.6/ring-buffer-fix-slowpath-of-interrupted-event.patch @@ -0,0 +1,279 @@ +From e3007e07c2e381d60867646d35be72d98b55755b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 18 Dec 2023 23:07:12 -0500 +Subject: ring-buffer: Fix slowpath of interrupted event + +From: Steven Rostedt (Google) + +[ Upstream commit b803d7c664d55705831729d2f2e29c874bcd62ea ] + +To synchronize the timestamps with the ring buffer reservation, there are +two timestamps that are saved in the buffer meta data. + +1. before_stamp +2. write_stamp + +When the two are equal, the write_stamp is considered valid, as in, it may +be used to calculate the delta of the next event as the write_stamp is the +timestamp of the previous reserved event on the buffer. + +This is done by the following: + + /*A*/ w = current position on the ring buffer + before = before_stamp + after = write_stamp + ts = read current timestamp + + if (before != after) { + write_stamp is not valid, force adding an absolute + timestamp. + } + + /*B*/ before_stamp = ts + + /*C*/ write = local_add_return(event length, position on ring buffer) + + if (w == write - event length) { + /* Nothing interrupted between A and C */ + /*E*/ write_stamp = ts; + delta = ts - after + /* + * If nothing interrupted again, + * before_stamp == write_stamp and write_stamp + * can be used to calculate the delta for + * events that come in after this one. + */ + } else { + + /* + * The slow path! + * Was interrupted between A and C. + */ + +This is the place that there's a bug. We currently have: + + after = write_stamp + ts = read current timestamp + + /*F*/ if (write == current position on the ring buffer && + after < ts && cmpxchg(write_stamp, after, ts)) { + + delta = ts - after; + + } else { + delta = 0; + } + +The assumption is that if the current position on the ring buffer hasn't +moved between C and F, then it also was not interrupted, and that the last +event written has a timestamp that matches the write_stamp. That is the +write_stamp is valid. + +But this may not be the case: + +If a task context event was interrupted by softirq between B and C. + +And the softirq wrote an event that got interrupted by a hard irq between +C and E. + +and the hard irq wrote an event (does not need to be interrupted) + +We have: + + /*B*/ before_stamp = ts of normal context + + ---> interrupted by softirq + + /*B*/ before_stamp = ts of softirq context + + ---> interrupted by hardirq + + /*B*/ before_stamp = ts of hard irq context + /*E*/ write_stamp = ts of hard irq context + + /* matches and write_stamp valid */ + <---- + + /*E*/ write_stamp = ts of softirq context + + /* No longer matches before_stamp, write_stamp is not valid! */ + + <--- + + w != write - length, go to slow path + +// Right now the order of events in the ring buffer is: +// +// |-- softirq event --|-- hard irq event --|-- normal context event --| +// + + after = write_stamp (this is the ts of softirq) + ts = read current timestamp + + if (write == current position on the ring buffer [true] && + after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) { + + delta = ts - after [Wrong!] + +The delta is to be between the hard irq event and the normal context +event, but the above logic made the delta between the softirq event and +the normal context event, where the hard irq event is between the two. This +will shift all the remaining event timestamps on the sub-buffer +incorrectly. + +The write_stamp is only valid if it matches the before_stamp. The cmpxchg +does nothing to help this. + +Instead, the following logic can be done to fix this: + + before = before_stamp + ts = read current timestamp + before_stamp = ts + + after = write_stamp + + if (write == current position on the ring buffer && + after == before && after < ts) { + + delta = ts - after + + } else { + delta = 0; + } + +The above will only use the write_stamp if it still matches before_stamp +and was tested to not have changed since C. + +As a bonus, with this logic we do not need any 64-bit cmpxchg() at all! + +This means the 32-bit rb_time_t workaround can finally be removed. But +that's for a later time. + +Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/ +Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Linus Torvalds +Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/ring_buffer.c | 79 ++++++++++++-------------------------- + 1 file changed, 24 insertions(+), 55 deletions(-) + +diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c +index 4c97160ab9c15..783a500e89c58 100644 +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -700,48 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set) + return local_try_cmpxchg(l, &expect, set); + } + +-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +-{ +- unsigned long cnt, top, bottom, msb; +- unsigned long cnt2, top2, bottom2, msb2; +- u64 val; +- +- /* Any interruptions in this function should cause a failure */ +- cnt = local_read(&t->cnt); +- +- /* The cmpxchg always fails if it interrupted an update */ +- if (!__rb_time_read(t, &val, &cnt2)) +- return false; +- +- if (val != expect) +- return false; +- +- if ((cnt & 3) != cnt2) +- return false; +- +- cnt2 = cnt + 1; +- +- rb_time_split(val, &top, &bottom, &msb); +- msb = rb_time_val_cnt(msb, cnt); +- top = rb_time_val_cnt(top, cnt); +- bottom = rb_time_val_cnt(bottom, cnt); +- +- rb_time_split(set, &top2, &bottom2, &msb2); +- msb2 = rb_time_val_cnt(msb2, cnt); +- top2 = rb_time_val_cnt(top2, cnt2); +- bottom2 = rb_time_val_cnt(bottom2, cnt2); +- +- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2)) +- return false; +- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2)) +- return false; +- if (!rb_time_read_cmpxchg(&t->top, top, top2)) +- return false; +- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2)) +- return false; +- return true; +-} +- + #else /* 64 bits */ + + /* local64_t always succeeds */ +@@ -755,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val) + { + local64_set(&t->time, val); + } +- +-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set) +-{ +- return local64_try_cmpxchg(&t->time, &expect, set); +-} + #endif + + /* +@@ -3610,20 +3563,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + } else { + u64 ts; + /* SLOW PATH - Interrupted between A and C */ +- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); +- /* Was interrupted before here, write_stamp must be valid */ ++ ++ /* Save the old before_stamp */ ++ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + RB_WARN_ON(cpu_buffer, !a_ok); ++ ++ /* ++ * Read a new timestamp and update the before_stamp to make ++ * the next event after this one force using an absolute ++ * timestamp. This is in case an interrupt were to come in ++ * between E and F. ++ */ + ts = rb_time_stamp(cpu_buffer->buffer); ++ rb_time_set(&cpu_buffer->before_stamp, ts); ++ ++ barrier(); ++ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); ++ /* Was interrupted before here, write_stamp must be valid */ ++ RB_WARN_ON(cpu_buffer, !a_ok); + barrier(); +- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && +- info->after < ts && +- rb_time_cmpxchg(&cpu_buffer->write_stamp, +- info->after, ts)) { +- /* Nothing came after this event between C and E */ ++ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && ++ info->after == info->before && info->after < ts) { ++ /* ++ * Nothing came after this event between C and F, it is ++ * safe to use info->after for the delta as it ++ * matched info->before and is still valid. ++ */ + info->delta = ts - info->after; + } else { + /* +- * Interrupted between C and E: ++ * Interrupted between C and F: + * Lost the previous events time stamp. Just set the + * delta to zero, and this will be the same time as + * the event this event interrupted. And the events that +-- +2.43.0 + diff --git a/queue-6.6/ring-buffer-remove-useless-update-to-write_stamp-in-.patch b/queue-6.6/ring-buffer-remove-useless-update-to-write_stamp-in-.patch new file mode 100644 index 00000000000..18dbd202b90 --- /dev/null +++ b/queue-6.6/ring-buffer-remove-useless-update-to-write_stamp-in-.patch @@ -0,0 +1,150 @@ +From 99aab7dafc7a92f1763cc236e46dda4bda3efc03 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Dec 2023 08:18:10 -0500 +Subject: ring-buffer: Remove useless update to write_stamp in + rb_try_to_discard() + +From: Steven Rostedt (Google) + +[ Upstream commit 083e9f65bd215582bf8f6a920db729fadf16704f ] + +When filtering is enabled, a temporary buffer is created to place the +content of the trace event output so that the filter logic can decide +from the trace event output if the trace event should be filtered out or +not. If it is to be filtered out, the content in the temporary buffer is +simply discarded, otherwise it is written into the trace buffer. + +But if an interrupt were to come in while a previous event was using that +temporary buffer, the event written by the interrupt would actually go +into the ring buffer itself to prevent corrupting the data on the +temporary buffer. If the event is to be filtered out, the event in the +ring buffer is discarded, or if it fails to discard because another event +were to have already come in, it is turned into padding. + +The update to the write_stamp in the rb_try_to_discard() happens after a +fix was made to force the next event after the discard to use an absolute +timestamp by setting the before_stamp to zero so it does not match the +write_stamp (which causes an event to use the absolute timestamp). + +But there's an effort in rb_try_to_discard() to put back the write_stamp +to what it was before the event was added. But this is useless and +wasteful because nothing is going to be using that write_stamp for +calculations as it still will not match the before_stamp. + +Remove this useless update, and in doing so, we remove another +cmpxchg64()! + +Also update the comments to reflect this change as well as remove some +extra white space in another comment. + +Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home + +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Joel Fernandes +Cc: Vincent Donnefort +Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/ring_buffer.c | 47 +++++++++----------------------------- + 1 file changed, 11 insertions(+), 36 deletions(-) + +diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c +index 070566baa0ca4..4c97160ab9c15 100644 +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -2987,25 +2987,6 @@ static unsigned rb_calculate_event_length(unsigned length) + return length; + } + +-static u64 rb_time_delta(struct ring_buffer_event *event) +-{ +- switch (event->type_len) { +- case RINGBUF_TYPE_PADDING: +- return 0; +- +- case RINGBUF_TYPE_TIME_EXTEND: +- return rb_event_time_stamp(event); +- +- case RINGBUF_TYPE_TIME_STAMP: +- return 0; +- +- case RINGBUF_TYPE_DATA: +- return event->time_delta; +- default: +- return 0; +- } +-} +- + static inline bool + rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +@@ -3013,8 +2994,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long addr; +- u64 write_stamp; +- u64 delta; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_ts_length(event); +@@ -3023,14 +3002,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + + bpage = READ_ONCE(cpu_buffer->tail_page); + +- delta = rb_time_delta(event); +- +- if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp)) +- return false; +- +- /* Make sure the write stamp is read before testing the location */ +- barrier(); +- ++ /* ++ * Make sure the tail_page is still the same and ++ * the next write location is the end of this event ++ */ + if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { + unsigned long write_mask = + local_read(&bpage->write) & ~RB_WRITE_MASK; +@@ -3041,20 +3016,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, + * to make sure that the next event adds an absolute + * value and does not rely on the saved write stamp, which + * is now going to be bogus. ++ * ++ * By setting the before_stamp to zero, the next event ++ * is not going to use the write_stamp and will instead ++ * create an absolute timestamp. This means there's no ++ * reason to update the wirte_stamp! + */ + rb_time_set(&cpu_buffer->before_stamp, 0); + +- /* Something came in, can't discard */ +- if (!rb_time_cmpxchg(&cpu_buffer->write_stamp, +- write_stamp, write_stamp - delta)) +- return false; +- + /* + * If an event were to come in now, it would see that the + * write_stamp and the before_stamp are different, and assume + * that this event just added itself before updating + * the write stamp. The interrupting event will fix the +- * write stamp for us, and use the before stamp as its delta. ++ * write stamp for us, and use an absolute timestamp. + */ + + /* +@@ -3491,7 +3466,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + return; + + /* +- * If this interrupted another event, ++ * If this interrupted another event, + */ + if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) + goto out; +-- +2.43.0 + diff --git a/queue-6.6/series b/queue-6.6/series index 6c88b8ef6db..b303ed1e24c 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -129,3 +129,6 @@ smb-client-fix-oob-in-smbcalcsize.patch drm-i915-reject-async-flips-with-bigjoiner.patch drm-i915-dmc-don-t-enable-any-pipe-dmc-events.patch 9p-prevent-read-overrun-in-protocol-dump-tracepoint.patch +ring-buffer-fix-32-bit-rb_time_read-race-with-rb_tim.patch +ring-buffer-remove-useless-update-to-write_stamp-in-.patch +ring-buffer-fix-slowpath-of-interrupted-event.patch