--- /dev/null
+From 9c8977245a32c9a32567b935df66321b6b043bbf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Dec 2023 14:30:49 -0500
+Subject: ring-buffer: Fix 32-bit rb_time_read() race with rb_time_cmpxchg()
+
+From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+
+[ Upstream commit dec890089bf79a4954b61482715ee2d084364856 ]
+
+The following race can cause rb_time_read() to observe a corrupted time
+stamp:
+
+rb_time_cmpxchg()
+[...]
+ if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+ return false;
+ if (!rb_time_read_cmpxchg(&t->top, top, top2))
+ return false;
+<interrupted before updating bottom>
+__rb_time_read()
+[...]
+ do {
+ c = local_read(&t->cnt);
+ top = local_read(&t->top);
+ bottom = local_read(&t->bottom);
+ msb = local_read(&t->msb);
+ } while (c != local_read(&t->cnt));
+
+ *cnt = rb_time_cnt(top);
+
+ /* If top and msb counts don't match, this interrupted a write */
+ if (*cnt != rb_time_cnt(msb))
+ return false;
+ ^ this check fails to catch that "bottom" is still not updated.
+
+So the old "bottom" value is returned, which is wrong.
+
+Fix this by checking that all three of msb, top, and bottom 2-bit cnt
+values match.
+
+The reason to favor checking all three fields over requiring a specific
+update order for both rb_time_set() and rb_time_cmpxchg() is that
+checking all three fields is more robust in handling partial failures of
+rb_time_cmpxchg() when interrupted by a nested rb_time_set().
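+
+For reference, here is a sketch of the 32-bit representation (simplified
+from kernel/trace/ring_buffer.c around this version; illustrative, not
+the exact code):
+
+	/*
+	 * A 64-bit timestamp is split across three 32-bit fields, and
+	 * each field carries a 2-bit "cnt" tag in its upper bits:
+	 *
+	 *    msb:    [ cnt | timestamp bits 63:60 ]
+	 *    top:    [ cnt | timestamp bits 59:30 ]
+	 *    bottom: [ cnt | timestamp bits 29:0  ]
+	 */
+	struct rb_time_struct {
+		local_t	cnt;
+		local_t	top;
+		local_t	bottom;
+		local_t	msb;
+	};
+
+	/* Extract the 2-bit tag from one 32-bit field (RB_TIME_SHIFT is 30) */
+	static inline unsigned long rb_time_cnt(unsigned long val)
+	{
+		return (val >> RB_TIME_SHIFT) & 3;
+	}
+
+A read is coherent only when the tags of msb, top, and bottom all match;
+the hunk below makes __rb_time_read() enforce exactly that.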
+
+Link: https://lore.kernel.org/lkml/20231211201324.652870-1-mathieu.desnoyers@efficios.com/
+Link: https://lore.kernel.org/linux-trace-kernel/20231212193049.680122-1-mathieu.desnoyers@efficios.com
+
+Fixes: f458a1453424e ("ring-buffer: Test last update in 32bit version of __rb_time_read()")
+Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/ring_buffer.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
+index af08a1a411e3d..070566baa0ca4 100644
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -644,8 +644,8 @@ static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
+
+ *cnt = rb_time_cnt(top);
+
+- /* If top and msb counts don't match, this interrupted a write */
+- if (*cnt != rb_time_cnt(msb))
++ /* If top, msb or bottom counts don't match, this interrupted a write */
++ if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
+ return false;
+
+ /* The shift to msb will lose its cnt bits */
+--
+2.43.0
+
--- /dev/null
+From e3007e07c2e381d60867646d35be72d98b55755b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 18 Dec 2023 23:07:12 -0500
+Subject: ring-buffer: Fix slowpath of interrupted event
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit b803d7c664d55705831729d2f2e29c874bcd62ea ]
+
+To synchronize the timestamps with the ring buffer reservation, there are
+two timestamps that are saved in the buffer meta data.
+
+1. before_stamp
+2. write_stamp
+
+When the two are equal, the write_stamp is considered valid; that is, it
+may be used to calculate the delta of the next event, as the write_stamp
+is the timestamp of the previously reserved event on the buffer.
+
+This is done by the following:
+
+ /*A*/ w = current position on the ring buffer
+ before = before_stamp
+ after = write_stamp
+ ts = read current timestamp
+
+ if (before != after) {
+ write_stamp is not valid, force adding an absolute
+ timestamp.
+ }
+
+ /*B*/ before_stamp = ts
+
+ /*C*/ write = local_add_return(event length, position on ring buffer)
+
+ if (w == write - event length) {
+ /* Nothing interrupted between A and C */
+ /*E*/ write_stamp = ts;
+ delta = ts - after
+ /*
+ * If nothing interrupted again,
+ * before_stamp == write_stamp and write_stamp
+ * can be used to calculate the delta for
+ * events that come in after this one.
+ */
+ } else {
+
+ /*
+ * The slow path!
+ * Was interrupted between A and C.
+ */
+
+This is the place where the bug is. We currently have:
+
+ after = write_stamp
+ ts = read current timestamp
+
+ /*F*/ if (write == current position on the ring buffer &&
+ after < ts && cmpxchg(write_stamp, after, ts)) {
+
+ delta = ts - after;
+
+ } else {
+ delta = 0;
+ }
+
+The assumption is that if the current position on the ring buffer hasn't
+moved between C and F, then it also was not interrupted, and that the
+last event written has a timestamp that matches the write_stamp. That
+is, the write_stamp is valid.
+
+But this may not be the case:
+
+Suppose a task context event is interrupted by a softirq between B and C.
+
+The softirq writes an event that is itself interrupted by a hard irq
+between C and E.
+
+And the hard irq writes an event (which does not itself need to be
+interrupted).
+
+We have:
+
+ /*B*/ before_stamp = ts of normal context
+
+ ---> interrupted by softirq
+
+ /*B*/ before_stamp = ts of softirq context
+
+ ---> interrupted by hardirq
+
+ /*B*/ before_stamp = ts of hard irq context
+ /*E*/ write_stamp = ts of hard irq context
+
+ /* matches and write_stamp valid */
+ <----
+
+ /*E*/ write_stamp = ts of softirq context
+
+ /* No longer matches before_stamp, write_stamp is not valid! */
+
+ <---
+
+ w != write - length, go to slow path
+
+// Right now the order of events in the ring buffer is:
+//
+// |-- softirq event --|-- hard irq event --|-- normal context event --|
+//
+
+ after = write_stamp (this is the ts of softirq)
+ ts = read current timestamp
+
+ if (write == current position on the ring buffer [true] &&
+ after < ts [true] && cmpxchg(write_stamp, after, ts) [true]) {
+
+ delta = ts - after [Wrong!]
+
+The delta should be between the hard irq event and the normal context
+event, but the above logic computes the delta between the softirq event
+and the normal context event, even though the hard irq event sits
+between the two. This incorrectly shifts all the remaining event
+timestamps on the sub-buffer.
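+
+As a worked example (with purely illustrative numbers): say the softirq
+event's write_stamp is 100, the hard irq event is stamped 110, and the
+normal context reads ts = 120. The logic above computes
+
+	delta = ts - after = 120 - 100 = 20
+
+when the correct delta, measured from the hard irq event that directly
+precedes this event in the buffer, is 120 - 110 = 10.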
+
+The write_stamp is only valid if it matches the before_stamp. The cmpxchg
+does nothing to help this.
+
+Instead, the following logic can be done to fix this:
+
+ before = before_stamp
+ ts = read current timestamp
+ before_stamp = ts
+
+ after = write_stamp
+
+ if (write == current position on the ring buffer &&
+ after == before && after < ts) {
+
+ delta = ts - after
+
+ } else {
+ delta = 0;
+ }
+
+The above only uses the write_stamp if it still matches the before_stamp
+and has been verified not to have changed since C.
+
+As a bonus, with this logic we do not need any 64-bit cmpxchg() at all!
+
+This means the 32-bit rb_time_t workaround can finally be removed. But
+that's for a later time.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231218175229.58ec3daf@gandalf.local.home/
+Link: https://lore.kernel.org/linux-trace-kernel/20231218230712.3a76b081@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Fixes: dd93942570789 ("ring-buffer: Do not try to put back write_stamp")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/ring_buffer.c | 79 ++++++++++++--------------------------
+ 1 file changed, 24 insertions(+), 55 deletions(-)
+
+diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
+index 4c97160ab9c15..783a500e89c58 100644
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -700,48 +700,6 @@ rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
+ return local_try_cmpxchg(l, &expect, set);
+ }
+
+-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+-{
+- unsigned long cnt, top, bottom, msb;
+- unsigned long cnt2, top2, bottom2, msb2;
+- u64 val;
+-
+- /* Any interruptions in this function should cause a failure */
+- cnt = local_read(&t->cnt);
+-
+- /* The cmpxchg always fails if it interrupted an update */
+- if (!__rb_time_read(t, &val, &cnt2))
+- return false;
+-
+- if (val != expect)
+- return false;
+-
+- if ((cnt & 3) != cnt2)
+- return false;
+-
+- cnt2 = cnt + 1;
+-
+- rb_time_split(val, &top, &bottom, &msb);
+- msb = rb_time_val_cnt(msb, cnt);
+- top = rb_time_val_cnt(top, cnt);
+- bottom = rb_time_val_cnt(bottom, cnt);
+-
+- rb_time_split(set, &top2, &bottom2, &msb2);
+- msb2 = rb_time_val_cnt(msb2, cnt);
+- top2 = rb_time_val_cnt(top2, cnt2);
+- bottom2 = rb_time_val_cnt(bottom2, cnt2);
+-
+- if (!rb_time_read_cmpxchg(&t->cnt, cnt, cnt2))
+- return false;
+- if (!rb_time_read_cmpxchg(&t->msb, msb, msb2))
+- return false;
+- if (!rb_time_read_cmpxchg(&t->top, top, top2))
+- return false;
+- if (!rb_time_read_cmpxchg(&t->bottom, bottom, bottom2))
+- return false;
+- return true;
+-}
+-
+ #else /* 64 bits */
+
+ /* local64_t always succeeds */
+@@ -755,11 +713,6 @@ static void rb_time_set(rb_time_t *t, u64 val)
+ {
+ local64_set(&t->time, val);
+ }
+-
+-static bool rb_time_cmpxchg(rb_time_t *t, u64 expect, u64 set)
+-{
+- return local64_try_cmpxchg(&t->time, &expect, set);
+-}
+ #endif
+
+ /*
+@@ -3610,20 +3563,36 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ } else {
+ u64 ts;
+ /* SLOW PATH - Interrupted between A and C */
+- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+- /* Was interrupted before here, write_stamp must be valid */
++
++ /* Save the old before_stamp */
++ a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ RB_WARN_ON(cpu_buffer, !a_ok);
++
++ /*
++ * Read a new timestamp and update the before_stamp to make
++ * the next event after this one force using an absolute
++ * timestamp. This is in case an interrupt were to come in
++ * between E and F.
++ */
+ ts = rb_time_stamp(cpu_buffer->buffer);
++ rb_time_set(&cpu_buffer->before_stamp, ts);
++
++ barrier();
++ /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
++ /* Was interrupted before here, write_stamp must be valid */
++ RB_WARN_ON(cpu_buffer, !a_ok);
+ barrier();
+- /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
+- info->after < ts &&
+- rb_time_cmpxchg(&cpu_buffer->write_stamp,
+- info->after, ts)) {
+- /* Nothing came after this event between C and E */
++ /*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
++ info->after == info->before && info->after < ts) {
++ /*
++ * Nothing came after this event between C and F, it is
++ * safe to use info->after for the delta as it
++ * matched info->before and is still valid.
++ */
+ info->delta = ts - info->after;
+ } else {
+ /*
+- * Interrupted between C and E:
++ * Interrupted between C and F:
+ * Lost the previous events time stamp. Just set the
+ * delta to zero, and this will be the same time as
+ * the event this event interrupted. And the events that
+--
+2.43.0
+
--- /dev/null
+From 99aab7dafc7a92f1763cc236e46dda4bda3efc03 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 15 Dec 2023 08:18:10 -0500
+Subject: ring-buffer: Remove useless update to write_stamp in
+ rb_try_to_discard()
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+[ Upstream commit 083e9f65bd215582bf8f6a920db729fadf16704f ]
+
+When filtering is enabled, a temporary buffer is created to place the
+content of the trace event output so that the filter logic can decide
+from the trace event output if the trace event should be filtered out or
+not. If it is to be filtered out, the content in the temporary buffer is
+simply discarded, otherwise it is written into the trace buffer.
+
+But if an interrupt were to come in while a previous event was using that
+temporary buffer, the event written by the interrupt would actually go
+into the ring buffer itself to prevent corrupting the data on the
+temporary buffer. If the event is to be filtered out, the event in the
+ring buffer is discarded, or, if the discard fails because another event
+has already come in, it is turned into padding.
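+
+Roughly, the flow when filtering is enabled looks like this (a
+pseudo-code sketch of the idea, not the actual function names):
+
+	entry = reserve space in the per-CPU temporary buffer
+	if (!entry)	/* nested event: temporary buffer is in use */
+		entry = reserve space in the ring buffer itself
+	write the trace event output to entry
+	if (filter rejects entry)
+		discard entry	/* ring-buffer case: may become padding */
+	else if (entry is in the temporary buffer)
+		copy entry into the ring buffer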
+
+The update to the write_stamp in rb_try_to_discard() happens after a fix
+was made to force the next event after the discard to use an absolute
+timestamp, by setting the before_stamp to zero so that it does not match
+the write_stamp (a mismatch causes the next event to use an absolute
+timestamp).
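+
+In pseudo-code, the rule the next event applies is roughly (a sketch,
+not the exact kernel code):
+
+	if (before_stamp != write_stamp) {
+		/* mismatch: write_stamp cannot be trusted */
+		use an absolute timestamp for this event;
+	} else {
+		/* stamps match: a delta from write_stamp is safe */
+		delta = ts - write_stamp;
+	}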
+
+But rb_try_to_discard() still makes an effort to put the write_stamp
+back to what it was before the event was added. This is useless and
+wasteful, because nothing will use that write_stamp for calculations:
+it still will not match the before_stamp.
+
+Remove this useless update, and in doing so, we remove another
+cmpxchg64()!
+
+Also update the comments to reflect this change as well as remove some
+extra white space in another comment.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home
+
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Joel Fernandes <joel@joelfernandes.org>
+Cc: Vincent Donnefort <vdonnefort@google.com>
+Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/ring_buffer.c | 47 +++++++++-----------------------------
+ 1 file changed, 11 insertions(+), 36 deletions(-)
+
+diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
+index 070566baa0ca4..4c97160ab9c15 100644
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -2987,25 +2987,6 @@ static unsigned rb_calculate_event_length(unsigned length)
+ return length;
+ }
+
+-static u64 rb_time_delta(struct ring_buffer_event *event)
+-{
+- switch (event->type_len) {
+- case RINGBUF_TYPE_PADDING:
+- return 0;
+-
+- case RINGBUF_TYPE_TIME_EXTEND:
+- return rb_event_time_stamp(event);
+-
+- case RINGBUF_TYPE_TIME_STAMP:
+- return 0;
+-
+- case RINGBUF_TYPE_DATA:
+- return event->time_delta;
+- default:
+- return 0;
+- }
+-}
+-
+ static inline bool
+ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+@@ -3013,8 +2994,6 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long new_index, old_index;
+ struct buffer_page *bpage;
+ unsigned long addr;
+- u64 write_stamp;
+- u64 delta;
+
+ new_index = rb_event_index(event);
+ old_index = new_index + rb_event_ts_length(event);
+@@ -3023,14 +3002,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+
+ bpage = READ_ONCE(cpu_buffer->tail_page);
+
+- delta = rb_time_delta(event);
+-
+- if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+- return false;
+-
+- /* Make sure the write stamp is read before testing the location */
+- barrier();
+-
++ /*
++ * Make sure the tail_page is still the same and
++ * the next write location is the end of this event
++ */
+ if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+ unsigned long write_mask =
+ local_read(&bpage->write) & ~RB_WRITE_MASK;
+@@ -3041,20 +3016,20 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+ * to make sure that the next event adds an absolute
+ * value and does not rely on the saved write stamp, which
+ * is now going to be bogus.
++ *
++ * By setting the before_stamp to zero, the next event
++ * is not going to use the write_stamp and will instead
++ * create an absolute timestamp. This means there's no
++		 * reason to update the write_stamp!
+ */
+ rb_time_set(&cpu_buffer->before_stamp, 0);
+
+- /* Something came in, can't discard */
+- if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+- write_stamp, write_stamp - delta))
+- return false;
+-
+ /*
+ * If an event were to come in now, it would see that the
+ * write_stamp and the before_stamp are different, and assume
+ * that this event just added itself before updating
+ * the write stamp. The interrupting event will fix the
+- * write stamp for us, and use the before stamp as its delta.
++ * write stamp for us, and use an absolute timestamp.
+ */
+
+ /*
+@@ -3491,7 +3466,7 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
+ return;
+
+ /*
+- * If this interrupted another event,
++ * If this interrupted another event,
+ */
+ if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
+ goto out;
+--
+2.43.0
+