git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 3 Jan 2024 10:19:07 +0000 (11:19 +0100)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Wed, 3 Jan 2024 10:19:07 +0000 (11:19 +0100)
added patches:
ftrace-fix-modification-of-direct_function-hash-while-in-use.patch
ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch
ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch
tracing-fix-blocked-reader-of-snapshot-buffer.patch

queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch [new file with mode: 0644]
queue-6.1/ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch [new file with mode: 0644]
queue-6.1/ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/tracing-fix-blocked-reader-of-snapshot-buffer.patch [new file with mode: 0644]

diff --git a/queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch b/queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch
new file mode 100644
index 0000000..f661d8f
--- /dev/null
+++ b/queue-6.1/ftrace-fix-modification-of-direct_function-hash-while-in-use.patch
@@ -0,0 +1,304 @@
+From d05cb470663a2a1879277e544f69e660208f08f2 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Fri, 29 Dec 2023 11:51:34 -0500
+Subject: ftrace: Fix modification of direct_function hash while in use
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit d05cb470663a2a1879277e544f69e660208f08f2 upstream.
+
+Masami Hiramatsu reported a memory leak in register_ftrace_direct() that
+occurs if the number of new entries added is large enough to cause two
+allocations in the loop:
+
+        for (i = 0; i < size; i++) {
+                hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+                        new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
+                        if (!new)
+                                goto out_remove;
+                        entry->direct = addr;
+                }
+        }
+
+Where ftrace_add_rec_direct() has:
+
+        if (ftrace_hash_empty(direct_functions) ||
+            direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+                struct ftrace_hash *new_hash;
+                int size = ftrace_hash_empty(direct_functions) ? 0 :
+                        direct_functions->count + 1;
+
+                if (size < 32)
+                        size = 32;
+
+                new_hash = dup_hash(direct_functions, size);
+                if (!new_hash)
+                        return NULL;
+
+                *free_hash = direct_functions;
+                direct_functions = new_hash;
+        }
+
+The "*free_hash = direct_functions;" can happen twice, losing the previous
+allocation of direct_functions.
+
+But this also exposed a more serious bug.
+
+The modification of direct_functions above is not safe. As
+direct_functions can be referenced at any time to find what direct caller
+it should call, the time between:
+
+                new_hash = dup_hash(direct_functions, size);
+ and
+                direct_functions = new_hash;
+
+can have a race with another CPU (or even this one if it gets interrupted),
+and the entries being moved to the new hash are not referenced.
+
+That's because "dup_hash()" is misnamed: it is really a "move_hash()" that
+moves the entries from the old hash to the new one.
+
+Now even if that were changed, this code would still not be correct, as
+direct_functions should not be updated until the end. That is the best way
+to handle function reference changes, and it is the way other parts of
+ftrace handle this.
+
+The following is done:
+
+ 1. Change add_hash_entry() to return the entry it created and inserted
+    into the hash, and not just return success or not.
+
+ 2. Replace ftrace_add_rec_direct() with add_hash_entry(), and remove
+    the former.
+
+ 3. Allocate a "new_hash" at the start that is sized to hold both the new
+    hash entries and the existing entries in direct_functions.
+
+ 4. Copy (not move) the direct_function entries over to the new_hash.
+
+ 5. Copy the entries of the added hash to the new_hash.
+
+ 6. If everything succeeds, then use rcu_assign_pointer() to update
+    direct_functions with the new_hash.
+
+This simplifies the code and fixes both the memory leak as well as the
+race condition mentioned above.
+
+Link: https://lore.kernel.org/all/170368070504.42064.8960569647118388081.stgit@devnote2/
+Link: https://lore.kernel.org/linux-trace-kernel/20231229115134.08dd5174@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Jiri Olsa <jolsa@kernel.org>
+Cc: Alexei Starovoitov <ast@kernel.org>
+Cc: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Fixes: 763e34e74bb7d ("ftrace: Add register_ftrace_direct()")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ftrace.c |  100 +++++++++++++++++++++++---------------------------
+ 1 file changed, 47 insertions(+), 53 deletions(-)
+
+--- a/kernel/trace/ftrace.c
++++ b/kernel/trace/ftrace.c
+@@ -1152,18 +1152,19 @@ static void __add_hash_entry(struct ftra
+       hash->count++;
+ }
+-static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
++static struct ftrace_func_entry *
++add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+ {
+       struct ftrace_func_entry *entry;
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+-              return -ENOMEM;
++              return NULL;
+       entry->ip = ip;
+       __add_hash_entry(hash, entry);
+-      return 0;
++      return entry;
+ }
+ static void
+@@ -1318,7 +1319,6 @@ alloc_and_copy_ftrace_hash(int size_bits
+       struct ftrace_func_entry *entry;
+       struct ftrace_hash *new_hash;
+       int size;
+-      int ret;
+       int i;
+       new_hash = alloc_ftrace_hash(size_bits);
+@@ -1335,8 +1335,7 @@ alloc_and_copy_ftrace_hash(int size_bits
+       size = 1 << hash->size_bits;
+       for (i = 0; i < size; i++) {
+               hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+-                      ret = add_hash_entry(new_hash, entry->ip);
+-                      if (ret < 0)
++                      if (add_hash_entry(new_hash, entry->ip) == NULL)
+                               goto free_hash;
+               }
+       }
+@@ -2439,7 +2438,7 @@ ftrace_find_tramp_ops_new(struct dyn_ftr
+ #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ /* Protected by rcu_tasks for reading, and direct_mutex for writing */
+-static struct ftrace_hash *direct_functions = EMPTY_HASH;
++static struct ftrace_hash __rcu *direct_functions = EMPTY_HASH;
+ static DEFINE_MUTEX(direct_mutex);
+ int ftrace_direct_func_count;
+@@ -2458,39 +2457,6 @@ unsigned long ftrace_find_rec_direct(uns
+       return entry->direct;
+ }
+-static struct ftrace_func_entry*
+-ftrace_add_rec_direct(unsigned long ip, unsigned long addr,
+-                    struct ftrace_hash **free_hash)
+-{
+-      struct ftrace_func_entry *entry;
+-
+-      if (ftrace_hash_empty(direct_functions) ||
+-          direct_functions->count > 2 * (1 << direct_functions->size_bits)) {
+-              struct ftrace_hash *new_hash;
+-              int size = ftrace_hash_empty(direct_functions) ? 0 :
+-                      direct_functions->count + 1;
+-
+-              if (size < 32)
+-                      size = 32;
+-
+-              new_hash = dup_hash(direct_functions, size);
+-              if (!new_hash)
+-                      return NULL;
+-
+-              *free_hash = direct_functions;
+-              direct_functions = new_hash;
+-      }
+-
+-      entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+-      if (!entry)
+-              return NULL;
+-
+-      entry->ip = ip;
+-      entry->direct = addr;
+-      __add_hash_entry(direct_functions, entry);
+-      return entry;
+-}
+-
+ static void call_direct_funcs(unsigned long ip, unsigned long pip,
+                             struct ftrace_ops *ops, struct ftrace_regs *fregs)
+ {
+@@ -4065,8 +4031,8 @@ enter_record(struct ftrace_hash *hash, s
+               /* Do nothing if it exists */
+               if (entry)
+                       return 0;
+-
+-              ret = add_hash_entry(hash, rec->ip);
++              if (add_hash_entry(hash, rec->ip) == NULL)
++                      ret = -ENOMEM;
+       }
+       return ret;
+ }
+@@ -5107,7 +5073,8 @@ __ftrace_match_addr(struct ftrace_hash *
+               return 0;
+       }
+-      return add_hash_entry(hash, ip);
++      entry = add_hash_entry(hash, ip);
++      return entry ? 0 :  -ENOMEM;
+ }
+ static int
+@@ -5633,7 +5600,7 @@ static void remove_direct_functions_hash
+  */
+ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+ {
+-      struct ftrace_hash *hash, *free_hash = NULL;
++      struct ftrace_hash *hash, *new_hash = NULL, *free_hash = NULL;
+       struct ftrace_func_entry *entry, *new;
+       int err = -EBUSY, size, i;
+@@ -5659,34 +5626,61 @@ int register_ftrace_direct_multi(struct
+               }
+       }
+-      /* ... and insert them to direct_functions hash. */
+       err = -ENOMEM;
++
++      /* Make a copy hash to place the new and the old entries in */
++      size = hash->count + direct_functions->count;
++      if (size > 32)
++              size = 32;
++      new_hash = alloc_ftrace_hash(fls(size));
++      if (!new_hash)
++              goto out_unlock;
++
++      /* Now copy over the existing direct entries */
++      size = 1 << direct_functions->size_bits;
++      for (i = 0; i < size; i++) {
++              hlist_for_each_entry(entry, &direct_functions->buckets[i], hlist) {
++                      new = add_hash_entry(new_hash, entry->ip);
++                      if (!new)
++                              goto out_unlock;
++                      new->direct = entry->direct;
++              }
++      }
++
++      /* ... and add the new entries */
++      size = 1 << hash->size_bits;
+       for (i = 0; i < size; i++) {
+               hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+-                      new = ftrace_add_rec_direct(entry->ip, addr, &free_hash);
++                      new = add_hash_entry(new_hash, entry->ip);
+                       if (!new)
+-                              goto out_remove;
++                              goto out_unlock;
++                      /* Update both the copy and the hash entry */
++                      new->direct = addr;
+                       entry->direct = addr;
+               }
+       }
++      free_hash = direct_functions;
++      rcu_assign_pointer(direct_functions, new_hash);
++      new_hash = NULL;
++
+       ops->func = call_direct_funcs;
+       ops->flags = MULTI_FLAGS;
+       ops->trampoline = FTRACE_REGS_ADDR;
+       err = register_ftrace_function_nolock(ops);
+- out_remove:
+-      if (err)
+-              remove_direct_functions_hash(hash, addr);
+-
+  out_unlock:
+       mutex_unlock(&direct_mutex);
+-      if (free_hash) {
++      if (free_hash && free_hash != EMPTY_HASH) {
+               synchronize_rcu_tasks();
+               free_ftrace_hash(free_hash);
+       }
++
++      if (new_hash)
++              free_ftrace_hash(new_hash);
++
+       return err;
+ }
+ EXPORT_SYMBOL_GPL(register_ftrace_direct_multi);
+@@ -6510,7 +6504,7 @@ ftrace_graph_set_hash(struct ftrace_hash
+                               if (entry)
+                                       continue;
+-                              if (add_hash_entry(hash, rec->ip) < 0)
++                              if (add_hash_entry(hash, rec->ip) == NULL)
+                                       goto out;
+                       } else {
+                               if (entry) {
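
The fix above follows a copy-then-publish scheme: build the complete
replacement hash off to the side, publish it with a single
rcu_assign_pointer(), and free the old hash only after a grace period.
Below is a minimal user-space sketch of that scheme, with C11
release/acquire atomics standing in for the kernel's RCU primitives; the
table type, helper names and capacity are illustrative, not the kernel's.

        /* copy_publish.c - sketch of the copy-then-publish update.
         * ASSUMPTIONS: a trivial fixed-size "table" stands in for
         * struct ftrace_hash, a C11 release store models
         * rcu_assign_pointer(), and free(old) stands in for freeing
         * after synchronize_rcu_tasks(). */
        #include <stdatomic.h>
        #include <stdio.h>
        #include <stdlib.h>
        #include <string.h>

        struct table {
                size_t nents;
                unsigned long ents[64];
        };

        /* Readers would load this with acquire semantics (the kernel
         * uses rcu_dereference() under the tasks-RCU read side). */
        static _Atomic(struct table *) live_table;

        static int publish_new_entries(const unsigned long *add, size_t nadd)
        {
                struct table *old = atomic_load_explicit(&live_table,
                                                         memory_order_acquire);
                struct table *new = calloc(1, sizeof(*new));

                if (!new)
                        return -1;
                if ((old ? old->nents : 0) + nadd > 64) {
                        free(new);
                        return -1;      /* fixed demo capacity exceeded */
                }

                /* 1) Copy (not move) every existing entry into the copy. */
                if (old) {
                        memcpy(new->ents, old->ents,
                               old->nents * sizeof(new->ents[0]));
                        new->nents = old->nents;
                }

                /* 2) Add the new entries to the copy only. */
                for (size_t i = 0; i < nadd; i++)
                        new->ents[new->nents++] = add[i];

                /* 3) Publish the fully built table with one release store,
                 *    the role rcu_assign_pointer() plays in the patch. */
                atomic_store_explicit(&live_table, new, memory_order_release);

                /* 4) Free the old table only once no reader can still hold
                 *    it; this sketch has no concurrent readers. */
                free(old);
                return 0;
        }

        int main(void)
        {
                unsigned long ips[] = { 0x1000, 0x2000 };

                if (publish_new_entries(ips, 2) == 0)
                        printf("published %zu entries\n",
                               atomic_load(&live_table)->nents);
                return 0;
        }

A reader that dereferences the pointer at any instant therefore sees
either the complete old table or the complete new one, never a half-moved
hash, which is the property the patched register_ftrace_direct_multi()
relies on.
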
diff --git a/queue-6.1/ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch b/queue-6.1/ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch
new file mode 100644
index 0000000..eaa5ade
--- /dev/null
+++ b/queue-6.1/ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch
@@ -0,0 +1,73 @@
+From 623b1f896fa8a669a277ee5a258307a16c7377a3 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Tue, 26 Dec 2023 12:59:02 -0500
+Subject: ring-buffer: Fix wake ups when buffer_percent is set to 100
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit 623b1f896fa8a669a277ee5a258307a16c7377a3 upstream.
+
+The tracefs file "buffer_percent" is to allow user space to set a
+water-mark on how much of the tracing ring buffer needs to be filled in
+order to wake up a blocked reader.
+
+ 0 - is to wait until any data is in the buffer
+ 1 - is to wait for 1% of the sub buffers to be filled
+ 50 - is to wait until half of the sub buffers are filled with data
+ 100 - is not to wake the waiter until the ring buffer is completely full
+
+Unfortunately the test for being full was:
+
+       dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
+       return (dirty * 100) > (full * nr_pages);
+
+Where "full" is the value for "buffer_percent".
+
+There are two issues with the above when full == 100.
+
+1. dirty * 100 > 100 * nr_pages will never be true
+   That is, the above is basically saying that if the user sets
+   buffer_percent to 100, more pages need to be dirty than exist in the
+   ring buffer!
+
+2. The page that the writer is on is never considered dirty, as dirty
+   pages are only those that are full. When the writer goes to a new
+   sub-buffer, it clears the contents of that sub-buffer.
+
+That is, even if the check were ">=", dirty would still never reach
+nr_pages, as the most pages that can be considered "dirty" is nr_pages - 1.
+
+To fix this, add one to dirty and use ">=" in the compare.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231226125902.4a057f1d@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Fixes: 03329f9939781 ("tracing: Add tracefs file buffer_percentage")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -935,9 +935,14 @@ static __always_inline bool full_hit(str
+       if (!nr_pages || !full)
+               return true;
+-      dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
++      /*
++       * Add one as dirty will never equal nr_pages, as the sub-buffer
++       * that the writer is on is not counted as dirty.
++       * This is needed if "buffer_percent" is set to 100.
++       */
++      dirty = ring_buffer_nr_dirty_pages(buffer, cpu) + 1;
+-      return (dirty * 100) > (full * nr_pages);
++      return (dirty * 100) >= (full * nr_pages);
+ }
+ /*
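
The arithmetic behind the one-line change above can be checked in
isolation. The following stand-alone program (the page counts are made up
for illustration) compares the old and the fixed full_hit() conditions at
buffer_percent == 100:

        /* full_hit_math.c - worked example of the buffer_percent compare.
         * nr_pages and dirty are illustrative values, not taken from a
         * real ring buffer. */
        #include <stdbool.h>
        #include <stdio.h>

        static bool old_check(unsigned long dirty, unsigned long nr_pages,
                              int full)
        {
                return (dirty * 100) > (full * nr_pages);
        }

        static bool new_check(unsigned long dirty, unsigned long nr_pages,
                              int full)
        {
                /* +1 accounts for the sub-buffer the writer is on, which
                 * is never counted as dirty. */
                return ((dirty + 1) * 100) >= (full * nr_pages);
        }

        int main(void)
        {
                unsigned long nr_pages = 16;
                unsigned long dirty = nr_pages - 1;     /* the most possible */
                int full = 100;                         /* buffer_percent */

                printf("old: %d  new: %d\n",
                       old_check(dirty, nr_pages, full),
                       new_check(dirty, nr_pages, full));
                return 0;
        }

With 16 sub-buffers at most 15 can ever be dirty, so the old strict
comparison is 1500 > 1600 and never fires; counting the writer's
sub-buffer and using ">=" turns it into 1600 >= 1600 and the waiter is
finally woken.
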
diff --git a/queue-6.1/ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch b/queue-6.1/ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch
new file mode 100644
index 0000000..ce0f58c
--- /dev/null
+++ b/queue-6.1/ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch
@@ -0,0 +1,144 @@
+From 083e9f65bd215582bf8f6a920db729fadf16704f Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Fri, 15 Dec 2023 08:18:10 -0500
+Subject: ring-buffer: Remove useless update to write_stamp in rb_try_to_discard()
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit 083e9f65bd215582bf8f6a920db729fadf16704f upstream.
+
+When filtering is enabled, a temporary buffer is created to place the
+content of the trace event output so that the filter logic can decide
+from the trace event output if the trace event should be filtered out or
+not. If it is to be filtered out, the content in the temporary buffer is
+simply discarded, otherwise it is written into the trace buffer.
+
+But if an interrupt were to come in while a previous event was using that
+temporary buffer, the event written by the interrupt would actually go
+into the ring buffer itself to prevent corrupting the data in the
+temporary buffer. If that event is to be filtered out, the event in the
+ring buffer is discarded, or, if it cannot be discarded because another
+event has already come in, it is turned into padding.
+
+The update to the write_stamp in rb_try_to_discard() happens after a fix
+was made to force the next event after the discard to use an absolute
+timestamp, by setting the before_stamp to zero so that it does not match
+the write_stamp (the mismatch causes an event to use an absolute timestamp).
+
+But rb_try_to_discard() still makes an effort to put the write_stamp back
+to what it was before the event was added. This is useless and wasteful,
+because nothing is going to use that write_stamp for calculations, as it
+still will not match the before_stamp.
+
+Remove this useless update, and in doing so, we remove another
+cmpxchg64()!
+
+Also update the comments to reflect this change as well as remove some
+extra white space in another comment.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231215081810.1f4f38fe@rorschach.local.home
+
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Joel Fernandes <joel@joelfernandes.org>
+Cc: Vincent Donnefort   <vdonnefort@google.com>
+Fixes: b2dd797543cf ("ring-buffer: Force absolute timestamp on discard of event")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c |   47 ++++++++++-----------------------------------
+ 1 file changed, 11 insertions(+), 36 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -2987,25 +2987,6 @@ static unsigned rb_calculate_event_lengt
+       return length;
+ }
+-static u64 rb_time_delta(struct ring_buffer_event *event)
+-{
+-      switch (event->type_len) {
+-      case RINGBUF_TYPE_PADDING:
+-              return 0;
+-
+-      case RINGBUF_TYPE_TIME_EXTEND:
+-              return rb_event_time_stamp(event);
+-
+-      case RINGBUF_TYPE_TIME_STAMP:
+-              return 0;
+-
+-      case RINGBUF_TYPE_DATA:
+-              return event->time_delta;
+-      default:
+-              return 0;
+-      }
+-}
+-
+ static inline int
+ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+                 struct ring_buffer_event *event)
+@@ -3014,8 +2995,6 @@ rb_try_to_discard(struct ring_buffer_per
+       struct buffer_page *bpage;
+       unsigned long index;
+       unsigned long addr;
+-      u64 write_stamp;
+-      u64 delta;
+       new_index = rb_event_index(event);
+       old_index = new_index + rb_event_ts_length(event);
+@@ -3024,14 +3003,10 @@ rb_try_to_discard(struct ring_buffer_per
+       bpage = READ_ONCE(cpu_buffer->tail_page);
+-      delta = rb_time_delta(event);
+-
+-      if (!rb_time_read(&cpu_buffer->write_stamp, &write_stamp))
+-              return 0;
+-
+-      /* Make sure the write stamp is read before testing the location */
+-      barrier();
+-
++      /*
++       * Make sure the tail_page is still the same and
++       * the next write location is the end of this event
++       */
+       if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+               unsigned long write_mask =
+                       local_read(&bpage->write) & ~RB_WRITE_MASK;
+@@ -3042,20 +3017,20 @@ rb_try_to_discard(struct ring_buffer_per
+                * to make sure that the next event adds an absolute
+                * value and does not rely on the saved write stamp, which
+                * is now going to be bogus.
++               *
++               * By setting the before_stamp to zero, the next event
++               * is not going to use the write_stamp and will instead
++               * create an absolute timestamp. This means there's no
++               * reason to update the write_stamp!
+                */
+               rb_time_set(&cpu_buffer->before_stamp, 0);
+-              /* Something came in, can't discard */
+-              if (!rb_time_cmpxchg(&cpu_buffer->write_stamp,
+-                                     write_stamp, write_stamp - delta))
+-                      return 0;
+-
+               /*
+                * If an event were to come in now, it would see that the
+                * write_stamp and the before_stamp are different, and assume
+                * that this event just added itself before updating
+                * the write stamp. The interrupting event will fix the
+-               * write stamp for us, and use the before stamp as its delta.
++               * write stamp for us, and use an absolute timestamp.
+                */
+               /*
+@@ -3494,7 +3469,7 @@ static void check_buffer(struct ring_buf
+               return;
+       /*
+-       * If this interrupted another event, 
++       * If this interrupted another event,
+        */
+       if (atomic_inc_return(this_cpu_ptr(&checking)) != 1)
+               goto out;
diff --git a/queue-6.1/series b/queue-6.1/series
index d64a7a726da55ac124428142214cc2338df7e943..85abc97b5b496b92bc605ccf93590f3397db2c7a 100644
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -88,3 +88,7 @@ mm-filemap-avoid-buffered-read-write-race-to-read-inconsistent-data.patch
 mm-migrate-high-order-folios-in-swap-cache-correctly.patch
 mm-memory-failure-cast-index-to-loff_t-before-shifting-it.patch
 mm-memory-failure-check-the-mapcount-of-the-precise-page.patch
+ring-buffer-fix-wake-ups-when-buffer_percent-is-set-to-100.patch
+ftrace-fix-modification-of-direct_function-hash-while-in-use.patch
+tracing-fix-blocked-reader-of-snapshot-buffer.patch
+ring-buffer-remove-useless-update-to-write_stamp-in-rb_try_to_discard.patch
diff --git a/queue-6.1/tracing-fix-blocked-reader-of-snapshot-buffer.patch b/queue-6.1/tracing-fix-blocked-reader-of-snapshot-buffer.patch
new file mode 100644
index 0000000..4bdc4b4
--- /dev/null
+++ b/queue-6.1/tracing-fix-blocked-reader-of-snapshot-buffer.patch
@@ -0,0 +1,105 @@
+From 39a7dc23a1ed0fe81141792a09449d124c5953bd Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Thu, 28 Dec 2023 09:51:49 -0500
+Subject: tracing: Fix blocked reader of snapshot buffer
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit 39a7dc23a1ed0fe81141792a09449d124c5953bd upstream.
+
+If an application blocks on the snapshot or snapshot_raw files, expecting
+to be woken up when a snapshot occurs, the wake up will not happen. Or it
+may happen with an unexpected result.
+
+That result is that the application will be reading the main buffer
+instead of the snapshot buffer. That is because when the snapshot occurs,
+the main and snapshot buffers are swapped. But the reader has a descriptor
+still pointing to the buffer that it originally connected to.
+
+This is fine for the main buffer readers, as they may be blocked waiting
+for a watermark to be hit, and when a snapshot occurs, the data that the
+main readers want is now on the snapshot buffer.
+
+But for waiters of the snapshot buffer, they are waiting for an event to
+occur that will trigger the snapshot and they can then consume it quickly
+to save the snapshot before the next snapshot occurs. But to do this, they
+need to read the new snapshot buffer, not the old one that is now
+receiving new data.
+
+Also, it does not make sense to have a watermark "buffer_percent" on the
+snapshot buffer, as the snapshot buffer is static and does not receive new
+data except all at once.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20231228095149.77f5b45d@gandalf.local.home
+
+Cc: stable@vger.kernel.org
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Acked-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
+Fixes: debdd57f5145f ("tracing: Make a snapshot feature available from userspace")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c |    3 ++-
+ kernel/trace/trace.c       |   20 +++++++++++++++++---
+ 2 files changed, 19 insertions(+), 4 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -1002,7 +1002,8 @@ void ring_buffer_wake_waiters(struct tra
+       /* make sure the waiters see the new index */
+       smp_wmb();
+-      rb_wake_up_waiters(&rbwork->work);
++      /* This can be called in any context */
++      irq_work_queue(&rbwork->work);
+ }
+ /**
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -1850,6 +1850,9 @@ update_max_tr(struct trace_array *tr, st
+       __update_max_tr(tr, tsk, cpu);
+       arch_spin_unlock(&tr->max_lock);
++
++      /* Any waiters on the old snapshot buffer need to wake up */
++      ring_buffer_wake_waiters(tr->array_buffer.buffer, RING_BUFFER_ALL_CPUS);
+ }
+ /**
+@@ -1901,12 +1904,23 @@ update_max_tr_single(struct trace_array
+ static int wait_on_pipe(struct trace_iterator *iter, int full)
+ {
++      int ret;
++
+       /* Iterators are static, they should be filled or empty */
+       if (trace_buffer_iter(iter, iter->cpu_file))
+               return 0;
+-      return ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file,
+-                              full);
++      ret = ring_buffer_wait(iter->array_buffer->buffer, iter->cpu_file, full);
++
++#ifdef CONFIG_TRACER_MAX_TRACE
++      /*
++       * Make sure this is still the snapshot buffer, as if a snapshot were
++       * to happen, this would now be the main buffer.
++       */
++      if (iter->snapshot)
++              iter->array_buffer = &iter->tr->max_buffer;
++#endif
++      return ret;
+ }
+ #ifdef CONFIG_FTRACE_STARTUP_TEST
+@@ -8433,7 +8447,7 @@ tracing_buffers_splice_read(struct file
+               wait_index = READ_ONCE(iter->wait_index);
+-              ret = wait_on_pipe(iter, iter->tr->buffer_percent);
++              ret = wait_on_pipe(iter, iter->snapshot ? 0 : iter->tr->buffer_percent);
+               if (ret)
+                       goto out;
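
The essence of the fix above is that a blocked snapshot reader must
re-resolve which buffer it should be consuming after the wait returns,
because a snapshot swaps the main and snapshot buffers underneath it.
Below is a simplified stand-alone sketch of that re-resolution; the
structures and the wait helper are stand-ins for struct trace_array and
wait_on_pipe(), not the kernel's definitions.

        /* snapshot_reattach.c - sketch of re-resolving the snapshot buffer
         * after a wait. All types and the "wait" below are simplified
         * stand-ins, not the kernel API. */
        #include <stdio.h>

        struct buffer { const char *name; };

        struct trace_array {
                struct buffer *array_buffer;    /* live (main) buffer */
                struct buffer *max_buffer;      /* snapshot buffer */
        };

        struct iterator {
                struct trace_array *tr;
                struct buffer *buf;     /* buffer the reader is attached to */
                int snapshot;           /* reader opened the snapshot file */
        };

        /* A snapshot swaps the two buffers underneath any sleeping reader. */
        static void take_snapshot(struct trace_array *tr)
        {
                struct buffer *tmp = tr->array_buffer;

                tr->array_buffer = tr->max_buffer;
                tr->max_buffer = tmp;
        }

        /* Stand-in for wait_on_pipe(): once the (pretend) wait returns, a
         * snapshot reader re-resolves the snapshot buffer, mirroring
         * iter->array_buffer = &iter->tr->max_buffer in the patch. */
        static void wait_for_data(struct iterator *iter)
        {
                /* ...sleep until woken; a snapshot may happen meanwhile... */
                if (iter->snapshot)
                        iter->buf = iter->tr->max_buffer;
        }

        int main(void)
        {
                struct buffer a = { "buffer A" }, b = { "buffer B" };
                struct trace_array tr = { .array_buffer = &a, .max_buffer = &b };
                struct iterator iter = { .tr = &tr, .buf = &b, .snapshot = 1 };

                take_snapshot(&tr);     /* A and B swap roles */
                wait_for_data(&iter);   /* reader re-attaches to the snapshot */
                printf("reading %s\n", iter.buf->name); /* prints "buffer A" */
                return 0;
        }

Without that re-resolution the reader would keep consuming the buffer it
originally attached to, which after the swap is the live main buffer
rather than the saved snapshot.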