5.4-stable patches
author     Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sun, 4 Dec 2022 15:51:07 +0000 (16:51 +0100)
committer  Greg Kroah-Hartman <gregkh@linuxfoundation.org>
           Sun, 4 Dec 2022 15:51:07 +0000 (16:51 +0100)
added patches:
epoll-call-final-ep_events_available-check-under-the-lock.patch
epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch
nvme-ensure-subsystem-reset-is-single-threaded.patch
nvme-restrict-management-ioctls-to-admin.patch
tracing-ring-buffer-have-polling-block-on-watermark.patch

queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch [new file with mode: 0644]
queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch [new file with mode: 0644]
queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch [new file with mode: 0644]
queue-5.4/nvme-restrict-management-ioctls-to-admin.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch [new file with mode: 0644]

diff --git a/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch b/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch
new file mode 100644 (file)
index 0000000..275be3d
--- /dev/null
@@ -0,0 +1,124 @@
+From 65759097d804d2a9ad2b687db436319704ba7019 Mon Sep 17 00:00:00 2001
+From: Roman Penyaev <rpenyaev@suse.de>
+Date: Wed, 13 May 2020 17:50:38 -0700
+Subject: epoll: call final ep_events_available() check under the lock
+
+From: Roman Penyaev <rpenyaev@suse.de>
+
+commit 65759097d804d2a9ad2b687db436319704ba7019 upstream.
+
+There is a possible race when ep_scan_ready_list() leaves ->rdllist and
+->ovflist empty for a short period of time although some events are
+pending.  It is quite likely that ep_events_available() observes empty
+lists and goes to sleep.
+
+Since commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of
+nested epoll") we are conservative in wakeups (there is only one place
+for wakeup and this is ep_poll_callback()), thus ep_events_available()
+must always observe correct state of two lists.
+
+The easiest and correct way is to do the final check under the lock.
+This does not impact the performance, since lock is taken anyway for
+adding a wait entry to the wait queue.
+
+The discussion of the problem can be found here:
+
+   https://lore.kernel.org/linux-fsdevel/a2f22c3c-c25a-4bda-8339-a7bdaf17849e@akamai.com/
+
+In this patch barrierless __set_current_state() is used.  This is safe
+since waitqueue_active() is called under the same lock on wakeup side.
+
+Short-circuit for fatal signals (i.e.  fatal_signal_pending() check) is
+moved to the line just before actual events harvesting routine.  This is
+fully compliant to what is said in the comment of the patch where the
+actual fatal_signal_pending() check was added: c257a340ede0 ("fs, epoll:
+short circuit fetching events if thread has been killed").
+
+Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll")
+Reported-by: Jason Baron <jbaron@akamai.com>
+Reported-by: Randy Dunlap <rdunlap@infradead.org>
+Signed-off-by: Roman Penyaev <rpenyaev@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Jason Baron <jbaron@akamai.com>
+Cc: Khazhismel Kumykov <khazhy@google.com>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: <stable@vger.kernel.org>
+Link: http://lkml.kernel.org/r/20200505145609.1865152-1-rpenyaev@suse.de
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Acked-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/eventpoll.c |   47 +++++++++++++++++++++++++++--------------------
+ 1 file changed, 27 insertions(+), 20 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -1905,33 +1905,31 @@ fetch_events:
+               init_wait(&wait);
+               wait.func = ep_autoremove_wake_function;
+               write_lock_irq(&ep->lock);
+-              __add_wait_queue_exclusive(&ep->wq, &wait);
+-              write_unlock_irq(&ep->lock);
+-
+               /*
+-               * We don't want to sleep if the ep_poll_callback() sends us
+-               * a wakeup in between. That's why we set the task state
+-               * to TASK_INTERRUPTIBLE before doing the checks.
++               * Barrierless variant, waitqueue_active() is called under
++               * the same lock on wakeup ep_poll_callback() side, so it
++               * is safe to avoid an explicit barrier.
+                */
+-              set_current_state(TASK_INTERRUPTIBLE);
++              __set_current_state(TASK_INTERRUPTIBLE);
++
+               /*
+-               * Always short-circuit for fatal signals to allow
+-               * threads to make a timely exit without the chance of
+-               * finding more events available and fetching
+-               * repeatedly.
++               * Do the final check under the lock. ep_scan_ready_list()
++               * plays with two lists (->rdllist and ->ovflist) and there
++               * is always a race when both lists are empty for short
++               * period of time although events are pending, so lock is
++               * important.
+                */
+-              if (fatal_signal_pending(current)) {
+-                      res = -EINTR;
+-                      break;
++              eavail = ep_events_available(ep);
++              if (!eavail) {
++                      if (signal_pending(current))
++                              res = -EINTR;
++                      else
++                              __add_wait_queue_exclusive(&ep->wq, &wait);
+               }
++              write_unlock_irq(&ep->lock);
+-              eavail = ep_events_available(ep);
+-              if (eavail)
+-                      break;
+-              if (signal_pending(current)) {
+-                      res = -EINTR;
++              if (eavail || res)
+                       break;
+-              }
+               if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+                       timed_out = 1;
+@@ -1952,6 +1950,15 @@ fetch_events:
+       }
+ send_events:
++      if (fatal_signal_pending(current)) {
++              /*
++               * Always short-circuit for fatal signals to allow
++               * threads to make a timely exit without the chance of
++               * finding more events available and fetching
++               * repeatedly.
++               */
++              res = -EINTR;
++      }
+       /*
+        * Try to transfer events to user space. In case we get 0 events and
+        * there's still timeout left over, we go trying again in search of
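
For reference, the core of the fix is the classic lost-wakeup discipline: the
final "anything pending?" check is made under the same lock the waker takes,
so a wakeup can no longer slip in between the check and the sleep.  A minimal
user-space sketch of that discipline using POSIX threads (the names
events_available/producer are illustrative, not the kernel's API):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool events_available;

/* Plays the role of ep_poll_callback(): makes an event available and
 * wakes the waiter while holding the same lock the waiter checks under. */
static void *producer(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    events_available = true;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, producer, NULL);

    pthread_mutex_lock(&lock);
    /* Final check under the lock: either the event is already visible
     * here, or the producer has not signalled yet and the wait below
     * cannot miss it. */
    while (!events_available)
        pthread_cond_wait(&cond, &lock);
    pthread_mutex_unlock(&lock);

    puts("event observed, no lost wakeup");
    pthread_join(t, NULL);
    return 0;
}
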
diff --git a/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch b/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch
new file mode 100644 (file)
index 0000000..f8065ec
--- /dev/null
@@ -0,0 +1,125 @@
+From 289caf5d8f6c61c6d2b7fd752a7f483cd153f182 Mon Sep 17 00:00:00 2001
+From: Soheil Hassas Yeganeh <soheil@google.com>
+Date: Fri, 18 Dec 2020 14:01:44 -0800
+Subject: epoll: check for events when removing a timed out thread from the wait queue
+
+From: Soheil Hassas Yeganeh <soheil@google.com>
+
+commit 289caf5d8f6c61c6d2b7fd752a7f483cd153f182 upstream.
+
+Patch series "simplify ep_poll".
+
+This patch series is a followup based on the suggestions and feedback by
+Linus:
+https://lkml.kernel.org/r/CAHk-=wizk=OxUyQPbO8MS41w2Pag1kniUV5WdD5qWL-gq1kjDA@mail.gmail.com
+
+The first patch in the series is a fix for the epoll race in presence of
+timeouts, so that it can be cleanly backported to all affected stable
+kernels.
+
+The rest of the patch series simplifies the ep_poll() implementation.  Some
+of these simplifications result in minor performance enhancements as well.
+We have kept these changes under self tests and internal benchmarks for a
+few days, and there are minor (1-2%) performance enhancements as a result.
+
+This patch (of 8):
+
+After abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2)
+timeout"), we break out of the ep_poll loop upon timeout, without checking
+whether there is any new events available.  Prior to that patch-series we
+always called ep_events_available() after exiting the loop.
+
+This can cause races and missed wakeups.  For example, consider the
+following scenario reported by Guantao Liu:
+
+Suppose we have an eventfd added using EPOLLET to an epollfd.
+
+Thread 1: Sleeps for just below 5ms and then writes to an eventfd.
+Thread 2: Calls epoll_wait with a timeout of 5 ms. If it sees an
+          event of the eventfd, it will write back on that fd.
+Thread 3: Calls epoll_wait with a negative timeout.
+
+Prior to abc610e01c66, it is guaranteed that Thread 3 will wake up either
+by Thread 1 or Thread 2.  After abc610e01c66, Thread 3 can be blocked
+indefinitely if Thread 2 sees a timeout right before the write to the
+eventfd by Thread 1.  Thread 2 will be woken up from
+schedule_hrtimeout_range and, with eavail 0, it will not call
+ep_send_events().
+
+To fix this issue:
+1) Simplify the timed_out case as suggested by Linus.
+2) While holding the lock, recheck whether the thread was woken up
+   after its timeout expired.
+
+Note that (2) is different from Linus' original suggestion: It does not set
+"eavail = ep_events_available(ep)" to avoid unnecessary contention (when
+there are too many timed-out threads and a small number of events), as
+well as races mentioned in the discussion thread.
+
+This is the first patch in the series so that the backport to stable
+releases is straightforward.
+
+Link: https://lkml.kernel.org/r/20201106231635.3528496-1-soheil.kdev@gmail.com
+Link: https://lkml.kernel.org/r/CAHk-=wizk=OxUyQPbO8MS41w2Pag1kniUV5WdD5qWL-gq1kjDA@mail.gmail.com
+Link: https://lkml.kernel.org/r/20201106231635.3528496-2-soheil.kdev@gmail.com
+Fixes: abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2) timeout")
+Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
+Tested-by: Guantao Liu <guantaol@google.com>
+Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
+Reported-by: Guantao Liu <guantaol@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: Khazhismel Kumykov <khazhy@google.com>
+Reviewed-by: Davidlohr Bueso <dbueso@suse.de>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Acked-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
+Signed-off-by: Rishabh Bhatnagar <risbhat@amazon.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/eventpoll.c |   27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+--- a/fs/eventpoll.c
++++ b/fs/eventpoll.c
+@@ -1928,23 +1928,30 @@ fetch_events:
+               }
+               write_unlock_irq(&ep->lock);
+-              if (eavail || res)
+-                      break;
+-
+-              if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
+-                      timed_out = 1;
+-                      break;
+-              }
+-
+-              /* We were woken up, thus go and try to harvest some events */
++              if (!eavail && !res)
++                      timed_out = !schedule_hrtimeout_range(to, slack,
++                                                            HRTIMER_MODE_ABS);
++
++              /*
++               * We were woken up, thus go and try to harvest some events.
++               * If timed out and still on the wait queue, recheck eavail
++               * carefully under lock, below.
++               */
+               eavail = 1;
+-
+       } while (0);
+       __set_current_state(TASK_RUNNING);
+       if (!list_empty_careful(&wait.entry)) {
+               write_lock_irq(&ep->lock);
++              /*
++               * If the thread timed out and is not on the wait queue, it
++               * means that the thread was woken up after its timeout expired
++               * before it could reacquire the lock. Thus, when wait.entry is
++               * empty, it needs to harvest events.
++               */
++              if (timed_out)
++                      eavail = list_empty(&wait.entry);
+               __remove_wait_queue(&ep->wq, &wait);
+               write_unlock_irq(&ep->lock);
+       }
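
The three-thread scenario described above translates roughly into the
user-space sketch below (an eventfd registered with EPOLLET, one writer
racing a 5 ms epoll_wait(), and a third waiter with no timeout).  It
illustrates the timing only and is not a deterministic reproducer of the
pre-fix hang.

#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <pthread.h>
#include <unistd.h>
#include <stdint.h>
#include <stdio.h>

static int epfd, efd;

/* "Thread 1": writes to the eventfd just under thread 2's 5 ms timeout. */
static void *writer(void *arg)
{
    uint64_t one = 1;

    (void)arg;
    usleep(4900);
    if (write(efd, &one, sizeof(one)) < 0)
        perror("eventfd write");
    return NULL;
}

/* "Thread 2": epoll_wait() with a 5 ms timeout; if it sees the event it
 * consumes it and writes back, re-arming the edge for thread 3. */
static void *short_waiter(void *arg)
{
    struct epoll_event ev;
    uint64_t v;

    (void)arg;
    if (epoll_wait(epfd, &ev, 1, 5) > 0 &&
        read(efd, &v, sizeof(v)) == sizeof(v)) {
        if (write(efd, &v, sizeof(v)) < 0)
            perror("eventfd write back");
    }
    return NULL;
}

int main(void)
{
    struct epoll_event ev = { .events = EPOLLIN | EPOLLET };
    struct epoll_event out;
    pthread_t t1, t2;

    epfd = epoll_create1(0);
    efd  = eventfd(0, EFD_NONBLOCK);
    ev.data.fd = efd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);

    pthread_create(&t1, NULL, writer, NULL);
    pthread_create(&t2, NULL, short_waiter, NULL);

    /* "Thread 3": waits with a negative timeout.  Before the fix it could
     * block indefinitely when thread 2 timed out right as the event was
     * queued; after the fix it is guaranteed to wake up. */
    printf("thread 3: epoll_wait returned %d\n",
           epoll_wait(epfd, &out, 1, -1));

    pthread_join(t1, NULL);
    pthread_join(t2, NULL);
    return 0;
}
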
diff --git a/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch b/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch
new file mode 100644 (file)
index 0000000..ff628ad
--- /dev/null
@@ -0,0 +1,67 @@
+From 1e866afd4bcdd01a70a5eddb4371158d3035ce03 Mon Sep 17 00:00:00 2001
+From: Keith Busch <kbusch@kernel.org>
+Date: Thu, 22 Sep 2022 08:13:47 -0700
+Subject: nvme: ensure subsystem reset is single threaded
+
+From: Keith Busch <kbusch@kernel.org>
+
+commit 1e866afd4bcdd01a70a5eddb4371158d3035ce03 upstream.
+
+The subsystem reset writes to a register, so we have to ensure the
+device state is capable of handling that; otherwise the driver may access
+unmapped registers. Use the state machine to ensure the subsystem reset
+doesn't try to write registers on a device already undergoing this type
+of reset.
+
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=214771
+Signed-off-by: Keith Busch <kbusch@kernel.org>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/nvme/host/nvme.h |   16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+--- a/drivers/nvme/host/nvme.h
++++ b/drivers/nvme/host/nvme.h
+@@ -422,11 +422,23 @@ static inline void nvme_fault_inject_fin
+ static inline void nvme_should_fail(struct request *req) {}
+ #endif
++bool nvme_wait_reset(struct nvme_ctrl *ctrl);
++int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
++
+ static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl)
+ {
++      int ret;
++
+       if (!ctrl->subsystem)
+               return -ENOTTY;
+-      return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
++      if (!nvme_wait_reset(ctrl))
++              return -EBUSY;
++
++      ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65);
++      if (ret)
++              return ret;
++
++      return nvme_try_sched_reset(ctrl);
+ }
+ /*
+@@ -473,7 +485,6 @@ void nvme_cancel_tagset(struct nvme_ctrl
+ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
+ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
+               enum nvme_ctrl_state new_state);
+-bool nvme_wait_reset(struct nvme_ctrl *ctrl);
+ int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
+@@ -525,7 +536,6 @@ int nvme_set_queue_count(struct nvme_ctr
+ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
+ int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
+-int nvme_try_sched_reset(struct nvme_ctrl *ctrl);
+ int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
+ int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp,
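
The shape of the fix (atomically claim a state transition before touching
the register, returning -EBUSY if another reset already owns it) can be
sketched in plain C11 as follows.  The two states are simplified stand-ins
for the driver's controller state machine; nvme_wait_reset() and
nvme_try_sched_reset() do considerably more than this.

#include <stdatomic.h>
#include <errno.h>
#include <stdio.h>

enum ctrl_state { CTRL_LIVE, CTRL_RESETTING };

static _Atomic enum ctrl_state state = CTRL_LIVE;

static int subsystem_reset(void)
{
    enum ctrl_state expected = CTRL_LIVE;

    /* Claim the LIVE -> RESETTING transition; only one caller can win. */
    if (!atomic_compare_exchange_strong(&state, &expected, CTRL_RESETTING))
        return -EBUSY;

    /* Only the winner reaches this point, so the "register write" below
     * can never race with another reset in flight. */
    printf("writing NSSR\n");

    atomic_store(&state, CTRL_LIVE);    /* reset done, back to LIVE */
    return 0;
}

int main(void)
{
    printf("reset #1 -> %d\n", subsystem_reset());   /* 0: transition won */

    atomic_store(&state, CTRL_RESETTING);            /* simulate a reset in flight */
    printf("reset #2 -> %d\n", subsystem_reset());   /* -EBUSY: already resetting */
    return 0;
}
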
diff --git a/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch b/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch
new file mode 100644 (file)
index 0000000..1b587aa
--- /dev/null
@@ -0,0 +1,41 @@
+From 23e085b2dead13b51fe86d27069895b740f749c0 Mon Sep 17 00:00:00 2001
+From: Keith Busch <kbusch@kernel.org>
+Date: Thu, 22 Sep 2022 07:54:06 -0700
+Subject: nvme: restrict management ioctls to admin
+
+From: Keith Busch <kbusch@kernel.org>
+
+commit 23e085b2dead13b51fe86d27069895b740f749c0 upstream.
+
+The passthrough commands already have this restriction, but the other
+operations do not. Require the same capabilities for all users as all of
+these operations, which include resets and rescans, can be disruptive.
+
+Signed-off-by: Keith Busch <kbusch@kernel.org>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+Signed-off-by: Ovidiu Panait <ovidiu.panait@windriver.com>
+---
+ drivers/nvme/host/core.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/nvme/host/core.c
++++ b/drivers/nvme/host/core.c
+@@ -3027,11 +3027,17 @@ static long nvme_dev_ioctl(struct file *
+       case NVME_IOCTL_IO_CMD:
+               return nvme_dev_user_cmd(ctrl, argp);
+       case NVME_IOCTL_RESET:
++              if (!capable(CAP_SYS_ADMIN))
++                      return -EACCES;
+               dev_warn(ctrl->device, "resetting controller\n");
+               return nvme_reset_ctrl_sync(ctrl);
+       case NVME_IOCTL_SUBSYS_RESET:
++              if (!capable(CAP_SYS_ADMIN))
++                      return -EACCES;
+               return nvme_reset_subsystem(ctrl);
+       case NVME_IOCTL_RESCAN:
++              if (!capable(CAP_SYS_ADMIN))
++                      return -EACCES;
+               nvme_queue_scan(ctrl);
+               return 0;
+       default:
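
With this change applied, an unprivileged process should get EACCES back
from the management ioctls instead of being able to reset or rescan the
controller.  A small user-space check along these lines can confirm that
(the /dev/nvme0 path assumes a first NVMe controller is present):

#include <linux/nvme_ioctl.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    int fd = open("/dev/nvme0", O_RDONLY);

    if (fd < 0) {
        perror("open /dev/nvme0");
        return 1;
    }

    /* NVME_IOCTL_RESCAN is the least disruptive of the three gated ioctls. */
    if (ioctl(fd, NVME_IOCTL_RESCAN) < 0)
        printf("NVME_IOCTL_RESCAN: %s (EACCES expected without CAP_SYS_ADMIN)\n",
               strerror(errno));
    else
        printf("NVME_IOCTL_RESCAN succeeded (caller has CAP_SYS_ADMIN)\n");

    return 0;
}
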
diff --git a/queue-5.4/series b/queue-5.4/series
index 4fc04ed9e270692f3cfa9564f1bcf15b7fa35e3d..2bcb95d87ec4cb8cfdb7444d4a777e62b2276c78 100644 (file)
--- a/queue-5.4/series
@@ -145,3 +145,8 @@ selftests-net-add-delete-nexthop-route-warning-test.patch
 selftests-net-fix-nexthop-warning-cleanup-double-ip-.patch
 ipv4-handle-attempt-to-delete-multipath-route-when-f.patch
 ipv4-fix-route-deletion-when-nexthop-info-is-not-spe.patch
+tracing-ring-buffer-have-polling-block-on-watermark.patch
+epoll-call-final-ep_events_available-check-under-the-lock.patch
+epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch
+nvme-restrict-management-ioctls-to-admin.patch
+nvme-ensure-subsystem-reset-is-single-threaded.patch
diff --git a/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch b/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch
new file mode 100644 (file)
index 0000000..ae4f6d5
--- /dev/null
@@ -0,0 +1,186 @@
+From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (Google)" <rostedt@goodmis.org>
+Date: Thu, 20 Oct 2022 23:14:27 -0400
+Subject: tracing/ring-buffer: Have polling block on watermark
+
+From: Steven Rostedt (Google) <rostedt@goodmis.org>
+
+commit 42fb0a1e84ff525ebe560e2baf9451ab69127e2b upstream.
+
+Currently the way polling works on the ring buffer is broken. It will
+return immediately if there's any data in the ring buffer whereas a read
+will block until the watermark (defined by the tracefs buffer_percent file)
+is hit.
+
+That is, a select() or poll() will return as if there's data available,
+but then the following read will block. This is broken for the way
+select()s and poll()s are supposed to work.
+
+Have the polling on the ring buffer also block the same way reads and
+splice does on the ring buffer.
+
+Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home
+
+Cc: Linux Trace Kernel <linux-trace-kernel@vger.kernel.org>
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Primiano Tucci <primiano@google.com>
+Cc: stable@vger.kernel.org
+Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/ring_buffer.h |    2 -
+ kernel/trace/ring_buffer.c  |   54 ++++++++++++++++++++++++++++----------------
+ kernel/trace/trace.c        |    2 -
+ 3 files changed, 37 insertions(+), 21 deletions(-)
+
+--- a/include/linux/ring_buffer.h
++++ b/include/linux/ring_buffer.h
+@@ -99,7 +99,7 @@ __ring_buffer_alloc(unsigned long size,
+ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full);
+ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+-                        struct file *filp, poll_table *poll_table);
++                        struct file *filp, poll_table *poll_table, int full);
+ #define RING_BUFFER_ALL_CPUS -1
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -566,6 +566,21 @@ size_t ring_buffer_nr_dirty_pages(struct
+       return cnt - read;
+ }
++static __always_inline bool full_hit(struct ring_buffer *buffer, int cpu, int full)
++{
++      struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
++      size_t nr_pages;
++      size_t dirty;
++
++      nr_pages = cpu_buffer->nr_pages;
++      if (!nr_pages || !full)
++              return true;
++
++      dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
++
++      return (dirty * 100) > (full * nr_pages);
++}
++
+ /*
+  * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
+  *
+@@ -661,22 +676,20 @@ int ring_buffer_wait(struct ring_buffer
+                   !ring_buffer_empty_cpu(buffer, cpu)) {
+                       unsigned long flags;
+                       bool pagebusy;
+-                      size_t nr_pages;
+-                      size_t dirty;
++                      bool done;
+                       if (!full)
+                               break;
+                       raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+                       pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
+-                      nr_pages = cpu_buffer->nr_pages;
+-                      dirty = ring_buffer_nr_dirty_pages(buffer, cpu);
++                      done = !pagebusy && full_hit(buffer, cpu, full);
++
+                       if (!cpu_buffer->shortest_full ||
+                           cpu_buffer->shortest_full > full)
+                               cpu_buffer->shortest_full = full;
+                       raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+-                      if (!pagebusy &&
+-                          (!nr_pages || (dirty * 100) > full * nr_pages))
++                      if (done)
+                               break;
+               }
+@@ -697,6 +710,7 @@ int ring_buffer_wait(struct ring_buffer
+  * @cpu: the cpu buffer to wait on
+  * @filp: the file descriptor
+  * @poll_table: The poll descriptor
++ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS
+  *
+  * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+  * as data is added to any of the @buffer's cpu buffers. Otherwise
+@@ -706,14 +720,14 @@ int ring_buffer_wait(struct ring_buffer
+  * zero otherwise.
+  */
+ __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+-                        struct file *filp, poll_table *poll_table)
++                        struct file *filp, poll_table *poll_table, int full)
+ {
+       struct ring_buffer_per_cpu *cpu_buffer;
+       struct rb_irq_work *work;
+-      if (cpu == RING_BUFFER_ALL_CPUS)
++      if (cpu == RING_BUFFER_ALL_CPUS) {
+               work = &buffer->irq_work;
+-      else {
++      } else {
+               if (!cpumask_test_cpu(cpu, buffer->cpumask))
+                       return -EINVAL;
+@@ -721,8 +735,14 @@ __poll_t ring_buffer_poll_wait(struct ri
+               work = &cpu_buffer->irq_work;
+       }
+-      poll_wait(filp, &work->waiters, poll_table);
+-      work->waiters_pending = true;
++      if (full) {
++              poll_wait(filp, &work->full_waiters, poll_table);
++              work->full_waiters_pending = true;
++      } else {
++              poll_wait(filp, &work->waiters, poll_table);
++              work->waiters_pending = true;
++      }
++
+       /*
+        * There's a tight race between setting the waiters_pending and
+        * checking if the ring buffer is empty.  Once the waiters_pending bit
+@@ -738,6 +758,9 @@ __poll_t ring_buffer_poll_wait(struct ri
+        */
+       smp_mb();
++      if (full)
++              return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0;
++
+       if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+           (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+               return EPOLLIN | EPOLLRDNORM;
+@@ -2640,10 +2663,6 @@ static void rb_commit(struct ring_buffer
+ static __always_inline void
+ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+ {
+-      size_t nr_pages;
+-      size_t dirty;
+-      size_t full;
+-
+       if (buffer->irq_work.waiters_pending) {
+               buffer->irq_work.waiters_pending = false;
+               /* irq_work_queue() supplies it's own memory barriers */
+@@ -2667,10 +2686,7 @@ rb_wakeups(struct ring_buffer *buffer, s
+       cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched);
+-      full = cpu_buffer->shortest_full;
+-      nr_pages = cpu_buffer->nr_pages;
+-      dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu);
+-      if (full && nr_pages && (dirty * 100) <= full * nr_pages)
++      if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full))
+               return;
+       cpu_buffer->irq_work.wakeup_full = true;
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -5993,7 +5993,7 @@ trace_poll(struct trace_iterator *iter,
+               return EPOLLIN | EPOLLRDNORM;
+       else
+               return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
+-                                           filp, poll_table);
++                                           filp, poll_table, iter->tr->buffer_percent);
+ }
+ static __poll_t
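
In practice, poll() on a per-cpu trace_pipe_raw file now honours the
buffer_percent watermark the same way a blocking read does: with the
full_hit() check added above, a watermark of 50 on a 100-page per-cpu
buffer only reports the fd readable once more than 50 pages are dirty,
rather than as soon as a single event lands.  A rough user-space
illustration follows (the tracefs mount point is assumed to be
/sys/kernel/tracing; older setups use /sys/kernel/debug/tracing):

#include <poll.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    int fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
                  O_RDONLY | O_NONBLOCK);
    struct pollfd pfd;
    int n;

    if (fd < 0) {
        perror("open trace_pipe_raw");
        return 1;
    }

    pfd.fd = fd;
    pfd.events = POLLIN;

    /* Before this patch: returns almost immediately once any event is
     * traced.  After it: blocks until the watermark configured in
     * /sys/kernel/tracing/buffer_percent is reached. */
    n = poll(&pfd, 1, 10000 /* ms */);
    printf("poll returned %d, revents=0x%x\n", n,
           (unsigned)(n > 0 ? pfd.revents : 0));

    close(fd);
    return 0;
}
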