From dd9f37f4a405dfdc987db327b9f90d040e167319 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 4 Dec 2022 16:51:07 +0100 Subject: [PATCH] 5.4-stable patches added patches: epoll-call-final-ep_events_available-check-under-the-lock.patch epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch nvme-ensure-subsystem-reset-is-single-threaded.patch nvme-restrict-management-ioctls-to-admin.patch tracing-ring-buffer-have-polling-block-on-watermark.patch --- ...vents_available-check-under-the-lock.patch | 124 ++++++++++++ ...timed-out-thread-from-the-wait-queue.patch | 125 ++++++++++++ ...e-subsystem-reset-is-single-threaded.patch | 67 +++++++ ...-restrict-management-ioctls-to-admin.patch | 41 ++++ queue-5.4/series | 5 + ...ffer-have-polling-block-on-watermark.patch | 186 ++++++++++++++++++ 6 files changed, 548 insertions(+) create mode 100644 queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch create mode 100644 queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch create mode 100644 queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch create mode 100644 queue-5.4/nvme-restrict-management-ioctls-to-admin.patch create mode 100644 queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch diff --git a/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch b/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch new file mode 100644 index 00000000000..275be3dc6f8 --- /dev/null +++ b/queue-5.4/epoll-call-final-ep_events_available-check-under-the-lock.patch @@ -0,0 +1,124 @@ +From 65759097d804d2a9ad2b687db436319704ba7019 Mon Sep 17 00:00:00 2001 +From: Roman Penyaev +Date: Wed, 13 May 2020 17:50:38 -0700 +Subject: epoll: call final ep_events_available() check under the lock + +From: Roman Penyaev + +commit 65759097d804d2a9ad2b687db436319704ba7019 upstream. + +There is a possible race when ep_scan_ready_list() leaves ->rdllist and +->obflist empty for a short period of time although some events are +pending. It is quite likely that ep_events_available() observes empty +lists and goes to sleep. + +Since commit 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of +nested epoll") we are conservative in wakeups (there is only one place +for wakeup and this is ep_poll_callback()), thus ep_events_available() +must always observe correct state of two lists. + +The easiest and correct way is to do the final check under the lock. +This does not impact the performance, since lock is taken anyway for +adding a wait entry to the wait queue. + +The discussion of the problem can be found here: + + https://lore.kernel.org/linux-fsdevel/a2f22c3c-c25a-4bda-8339-a7bdaf17849e@akamai.com/ + +In this patch barrierless __set_current_state() is used. This is safe +since waitqueue_active() is called under the same lock on wakeup side. + +Short-circuit for fatal signals (i.e. fatal_signal_pending() check) is +moved to the line just before actual events harvesting routine. This is +fully compliant to what is said in the comment of the patch where the +actual fatal_signal_pending() check was added: c257a340ede0 ("fs, epoll: +short circuit fetching events if thread has been killed"). 
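+
+As an illustration of the rule being applied here, a minimal userspace
+sketch (pthreads, not the kernel code): the waiter's final "anything
+available?" check must happen under the same lock the waker takes, so
+that an event published between an unlocked check and the sleep cannot
+be missed.  Build with -pthread:
+
+  #include <pthread.h>
+  #include <stdbool.h>
+
+  static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
+  static bool events_available;   /* stands in for ->rdllist/->ovflist */
+
+  static void consumer_wait(void)
+  {
+          pthread_mutex_lock(&lock);
+          /* Final check under the lock, as ep_poll() now does. */
+          while (!events_available)
+                  pthread_cond_wait(&cond, &lock);
+          events_available = false;      /* "harvest" the event */
+          pthread_mutex_unlock(&lock);
+  }
+
+  static void *producer(void *arg)
+  {
+          (void)arg;
+          pthread_mutex_lock(&lock);
+          events_available = true;       /* publish under the same lock... */
+          pthread_cond_signal(&cond);    /* ...so the wakeup cannot be lost */
+          pthread_mutex_unlock(&lock);
+          return NULL;
+  }
+
+  int main(void)
+  {
+          pthread_t t;
+
+          pthread_create(&t, NULL, producer, NULL);
+          consumer_wait();               /* returns once the event is seen */
+          pthread_join(t, NULL);
+          return 0;
+  }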
+ +Fixes: 339ddb53d373 ("fs/epoll: remove unnecessary wakeups of nested epoll") +Reported-by: Jason Baron +Reported-by: Randy Dunlap +Signed-off-by: Roman Penyaev +Signed-off-by: Andrew Morton +Reviewed-by: Jason Baron +Cc: Khazhismel Kumykov +Cc: Alexander Viro +Cc: +Link: http://lkml.kernel.org/r/20200505145609.1865152-1-rpenyaev@suse.de +Signed-off-by: Linus Torvalds +Acked-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Rishabh Bhatnagar +Signed-off-by: Greg Kroah-Hartman +--- + fs/eventpoll.c | 47 +++++++++++++++++++++++++++-------------------- + 1 file changed, 27 insertions(+), 20 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1905,33 +1905,31 @@ fetch_events: + init_wait(&wait); + wait.func = ep_autoremove_wake_function; + write_lock_irq(&ep->lock); +- __add_wait_queue_exclusive(&ep->wq, &wait); +- write_unlock_irq(&ep->lock); +- + /* +- * We don't want to sleep if the ep_poll_callback() sends us +- * a wakeup in between. That's why we set the task state +- * to TASK_INTERRUPTIBLE before doing the checks. ++ * Barrierless variant, waitqueue_active() is called under ++ * the same lock on wakeup ep_poll_callback() side, so it ++ * is safe to avoid an explicit barrier. + */ +- set_current_state(TASK_INTERRUPTIBLE); ++ __set_current_state(TASK_INTERRUPTIBLE); ++ + /* +- * Always short-circuit for fatal signals to allow +- * threads to make a timely exit without the chance of +- * finding more events available and fetching +- * repeatedly. ++ * Do the final check under the lock. ep_scan_ready_list() ++ * plays with two lists (->rdllist and ->ovflist) and there ++ * is always a race when both lists are empty for short ++ * period of time although events are pending, so lock is ++ * important. + */ +- if (fatal_signal_pending(current)) { +- res = -EINTR; +- break; ++ eavail = ep_events_available(ep); ++ if (!eavail) { ++ if (signal_pending(current)) ++ res = -EINTR; ++ else ++ __add_wait_queue_exclusive(&ep->wq, &wait); + } ++ write_unlock_irq(&ep->lock); + +- eavail = ep_events_available(ep); +- if (eavail) +- break; +- if (signal_pending(current)) { +- res = -EINTR; ++ if (eavail || res) + break; +- } + + if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { + timed_out = 1; +@@ -1952,6 +1950,15 @@ fetch_events: + } + + send_events: ++ if (fatal_signal_pending(current)) { ++ /* ++ * Always short-circuit for fatal signals to allow ++ * threads to make a timely exit without the chance of ++ * finding more events available and fetching ++ * repeatedly. ++ */ ++ res = -EINTR; ++ } + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of diff --git a/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch b/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch new file mode 100644 index 00000000000..f8065eca3d9 --- /dev/null +++ b/queue-5.4/epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch @@ -0,0 +1,125 @@ +From 289caf5d8f6c61c6d2b7fd752a7f483cd153f182 Mon Sep 17 00:00:00 2001 +From: Soheil Hassas Yeganeh +Date: Fri, 18 Dec 2020 14:01:44 -0800 +Subject: epoll: check for events when removing a timed out thread from the wait queue + +From: Soheil Hassas Yeganeh + +commit 289caf5d8f6c61c6d2b7fd752a7f483cd153f182 upstream. + +Patch series "simplify ep_poll". 
+
+This patch series is a followup based on the suggestions and feedback by
+Linus:
+https://lkml.kernel.org/r/CAHk-=wizk=OxUyQPbO8MS41w2Pag1kniUV5WdD5qWL-gq1kjDA@mail.gmail.com
+
+The first patch in the series is a fix for the epoll race in presence of
+timeouts, so that it can be cleanly backported to all affected stable
+kernels.
+
+The rest of the patch series simplifies the ep_poll() implementation. Some
+of these simplifications result in minor performance enhancements as well.
+We have kept these changes under self tests and internal benchmarks for a
+few days, and there are minor (1-2%) performance enhancements as a result.
+
+This patch (of 8):
+
+After abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2)
+timeout"), we break out of the ep_poll loop upon timeout, without checking
+whether there are any new events available. Prior to that patch series we
+always called ep_events_available() after exiting the loop.
+
+This can cause races and missed wakeups. For example, consider the
+following scenario reported by Guantao Liu:
+
+Suppose we have an eventfd added using EPOLLET to an epollfd.
+
+Thread 1: Sleeps for just below 5 ms and then writes to an eventfd.
+Thread 2: Calls epoll_wait with a timeout of 5 ms. If it sees an
+          event of the eventfd, it will write back on that fd.
+Thread 3: Calls epoll_wait with a negative timeout.
+
+Prior to abc610e01c66, it is guaranteed that Thread 3 will be woken up
+either by Thread 1 or by Thread 2. After abc610e01c66, Thread 3 can be
+blocked indefinitely if Thread 2 sees a timeout right before the write to
+the eventfd by Thread 1. Thread 2 will be woken up from
+schedule_hrtimeout_range and, with eavail 0, it will not call
+ep_send_events().
+
+To fix this issue:
+1) Simplify the timed_out case as suggested by Linus.
+2) While holding the lock, recheck whether the thread was woken up
+   after its timeout expired.
+
+Note that (2) is different from Linus' original suggestion: it does not set
+"eavail = ep_events_available(ep)", to avoid unnecessary contention (when
+there are too many timed-out threads and a small number of events), as
+well as races mentioned in the discussion thread.
+
+This is the first patch in the series so that the backport to stable
+releases is straightforward.
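+
+For reference, the scenario above can be written as a small userspace
+program (a rough, timing-dependent sketch: on kernels without this fix
+the final epoll_wait() can block forever, with the fix it always
+returns).  Build with -pthread:
+
+  #include <pthread.h>
+  #include <stdint.h>
+  #include <stdio.h>
+  #include <unistd.h>
+  #include <sys/epoll.h>
+  #include <sys/eventfd.h>
+
+  static int epfd, efd;
+
+  static void *writer(void *arg)           /* Thread 1 */
+  {
+          uint64_t v = 1;
+
+          (void)arg;
+          usleep(4900);                    /* just below the 5 ms timeout */
+          write(efd, &v, sizeof(v));
+          return NULL;
+  }
+
+  static void *short_waiter(void *arg)     /* Thread 2 */
+  {
+          struct epoll_event ev;
+          uint64_t v = 1;
+
+          (void)arg;
+          if (epoll_wait(epfd, &ev, 1, 5) > 0)
+                  write(efd, &v, sizeof(v));  /* write back on the fd */
+          return NULL;
+  }
+
+  int main(void)
+  {
+          struct epoll_event ev = { .events = EPOLLIN | EPOLLET };
+          pthread_t t1, t2;
+
+          epfd = epoll_create1(0);
+          efd = eventfd(0, EFD_NONBLOCK);
+          ev.data.fd = efd;
+          epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
+
+          pthread_create(&t1, NULL, writer, NULL);
+          pthread_create(&t2, NULL, short_waiter, NULL);
+
+          /* Thread 3: must be woken up by Thread 1 or Thread 2. */
+          epoll_wait(epfd, &ev, 1, -1);
+          puts("woken up");
+
+          pthread_join(t1, NULL);
+          pthread_join(t2, NULL);
+          return 0;
+  }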
+ +Link: https://lkml.kernel.org/r/20201106231635.3528496-1-soheil.kdev@gmail.com +Link: https://lkml.kernel.org/r/CAHk-=wizk=OxUyQPbO8MS41w2Pag1kniUV5WdD5qWL-gq1kjDA@mail.gmail.com +Link: https://lkml.kernel.org/r/20201106231635.3528496-2-soheil.kdev@gmail.com +Fixes: abc610e01c66 ("fs/epoll: avoid barrier after an epoll_wait(2) timeout") +Signed-off-by: Soheil Hassas Yeganeh +Tested-by: Guantao Liu +Suggested-by: Linus Torvalds +Reported-by: Guantao Liu +Reviewed-by: Eric Dumazet +Reviewed-by: Willem de Bruijn +Reviewed-by: Khazhismel Kumykov +Reviewed-by: Davidlohr Bueso +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Acked-by: Thadeu Lima de Souza Cascardo +Signed-off-by: Rishabh Bhatnagar +Signed-off-by: Greg Kroah-Hartman +--- + fs/eventpoll.c | 27 +++++++++++++++++---------- + 1 file changed, 17 insertions(+), 10 deletions(-) + +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -1928,23 +1928,30 @@ fetch_events: + } + write_unlock_irq(&ep->lock); + +- if (eavail || res) +- break; +- +- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) { +- timed_out = 1; +- break; +- } +- +- /* We were woken up, thus go and try to harvest some events */ ++ if (!eavail && !res) ++ timed_out = !schedule_hrtimeout_range(to, slack, ++ HRTIMER_MODE_ABS); ++ ++ /* ++ * We were woken up, thus go and try to harvest some events. ++ * If timed out and still on the wait queue, recheck eavail ++ * carefully under lock, below. ++ */ + eavail = 1; +- + } while (0); + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait.entry)) { + write_lock_irq(&ep->lock); ++ /* ++ * If the thread timed out and is not on the wait queue, it ++ * means that the thread was woken up after its timeout expired ++ * before it could reacquire the lock. Thus, when wait.entry is ++ * empty, it needs to harvest events. ++ */ ++ if (timed_out) ++ eavail = list_empty(&wait.entry); + __remove_wait_queue(&ep->wq, &wait); + write_unlock_irq(&ep->lock); + } diff --git a/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch b/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch new file mode 100644 index 00000000000..ff628adc8c5 --- /dev/null +++ b/queue-5.4/nvme-ensure-subsystem-reset-is-single-threaded.patch @@ -0,0 +1,67 @@ +From 1e866afd4bcdd01a70a5eddb4371158d3035ce03 Mon Sep 17 00:00:00 2001 +From: Keith Busch +Date: Thu, 22 Sep 2022 08:13:47 -0700 +Subject: nvme: ensure subsystem reset is single threaded + +From: Keith Busch + +commit 1e866afd4bcdd01a70a5eddb4371158d3035ce03 upstream. + +The subsystem reset writes to a register, so we have to ensure the +device state is capable of handling that otherwise the driver may access +unmapped registers. Use the state machine to ensure the subsystem reset +doesn't try to write registers on a device already undergoing this type +of reset. 
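+
+The idea can be pictured with a generic sketch (illustration only, not
+the actual nvme code): a single atomic state transition decides which
+caller owns the reset, and every other caller backs off with -EBUSY
+instead of touching the hardware:
+
+  #include <errno.h>
+  #include <stdatomic.h>
+  #include <stdio.h>
+
+  enum ctrl_state { CTRL_LIVE, CTRL_RESETTING };
+
+  static _Atomic enum ctrl_state state = CTRL_LIVE;
+
+  static int subsystem_reset(void)
+  {
+          enum ctrl_state expected = CTRL_LIVE;
+
+          /* Only one caller wins the LIVE -> RESETTING transition. */
+          if (!atomic_compare_exchange_strong(&state, &expected,
+                                              CTRL_RESETTING))
+                  return -EBUSY;         /* a reset is already in flight */
+
+          /* Safe to touch the reset register: we own RESETTING. */
+          printf("writing reset register\n");
+
+          /* ...tear down and re-enable, then return to LIVE. */
+          atomic_store(&state, CTRL_LIVE);
+          return 0;
+  }
+
+  int main(void)
+  {
+          /* A concurrent second caller would get -EBUSY instead. */
+          return subsystem_reset();
+  }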
+ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=214771 +Signed-off-by: Keith Busch +Signed-off-by: Christoph Hellwig +Signed-off-by: Ovidiu Panait +Signed-off-by: Greg Kroah-Hartman +--- + drivers/nvme/host/nvme.h | 16 +++++++++++++--- + 1 file changed, 13 insertions(+), 3 deletions(-) + +--- a/drivers/nvme/host/nvme.h ++++ b/drivers/nvme/host/nvme.h +@@ -422,11 +422,23 @@ static inline void nvme_fault_inject_fin + static inline void nvme_should_fail(struct request *req) {} + #endif + ++bool nvme_wait_reset(struct nvme_ctrl *ctrl); ++int nvme_try_sched_reset(struct nvme_ctrl *ctrl); ++ + static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) + { ++ int ret; ++ + if (!ctrl->subsystem) + return -ENOTTY; +- return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); ++ if (!nvme_wait_reset(ctrl)) ++ return -EBUSY; ++ ++ ret = ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); ++ if (ret) ++ return ret; ++ ++ return nvme_try_sched_reset(ctrl); + } + + /* +@@ -473,7 +485,6 @@ void nvme_cancel_tagset(struct nvme_ctrl + void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl); + bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, + enum nvme_ctrl_state new_state); +-bool nvme_wait_reset(struct nvme_ctrl *ctrl); + int nvme_disable_ctrl(struct nvme_ctrl *ctrl); + int nvme_enable_ctrl(struct nvme_ctrl *ctrl); + int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +@@ -525,7 +536,6 @@ int nvme_set_queue_count(struct nvme_ctr + void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); + int nvme_reset_ctrl(struct nvme_ctrl *ctrl); + int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); +-int nvme_try_sched_reset(struct nvme_ctrl *ctrl); + int nvme_delete_ctrl(struct nvme_ctrl *ctrl); + + int nvme_get_log(struct nvme_ctrl *ctrl, u32 nsid, u8 log_page, u8 lsp, diff --git a/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch b/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch new file mode 100644 index 00000000000..1b587aa9ea0 --- /dev/null +++ b/queue-5.4/nvme-restrict-management-ioctls-to-admin.patch @@ -0,0 +1,41 @@ +From 23e085b2dead13b51fe86d27069895b740f749c0 Mon Sep 17 00:00:00 2001 +From: Keith Busch +Date: Thu, 22 Sep 2022 07:54:06 -0700 +Subject: nvme: restrict management ioctls to admin + +From: Keith Busch + +commit 23e085b2dead13b51fe86d27069895b740f749c0 upstream. + +The passthrough commands already have this restriction, but the other +operations do not. Require the same capabilities for all users as all of +these operations, which include resets and rescans, can be disruptive. 
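+
+For reference, a small userspace probe of the new behaviour (a sketch:
+/dev/nvme0 is only an example path, and the caller is assumed to have
+open permission on the node but no CAP_SYS_ADMIN):
+
+  #include <errno.h>
+  #include <fcntl.h>
+  #include <stdio.h>
+  #include <string.h>
+  #include <unistd.h>
+  #include <sys/ioctl.h>
+  #include <linux/nvme_ioctl.h>
+
+  int main(void)
+  {
+          int fd = open("/dev/nvme0", O_RDONLY);
+
+          if (fd < 0) {
+                  perror("open /dev/nvme0");
+                  return 1;
+          }
+
+          if (ioctl(fd, NVME_IOCTL_RESCAN) < 0)
+                  /* Without CAP_SYS_ADMIN this now fails with EACCES. */
+                  printf("NVME_IOCTL_RESCAN: %s\n", strerror(errno));
+          else
+                  printf("NVME_IOCTL_RESCAN: allowed (CAP_SYS_ADMIN)\n");
+
+          close(fd);
+          return 0;
+  }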
+ +Signed-off-by: Keith Busch +Signed-off-by: Christoph Hellwig +Signed-off-by: Greg Kroah-Hartman +Signed-off-by: Ovidiu Panait +--- + drivers/nvme/host/core.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/nvme/host/core.c ++++ b/drivers/nvme/host/core.c +@@ -3027,11 +3027,17 @@ static long nvme_dev_ioctl(struct file * + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; + dev_warn(ctrl->device, "resetting controller\n"); + return nvme_reset_ctrl_sync(ctrl); + case NVME_IOCTL_SUBSYS_RESET: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; + return nvme_reset_subsystem(ctrl); + case NVME_IOCTL_RESCAN: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EACCES; + nvme_queue_scan(ctrl); + return 0; + default: diff --git a/queue-5.4/series b/queue-5.4/series index 4fc04ed9e27..2bcb95d87ec 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -145,3 +145,8 @@ selftests-net-add-delete-nexthop-route-warning-test.patch selftests-net-fix-nexthop-warning-cleanup-double-ip-.patch ipv4-handle-attempt-to-delete-multipath-route-when-f.patch ipv4-fix-route-deletion-when-nexthop-info-is-not-spe.patch +tracing-ring-buffer-have-polling-block-on-watermark.patch +epoll-call-final-ep_events_available-check-under-the-lock.patch +epoll-check-for-events-when-removing-a-timed-out-thread-from-the-wait-queue.patch +nvme-restrict-management-ioctls-to-admin.patch +nvme-ensure-subsystem-reset-is-single-threaded.patch diff --git a/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch b/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch new file mode 100644 index 00000000000..ae4f6d534c8 --- /dev/null +++ b/queue-5.4/tracing-ring-buffer-have-polling-block-on-watermark.patch @@ -0,0 +1,186 @@ +From 42fb0a1e84ff525ebe560e2baf9451ab69127e2b Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Thu, 20 Oct 2022 23:14:27 -0400 +Subject: tracing/ring-buffer: Have polling block on watermark + +From: Steven Rostedt (Google) + +commit 42fb0a1e84ff525ebe560e2baf9451ab69127e2b upstream. + +Currently the way polling works on the ring buffer is broken. It will +return immediately if there's any data in the ring buffer whereas a read +will block until the watermark (defined by the tracefs buffer_percent file) +is hit. + +That is, a select() or poll() will return as if there's data available, +but then the following read will block. This is broken for the way +select()s and poll()s are supposed to work. + +Have the polling on the ring buffer also block the same way reads and +splice does on the ring buffer. 
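+
+From userspace the change can be observed with a sketch like the
+following (paths assume tracefs is mounted at /sys/kernel/tracing and
+that cpu0 exists): with buffer_percent set to 50, poll() on a per-CPU
+trace_pipe_raw now blocks until half of the buffer is dirty, just as
+read() and splice() already did:
+
+  #include <fcntl.h>
+  #include <poll.h>
+  #include <stdio.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          struct pollfd pfd;
+          int fd;
+
+          /* Wake pollers only once 50% of the ring buffer is dirty. */
+          fd = open("/sys/kernel/tracing/buffer_percent", O_WRONLY);
+          if (fd >= 0) {
+                  write(fd, "50", 2);
+                  close(fd);
+          }
+
+          pfd.fd = open("/sys/kernel/tracing/per_cpu/cpu0/trace_pipe_raw",
+                        O_RDONLY | O_NONBLOCK);
+          if (pfd.fd < 0) {
+                  perror("trace_pipe_raw");
+                  return 1;
+          }
+          pfd.events = POLLIN;
+
+          /* Blocks until the watermark is hit (or forever if idle). */
+          if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
+                  puts("watermark reached, data ready");
+
+          close(pfd.fd);
+          return 0;
+  }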
+ +Link: https://lkml.kernel.org/r/20221020231427.41be3f26@gandalf.local.home + +Cc: Linux Trace Kernel +Cc: Masami Hiramatsu +Cc: Mathieu Desnoyers +Cc: Primiano Tucci +Cc: stable@vger.kernel.org +Fixes: 1e0d6714aceb7 ("ring-buffer: Do not wake up a splice waiter when page is not full") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/ring_buffer.h | 2 - + kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++++---------------- + kernel/trace/trace.c | 2 - + 3 files changed, 37 insertions(+), 21 deletions(-) + +--- a/include/linux/ring_buffer.h ++++ b/include/linux/ring_buffer.h +@@ -99,7 +99,7 @@ __ring_buffer_alloc(unsigned long size, + + int ring_buffer_wait(struct ring_buffer *buffer, int cpu, int full); + __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +- struct file *filp, poll_table *poll_table); ++ struct file *filp, poll_table *poll_table, int full); + + + #define RING_BUFFER_ALL_CPUS -1 +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -566,6 +566,21 @@ size_t ring_buffer_nr_dirty_pages(struct + return cnt - read; + } + ++static __always_inline bool full_hit(struct ring_buffer *buffer, int cpu, int full) ++{ ++ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; ++ size_t nr_pages; ++ size_t dirty; ++ ++ nr_pages = cpu_buffer->nr_pages; ++ if (!nr_pages || !full) ++ return true; ++ ++ dirty = ring_buffer_nr_dirty_pages(buffer, cpu); ++ ++ return (dirty * 100) > (full * nr_pages); ++} ++ + /* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * +@@ -661,22 +676,20 @@ int ring_buffer_wait(struct ring_buffer + !ring_buffer_empty_cpu(buffer, cpu)) { + unsigned long flags; + bool pagebusy; +- size_t nr_pages; +- size_t dirty; ++ bool done; + + if (!full) + break; + + raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page; +- nr_pages = cpu_buffer->nr_pages; +- dirty = ring_buffer_nr_dirty_pages(buffer, cpu); ++ done = !pagebusy && full_hit(buffer, cpu, full); ++ + if (!cpu_buffer->shortest_full || + cpu_buffer->shortest_full > full) + cpu_buffer->shortest_full = full; + raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +- if (!pagebusy && +- (!nr_pages || (dirty * 100) > full * nr_pages)) ++ if (done) + break; + } + +@@ -697,6 +710,7 @@ int ring_buffer_wait(struct ring_buffer + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor ++ * @full: wait until the percentage of pages are available, if @cpu != RING_BUFFER_ALL_CPUS + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise +@@ -706,14 +720,14 @@ int ring_buffer_wait(struct ring_buffer + * zero otherwise. 
+ */ + __poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +- struct file *filp, poll_table *poll_table) ++ struct file *filp, poll_table *poll_table, int full) + { + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + +- if (cpu == RING_BUFFER_ALL_CPUS) ++ if (cpu == RING_BUFFER_ALL_CPUS) { + work = &buffer->irq_work; +- else { ++ } else { + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return -EINVAL; + +@@ -721,8 +735,14 @@ __poll_t ring_buffer_poll_wait(struct ri + work = &cpu_buffer->irq_work; + } + +- poll_wait(filp, &work->waiters, poll_table); +- work->waiters_pending = true; ++ if (full) { ++ poll_wait(filp, &work->full_waiters, poll_table); ++ work->full_waiters_pending = true; ++ } else { ++ poll_wait(filp, &work->waiters, poll_table); ++ work->waiters_pending = true; ++ } ++ + /* + * There's a tight race between setting the waiters_pending and + * checking if the ring buffer is empty. Once the waiters_pending bit +@@ -738,6 +758,9 @@ __poll_t ring_buffer_poll_wait(struct ri + */ + smp_mb(); + ++ if (full) ++ return full_hit(buffer, cpu, full) ? EPOLLIN | EPOLLRDNORM : 0; ++ + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return EPOLLIN | EPOLLRDNORM; +@@ -2640,10 +2663,6 @@ static void rb_commit(struct ring_buffer + static __always_inline void + rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) + { +- size_t nr_pages; +- size_t dirty; +- size_t full; +- + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ +@@ -2667,10 +2686,7 @@ rb_wakeups(struct ring_buffer *buffer, s + + cpu_buffer->last_pages_touch = local_read(&cpu_buffer->pages_touched); + +- full = cpu_buffer->shortest_full; +- nr_pages = cpu_buffer->nr_pages; +- dirty = ring_buffer_nr_dirty_pages(buffer, cpu_buffer->cpu); +- if (full && nr_pages && (dirty * 100) <= full * nr_pages) ++ if (!full_hit(buffer, cpu_buffer->cpu, cpu_buffer->shortest_full)) + return; + + cpu_buffer->irq_work.wakeup_full = true; +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5993,7 +5993,7 @@ trace_poll(struct trace_iterator *iter, + return EPOLLIN | EPOLLRDNORM; + else + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, +- filp, poll_table); ++ filp, poll_table, iter->tr->buffer_percent); + } + + static __poll_t -- 2.47.3