]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.14-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 8 Jul 2021 18:32:18 +0000 (20:32 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 8 Jul 2021 18:32:18 +0000 (20:32 +0200)
added patches:
kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch
kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch
xen-events-reset-active-flag-for-lateeoi-events-later.patch

queue-4.14/kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch [new file with mode: 0644]
queue-4.14/kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch [new file with mode: 0644]
queue-4.14/series
queue-4.14/xen-events-reset-active-flag-for-lateeoi-events-later.patch [new file with mode: 0644]

diff --git a/queue-4.14/kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch b/queue-4.14/kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch
new file mode 100644 (file)
index 0000000..09f77ed
--- /dev/null
@@ -0,0 +1,182 @@
+From 5fa54346caf67b4b1b10b1f390316ae466da4d53 Mon Sep 17 00:00:00 2001
+From: Petr Mladek <pmladek@suse.com>
+Date: Thu, 24 Jun 2021 18:39:48 -0700
+Subject: kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
+
+From: Petr Mladek <pmladek@suse.com>
+
+commit 5fa54346caf67b4b1b10b1f390316ae466da4d53 upstream.
+
+The system might hang with the following backtrace:
+
+       schedule+0x80/0x100
+       schedule_timeout+0x48/0x138
+       wait_for_common+0xa4/0x134
+       wait_for_completion+0x1c/0x2c
+       kthread_flush_work+0x114/0x1cc
+       kthread_cancel_work_sync.llvm.16514401384283632983+0xe8/0x144
+       kthread_cancel_delayed_work_sync+0x18/0x2c
+       xxxx_pm_notify+0xb0/0xd8
+       blocking_notifier_call_chain_robust+0x80/0x194
+       pm_notifier_call_chain_robust+0x28/0x4c
+       suspend_prepare+0x40/0x260
+       enter_state+0x80/0x3f4
+       pm_suspend+0x60/0xdc
+       state_store+0x108/0x144
+       kobj_attr_store+0x38/0x88
+       sysfs_kf_write+0x64/0xc0
+       kernfs_fop_write_iter+0x108/0x1d0
+       vfs_write+0x2f4/0x368
+       ksys_write+0x7c/0xec
+
+It is caused by the following race between kthread_mod_delayed_work()
+and kthread_cancel_delayed_work_sync():
+
+CPU0                           CPU1
+
+Context: Thread A              Context: Thread B
+
+kthread_mod_delayed_work()
+  spin_lock()
+  __kthread_cancel_work()
+     spin_unlock()
+     del_timer_sync()
+                               kthread_cancel_delayed_work_sync()
+                                 spin_lock()
+                                 __kthread_cancel_work()
+                                   spin_unlock()
+                                   del_timer_sync()
+                                   spin_lock()
+
+                                 work->canceling++
+                                 spin_unlock
+     spin_lock()
+   queue_delayed_work()
+     // dwork is put into the worker->delayed_work_list
+
+   spin_unlock()
+
+                                 kthread_flush_work()
+     // flush_work is put at the tail of the dwork
+
+                                   wait_for_completion()
+
+Context: IRQ
+
+  kthread_delayed_work_timer_fn()
+    spin_lock()
+    list_del_init(&work->node);
+    spin_unlock()
+
+BANG: flush_work is not longer linked and will never get proceed.
+
+The problem is that kthread_mod_delayed_work() checks work->canceling
+flag before canceling the timer.
+
+A simple solution is to (re)check work->canceling after
+__kthread_cancel_work().  But then it is not clear what should be
+returned when __kthread_cancel_work() removed the work from the queue
+(list) and it can't queue it again with the new @delay.
+
+The return value might be used for reference counting.  The caller has
+to know whether a new work has been queued or an existing one was
+replaced.
+
+The proper solution is that kthread_mod_delayed_work() will remove the
+work from the queue (list) _only_ when work->canceling is not set.  The
+flag must be checked after the timer is stopped and the remaining
+operations can be done under worker->lock.
+
+Note that kthread_mod_delayed_work() could remove the timer and then
+bail out.  It is fine.  The other canceling caller needs to cancel the
+timer as well.  The important thing is that the queue (list)
+manipulation is done atomically under worker->lock.
+
+Link: https://lkml.kernel.org/r/20210610133051.15337-3-pmladek@suse.com
+Fixes: 9a6b06c8d9a220860468a ("kthread: allow to modify delayed kthread work")
+Signed-off-by: Petr Mladek <pmladek@suse.com>
+Reported-by: Martin Liu <liumartin@google.com>
+Cc: <jenhaochen@google.com>
+Cc: Minchan Kim <minchan@google.com>
+Cc: Nathan Chancellor <nathan@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/kthread.c |   35 ++++++++++++++++++++++++-----------
+ 1 file changed, 24 insertions(+), 11 deletions(-)
+
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -1006,8 +1006,11 @@ static void kthread_cancel_delayed_work_
+ }
+ /*
+- * This function removes the work from the worker queue. Also it makes sure
+- * that it won't get queued later via the delayed work's timer.
++ * This function removes the work from the worker queue.
++ *
++ * It is called under worker->lock. The caller must make sure that
++ * the timer used by delayed work is not running, e.g. by calling
++ * kthread_cancel_delayed_work_timer().
+  *
+  * The work might still be in use when this function finishes. See the
+  * current_work proceed by the worker.
+@@ -1015,13 +1018,8 @@ static void kthread_cancel_delayed_work_
+  * Return: %true if @work was pending and successfully canceled,
+  *    %false if @work was not pending
+  */
+-static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
+-                                unsigned long *flags)
++static bool __kthread_cancel_work(struct kthread_work *work)
+ {
+-      /* Try to cancel the timer if exists. */
+-      if (is_dwork)
+-              kthread_cancel_delayed_work_timer(work, flags);
+-
+       /*
+        * Try to remove the work from a worker list. It might either
+        * be from worker->work_list or from worker->delayed_work_list.
+@@ -1074,11 +1072,23 @@ bool kthread_mod_delayed_work(struct kth
+       /* Work must not be used with >1 worker, see kthread_queue_work() */
+       WARN_ON_ONCE(work->worker != worker);
+-      /* Do not fight with another command that is canceling this work. */
++      /*
++       * Temporary cancel the work but do not fight with another command
++       * that is canceling the work as well.
++       *
++       * It is a bit tricky because of possible races with another
++       * mod_delayed_work() and cancel_delayed_work() callers.
++       *
++       * The timer must be canceled first because worker->lock is released
++       * when doing so. But the work can be removed from the queue (list)
++       * only when it can be queued again so that the return value can
++       * be used for reference counting.
++       */
++      kthread_cancel_delayed_work_timer(work, &flags);
+       if (work->canceling)
+               goto out;
++      ret = __kthread_cancel_work(work);
+-      ret = __kthread_cancel_work(work, true, &flags);
+ fast_queue:
+       __kthread_queue_delayed_work(worker, dwork, delay);
+ out:
+@@ -1100,7 +1110,10 @@ static bool __kthread_cancel_work_sync(s
+       /* Work must not be used with >1 worker, see kthread_queue_work(). */
+       WARN_ON_ONCE(work->worker != worker);
+-      ret = __kthread_cancel_work(work, is_dwork, &flags);
++      if (is_dwork)
++              kthread_cancel_delayed_work_timer(work, &flags);
++
++      ret = __kthread_cancel_work(work);
+       if (worker->current_work != work)
+               goto out_fast;
diff --git a/queue-4.14/kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch b/queue-4.14/kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch
new file mode 100644 (file)
index 0000000..0969875
--- /dev/null
@@ -0,0 +1,103 @@
+From 34b3d5344719d14fd2185b2d9459b3abcb8cf9d8 Mon Sep 17 00:00:00 2001
+From: Petr Mladek <pmladek@suse.com>
+Date: Thu, 24 Jun 2021 18:39:45 -0700
+Subject: kthread_worker: split code for canceling the delayed work timer
+
+From: Petr Mladek <pmladek@suse.com>
+
+commit 34b3d5344719d14fd2185b2d9459b3abcb8cf9d8 upstream.
+
+Patch series "kthread_worker: Fix race between kthread_mod_delayed_work()
+and kthread_cancel_delayed_work_sync()".
+
+This patchset fixes the race between kthread_mod_delayed_work() and
+kthread_cancel_delayed_work_sync() including proper return value
+handling.
+
+This patch (of 2):
+
+Simple code refactoring as a preparation step for fixing a race between
+kthread_mod_delayed_work() and kthread_cancel_delayed_work_sync().
+
+It does not modify the existing behavior.
+
+Link: https://lkml.kernel.org/r/20210610133051.15337-2-pmladek@suse.com
+Signed-off-by: Petr Mladek <pmladek@suse.com>
+Cc: <jenhaochen@google.com>
+Cc: Martin Liu <liumartin@google.com>
+Cc: Minchan Kim <minchan@google.com>
+Cc: Nathan Chancellor <nathan@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ kernel/kthread.c |   46 +++++++++++++++++++++++++++++-----------------
+ 1 file changed, 29 insertions(+), 17 deletions(-)
+
+--- a/kernel/kthread.c
++++ b/kernel/kthread.c
+@@ -979,6 +979,33 @@ void kthread_flush_work(struct kthread_w
+ EXPORT_SYMBOL_GPL(kthread_flush_work);
+ /*
++ * Make sure that the timer is neither set nor running and could
++ * not manipulate the work list_head any longer.
++ *
++ * The function is called under worker->lock. The lock is temporary
++ * released but the timer can't be set again in the meantime.
++ */
++static void kthread_cancel_delayed_work_timer(struct kthread_work *work,
++                                            unsigned long *flags)
++{
++      struct kthread_delayed_work *dwork =
++              container_of(work, struct kthread_delayed_work, work);
++      struct kthread_worker *worker = work->worker;
++
++      /*
++       * del_timer_sync() must be called to make sure that the timer
++       * callback is not running. The lock must be temporary released
++       * to avoid a deadlock with the callback. In the meantime,
++       * any queuing is blocked by setting the canceling counter.
++       */
++      work->canceling++;
++      spin_unlock_irqrestore(&worker->lock, *flags);
++      del_timer_sync(&dwork->timer);
++      spin_lock_irqsave(&worker->lock, *flags);
++      work->canceling--;
++}
++
++/*
+  * This function removes the work from the worker queue. Also it makes sure
+  * that it won't get queued later via the delayed work's timer.
+  *
+@@ -992,23 +1019,8 @@ static bool __kthread_cancel_work(struct
+                                 unsigned long *flags)
+ {
+       /* Try to cancel the timer if exists. */
+-      if (is_dwork) {
+-              struct kthread_delayed_work *dwork =
+-                      container_of(work, struct kthread_delayed_work, work);
+-              struct kthread_worker *worker = work->worker;
+-
+-              /*
+-               * del_timer_sync() must be called to make sure that the timer
+-               * callback is not running. The lock must be temporary released
+-               * to avoid a deadlock with the callback. In the meantime,
+-               * any queuing is blocked by setting the canceling counter.
+-               */
+-              work->canceling++;
+-              spin_unlock_irqrestore(&worker->lock, *flags);
+-              del_timer_sync(&dwork->timer);
+-              spin_lock_irqsave(&worker->lock, *flags);
+-              work->canceling--;
+-      }
++      if (is_dwork)
++              kthread_cancel_delayed_work_timer(work, flags);
+       /*
+        * Try to remove the work from a worker list. It might either
index 6829ef67377fe6e36fdb888bd50bf4c44f13bff3..1f7dd0768ded89dcfcd20a76d3564ef045aa9de4 100644 (file)
@@ -20,3 +20,6 @@ mm-futex-fix-shared-futex-pgoff-on-shmem-huge-page.patch
 scsi-sr-return-appropriate-error-code-when-disk-is-e.patch
 drm-nouveau-fix-dma_address-check-for-cpu-gpu-sync.patch
 kfifo-declare_kifo_ptr-fifo-u64-does-not-work-on-arm-32-bit.patch
+kthread_worker-split-code-for-canceling-the-delayed-work-timer.patch
+kthread-prevent-deadlock-when-kthread_mod_delayed_work-races-with-kthread_cancel_delayed_work_sync.patch
+xen-events-reset-active-flag-for-lateeoi-events-later.patch
diff --git a/queue-4.14/xen-events-reset-active-flag-for-lateeoi-events-later.patch b/queue-4.14/xen-events-reset-active-flag-for-lateeoi-events-later.patch
new file mode 100644 (file)
index 0000000..7e82121
--- /dev/null
@@ -0,0 +1,69 @@
+From 3de218ff39b9e3f0d453fe3154f12a174de44b25 Mon Sep 17 00:00:00 2001
+From: Juergen Gross <jgross@suse.com>
+Date: Wed, 23 Jun 2021 15:09:13 +0200
+Subject: xen/events: reset active flag for lateeoi events later
+
+From: Juergen Gross <jgross@suse.com>
+
+commit 3de218ff39b9e3f0d453fe3154f12a174de44b25 upstream.
+
+In order to avoid a race condition for user events when changing
+cpu affinity reset the active flag only when EOI-ing the event.
+
+This is working fine as all user events are lateeoi events. Note that
+lateeoi_ack_mask_dynirq() is not modified as there is no explicit call
+to xen_irq_lateeoi() expected later.
+
+Cc: stable@vger.kernel.org
+Reported-by: Julien Grall <julien@xen.org>
+Fixes: b6622798bc50b62 ("xen/events: avoid handling the same event on two cpus at the same time")
+Tested-by: Julien Grall <julien@xen.org>
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Reviewed-by: Boris Ostrovsky <boris.ostrvsky@oracle.com>
+Link: https://lore.kernel.org/r/20210623130913.9405-1-jgross@suse.com
+Signed-off-by: Juergen Gross <jgross@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/xen/events/events_base.c |   23 +++++++++++++++++++----
+ 1 file changed, 19 insertions(+), 4 deletions(-)
+
+--- a/drivers/xen/events/events_base.c
++++ b/drivers/xen/events/events_base.c
+@@ -524,6 +524,9 @@ static void xen_irq_lateeoi_locked(struc
+       }
+       info->eoi_time = 0;
++
++      /* is_active hasn't been reset yet, do it now. */
++      smp_store_release(&info->is_active, 0);
+       do_unmask(info, EVT_MASK_REASON_EOI_PENDING);
+ }
+@@ -1780,10 +1783,22 @@ static void lateeoi_ack_dynirq(struct ir
+       struct irq_info *info = info_for_irq(data->irq);
+       evtchn_port_t evtchn = info ? info->evtchn : 0;
+-      if (VALID_EVTCHN(evtchn)) {
+-              do_mask(info, EVT_MASK_REASON_EOI_PENDING);
+-              ack_dynirq(data);
+-      }
++      if (!VALID_EVTCHN(evtchn))
++              return;
++
++      do_mask(info, EVT_MASK_REASON_EOI_PENDING);
++
++      if (unlikely(irqd_is_setaffinity_pending(data)) &&
++          likely(!irqd_irq_disabled(data))) {
++              do_mask(info, EVT_MASK_REASON_TEMPORARY);
++
++              clear_evtchn(evtchn);
++
++              irq_move_masked_irq(data);
++
++              do_unmask(info, EVT_MASK_REASON_TEMPORARY);
++      } else
++              clear_evtchn(evtchn);
+ }
+ static void lateeoi_mask_ack_dynirq(struct irq_data *data)