]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob
c167187471b25032f43a78900c0eb71b4f98b1cc
[thirdparty/kernel/stable-queue.git] /
1 From 5fa54346caf67b4b1b10b1f390316ae466da4d53 Mon Sep 17 00:00:00 2001
2 From: Petr Mladek <pmladek@suse.com>
3 Date: Thu, 24 Jun 2021 18:39:48 -0700
4 Subject: kthread: prevent deadlock when kthread_mod_delayed_work() races with kthread_cancel_delayed_work_sync()
5
6 From: Petr Mladek <pmladek@suse.com>
7
8 commit 5fa54346caf67b4b1b10b1f390316ae466da4d53 upstream.
9
10 The system might hang with the following backtrace:
11
12 schedule+0x80/0x100
13 schedule_timeout+0x48/0x138
14 wait_for_common+0xa4/0x134
15 wait_for_completion+0x1c/0x2c
16 kthread_flush_work+0x114/0x1cc
17 kthread_cancel_work_sync.llvm.16514401384283632983+0xe8/0x144
18 kthread_cancel_delayed_work_sync+0x18/0x2c
19 xxxx_pm_notify+0xb0/0xd8
20 blocking_notifier_call_chain_robust+0x80/0x194
21 pm_notifier_call_chain_robust+0x28/0x4c
22 suspend_prepare+0x40/0x260
23 enter_state+0x80/0x3f4
24 pm_suspend+0x60/0xdc
25 state_store+0x108/0x144
26 kobj_attr_store+0x38/0x88
27 sysfs_kf_write+0x64/0xc0
28 kernfs_fop_write_iter+0x108/0x1d0
29 vfs_write+0x2f4/0x368
30 ksys_write+0x7c/0xec
31
32 It is caused by the following race between kthread_mod_delayed_work()
33 and kthread_cancel_delayed_work_sync():
34
35 CPU0 CPU1
36
37 Context: Thread A Context: Thread B
38
39 kthread_mod_delayed_work()
40 spin_lock()
41 __kthread_cancel_work()
42 spin_unlock()
43 del_timer_sync()
44 kthread_cancel_delayed_work_sync()
45 spin_lock()
46 __kthread_cancel_work()
47 spin_unlock()
48 del_timer_sync()
49 spin_lock()
50
51 work->canceling++
52 spin_unlock
53 spin_lock()
54 queue_delayed_work()
55 // dwork is put into the worker->delayed_work_list
56
57 spin_unlock()
58
59 kthread_flush_work()
60 // flush_work is put at the tail of the dwork
61
62 wait_for_completion()
63
64 Context: IRQ
65
66 kthread_delayed_work_timer_fn()
67 spin_lock()
68 list_del_init(&work->node);
69 spin_unlock()
70
71 BANG: flush_work is not longer linked and will never get proceed.
72
73 The problem is that kthread_mod_delayed_work() checks work->canceling
74 flag before canceling the timer.
75
76 A simple solution is to (re)check work->canceling after
77 __kthread_cancel_work(). But then it is not clear what should be
78 returned when __kthread_cancel_work() removed the work from the queue
79 (list) and it can't queue it again with the new @delay.
80
81 The return value might be used for reference counting. The caller has
82 to know whether a new work has been queued or an existing one was
83 replaced.
84
85 The proper solution is that kthread_mod_delayed_work() will remove the
86 work from the queue (list) _only_ when work->canceling is not set. The
87 flag must be checked after the timer is stopped and the remaining
88 operations can be done under worker->lock.
89
90 Note that kthread_mod_delayed_work() could remove the timer and then
91 bail out. It is fine. The other canceling caller needs to cancel the
92 timer as well. The important thing is that the queue (list)
93 manipulation is done atomically under worker->lock.
94
95 Link: https://lkml.kernel.org/r/20210610133051.15337-3-pmladek@suse.com
96 Fixes: 9a6b06c8d9a220860468a ("kthread: allow to modify delayed kthread work")
97 Signed-off-by: Petr Mladek <pmladek@suse.com>
98 Reported-by: Martin Liu <liumartin@google.com>
99 Cc: <jenhaochen@google.com>
100 Cc: Minchan Kim <minchan@google.com>
101 Cc: Nathan Chancellor <nathan@kernel.org>
102 Cc: Nick Desaulniers <ndesaulniers@google.com>
103 Cc: Oleg Nesterov <oleg@redhat.com>
104 Cc: Tejun Heo <tj@kernel.org>
105 Cc: <stable@vger.kernel.org>
106 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
107 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
108 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
109 ---
110 kernel/kthread.c | 35 ++++++++++++++++++++++++-----------
111 1 file changed, 24 insertions(+), 11 deletions(-)
112
113 --- a/kernel/kthread.c
114 +++ b/kernel/kthread.c
115 @@ -1119,8 +1119,11 @@ static void kthread_cancel_delayed_work_
116 }
117
118 /*
119 - * This function removes the work from the worker queue. Also it makes sure
120 - * that it won't get queued later via the delayed work's timer.
121 + * This function removes the work from the worker queue.
122 + *
123 + * It is called under worker->lock. The caller must make sure that
124 + * the timer used by delayed work is not running, e.g. by calling
125 + * kthread_cancel_delayed_work_timer().
126 *
127 * The work might still be in use when this function finishes. See the
128 * current_work proceed by the worker.
129 @@ -1128,13 +1131,8 @@ static void kthread_cancel_delayed_work_
130 * Return: %true if @work was pending and successfully canceled,
131 * %false if @work was not pending
132 */
133 -static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork,
134 - unsigned long *flags)
135 +static bool __kthread_cancel_work(struct kthread_work *work)
136 {
137 - /* Try to cancel the timer if exists. */
138 - if (is_dwork)
139 - kthread_cancel_delayed_work_timer(work, flags);
140 -
141 /*
142 * Try to remove the work from a worker list. It might either
143 * be from worker->work_list or from worker->delayed_work_list.
144 @@ -1187,11 +1185,23 @@ bool kthread_mod_delayed_work(struct kth
145 /* Work must not be used with >1 worker, see kthread_queue_work() */
146 WARN_ON_ONCE(work->worker != worker);
147
148 - /* Do not fight with another command that is canceling this work. */
149 + /*
150 + * Temporary cancel the work but do not fight with another command
151 + * that is canceling the work as well.
152 + *
153 + * It is a bit tricky because of possible races with another
154 + * mod_delayed_work() and cancel_delayed_work() callers.
155 + *
156 + * The timer must be canceled first because worker->lock is released
157 + * when doing so. But the work can be removed from the queue (list)
158 + * only when it can be queued again so that the return value can
159 + * be used for reference counting.
160 + */
161 + kthread_cancel_delayed_work_timer(work, &flags);
162 if (work->canceling)
163 goto out;
164 + ret = __kthread_cancel_work(work);
165
166 - ret = __kthread_cancel_work(work, true, &flags);
167 fast_queue:
168 __kthread_queue_delayed_work(worker, dwork, delay);
169 out:
170 @@ -1213,7 +1223,10 @@ static bool __kthread_cancel_work_sync(s
171 /* Work must not be used with >1 worker, see kthread_queue_work(). */
172 WARN_ON_ONCE(work->worker != worker);
173
174 - ret = __kthread_cancel_work(work, is_dwork, &flags);
175 + if (is_dwork)
176 + kthread_cancel_delayed_work_timer(work, &flags);
177 +
178 + ret = __kthread_cancel_work(work);
179
180 if (worker->current_work != work)
181 goto out_fast;