From 2e898e4c0a3897ccd434adac5abb8330194f527b Mon Sep 17 00:00:00 2001
From: Greg Thelen <gthelen@google.com>
Date: Fri, 20 Apr 2018 14:55:42 -0700
Subject: writeback: safer lock nesting

From: Greg Thelen <gthelen@google.com>

commit 2e898e4c0a3897ccd434adac5abb8330194f527b upstream.

lock_page_memcg()/unlock_page_memcg() use spin_lock_irqsave/restore() if
the page's memcg is undergoing move accounting, which occurs when a
process leaves its memcg for a new one that has
memory.move_charge_at_immigrate set.

unlocked_inode_to_wb_begin,end() use spin_lock_irq/spin_unlock_irq() if
the given inode is switching writeback domains.  Switches occur when
enough writes are issued from a new domain.

This existing pattern is thus suspicious:

    lock_page_memcg(page);
    unlocked_inode_to_wb_begin(inode, &locked);
    ...
    unlocked_inode_to_wb_end(inode, locked);
    unlock_page_memcg(page);

If an inode switch and a process memcg migration are both in-flight, then
unlocked_inode_to_wb_end() will unconditionally enable interrupts while
still holding the lock_page_memcg() irq spinlock.  This suggests the
possibility of deadlock if an interrupt occurs before unlock_page_memcg().

    truncate
    __cancel_dirty_page
    lock_page_memcg
    unlocked_inode_to_wb_begin
    unlocked_inode_to_wb_end
    <interrupts mistakenly enabled>
                    <interrupt>
                    end_page_writeback
                    test_clear_page_writeback
                    lock_page_memcg
                    <deadlock>
    unlock_page_memcg

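In outline, the fix below replaces the bool passed between the two helpers
with a small cookie that records both whether tree_lock was taken and the
IRQ state saved at that point, so unlocked_inode_to_wb_end() restores
exactly that state instead of unconditionally re-enabling interrupts.  A
simplified sketch of the hunks that follow:

    struct wb_lock_cookie {
            bool locked;
            unsigned long flags;
    };

    /* begin: take tree_lock with irqsave only while the inode is switching */
    cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
    if (unlikely(cookie->locked))
            spin_lock_irqsave(&inode->i_mapping->tree_lock, cookie->flags);

    /* end: restore the saved IRQ state rather than enabling IRQs outright */
    if (unlikely(cookie->locked))
            spin_unlock_irqrestore(&inode->i_mapping->tree_lock, cookie->flags);

A caller such as __cancel_dirty_page(), which enters the transaction with
interrupts already disabled by lock_page_memcg(), therefore leaves it with
interrupts still disabled, closing the window shown in the trace above.
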
Due to configuration limitations this deadlock is not currently possible
because we don't mix cgroup writeback (a cgroupv2 feature) and
memory.move_charge_at_immigrate (a cgroupv1 feature).

If the kernel is hacked to always claim inode switching and memcg
moving_account, then this script triggers a lockup in less than a minute:

    cd /mnt/cgroup/memory
    mkdir a b
    echo 1 > a/memory.move_charge_at_immigrate
    echo 1 > b/memory.move_charge_at_immigrate
    (
            echo $BASHPID > a/cgroup.procs
            while true; do
                    dd if=/dev/zero of=/mnt/big bs=1M count=256
            done
    ) &
    while true; do
            sync
    done &
    sleep 1h &
    SLEEP=$!
    while true; do
            echo $SLEEP > a/cgroup.procs
            echo $SLEEP > b/cgroup.procs
    done

The deadlock does not seem possible, so it's debatable whether there's any
reason to modify the kernel.  I suggest we should, to prevent future
surprises.  And Wang Long said "this deadlock occurs three times in our
environment", so there's more reason to apply this, even to stable.
Stable 4.4 has minor conflicts applying this patch.  For a clean 4.4 patch
see "[PATCH for-4.4] writeback: safer lock nesting"
https://lkml.org/lkml/2018/4/11/146

Wang Long said "this deadlock occurs three times in our environment"

[gthelen@google.com: v4]
Link: http://lkml.kernel.org/r/20180411084653.254724-1-gthelen@google.com
[akpm@linux-foundation.org: comment tweaks, struct initialization simplification]
Change-Id: Ibb773e8045852978f6207074491d262f1b3fb613
Link: http://lkml.kernel.org/r/20180410005908.167976-1-gthelen@google.com
Fixes: 682aa8e1a6a1 ("writeback: implement unlocked_inode_to_wb transaction and use it for stat updates")
Signed-off-by: Greg Thelen <gthelen@google.com>
Reported-by: Wang Long <wanglong19@meituan.com>
Acked-by: Wang Long <wanglong19@meituan.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: <stable@vger.kernel.org> [v4.2+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
[natechancellor: Adjust context due to lack of b93b016313b3b]
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/fs-writeback.c                |    7 ++++---
 include/linux/backing-dev-defs.h |    5 +++++
 include/linux/backing-dev.h      |   30 ++++++++++++++++--------------
 mm/page-writeback.c              |   18 +++++++++---------
 4 files changed, 34 insertions(+), 26 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -745,11 +745,12 @@ int inode_congested(struct inode *inode,
          */
         if (inode && inode_to_wb_is_valid(inode)) {
                 struct bdi_writeback *wb;
-                bool locked, congested;
+                struct wb_lock_cookie lock_cookie = {};
+                bool congested;
 
-                wb = unlocked_inode_to_wb_begin(inode, &locked);
+                wb = unlocked_inode_to_wb_begin(inode, &lock_cookie);
                 congested = wb_congested(wb, cong_bits);
-                unlocked_inode_to_wb_end(inode, locked);
+                unlocked_inode_to_wb_end(inode, &lock_cookie);
                 return congested;
         }
 
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -223,6 +223,11 @@ static inline void set_bdi_congested(str
         set_wb_congested(bdi->wb.congested, sync);
 }
 
+struct wb_lock_cookie {
+        bool locked;
+        unsigned long flags;
+};
+
 #ifdef CONFIG_CGROUP_WRITEBACK
 
 /**
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -346,7 +346,7 @@ static inline struct bdi_writeback *inod
 /**
  * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
  * @inode: target inode
- * @lockedp: temp bool output param, to be passed to the end function
+ * @cookie: output param, to be passed to the end function
  *
  * The caller wants to access the wb associated with @inode but isn't
  * holding inode->i_lock, mapping->tree_lock or wb->list_lock. This
@@ -354,12 +354,12 @@ static inline struct bdi_writeback *inod
 * association doesn't change until the transaction is finished with
 * unlocked_inode_to_wb_end().
 *
- * The caller must call unlocked_inode_to_wb_end() with *@lockdep
- * afterwards and can't sleep during transaction. IRQ may or may not be
- * disabled on return.
+ * The caller must call unlocked_inode_to_wb_end() with *@cookie afterwards and
+ * can't sleep during the transaction. IRQs may or may not be disabled on
+ * return.
  */
 static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
         rcu_read_lock();
 
@@ -367,10 +367,10 @@ unlocked_inode_to_wb_begin(struct inode
          * Paired with store_release in inode_switch_wb_work_fn() and
          * ensures that we see the new wb if we see cleared I_WB_SWITCH.
          */
-        *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+        cookie->locked = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
 
-        if (unlikely(*lockedp))
-                spin_lock_irq(&inode->i_mapping->tree_lock);
+        if (unlikely(cookie->locked))
+                spin_lock_irqsave(&inode->i_mapping->tree_lock, cookie->flags);
 
         /*
          * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
@@ -382,12 +382,13 @@ unlocked_inode_to_wb_begin(struct inode
 /**
  * unlocked_inode_to_wb_end - end inode wb access transaction
  * @inode: target inode
- * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ * @cookie: @cookie from unlocked_inode_to_wb_begin()
  */
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+                                            struct wb_lock_cookie *cookie)
 {
-        if (unlikely(locked))
-                spin_unlock_irq(&inode->i_mapping->tree_lock);
+        if (unlikely(cookie->locked))
+                spin_unlock_irqrestore(&inode->i_mapping->tree_lock, cookie->flags);
 
         rcu_read_unlock();
 }
@@ -434,12 +435,13 @@ static inline struct bdi_writeback *inod
 }
 
 static inline struct bdi_writeback *
-unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+unlocked_inode_to_wb_begin(struct inode *inode, struct wb_lock_cookie *cookie)
 {
         return inode_to_wb(inode);
 }
 
-static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+static inline void unlocked_inode_to_wb_end(struct inode *inode,
+                                            struct wb_lock_cookie *cookie)
 {
 }
 
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2501,13 +2501,13 @@ void account_page_redirty(struct page *p
         if (mapping && mapping_cap_account_dirty(mapping)) {
                 struct inode *inode = mapping->host;
                 struct bdi_writeback *wb;
-                bool locked;
+                struct wb_lock_cookie cookie = {};
 
-                wb = unlocked_inode_to_wb_begin(inode, &locked);
+                wb = unlocked_inode_to_wb_begin(inode, &cookie);
                 current->nr_dirtied--;
                 dec_node_page_state(page, NR_DIRTIED);
                 dec_wb_stat(wb, WB_DIRTIED);
-                unlocked_inode_to_wb_end(inode, locked);
+                unlocked_inode_to_wb_end(inode, &cookie);
         }
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2613,15 +2613,15 @@ void __cancel_dirty_page(struct page *pa
         if (mapping_cap_account_dirty(mapping)) {
                 struct inode *inode = mapping->host;
                 struct bdi_writeback *wb;
-                bool locked;
+                struct wb_lock_cookie cookie = {};
 
                 lock_page_memcg(page);
-                wb = unlocked_inode_to_wb_begin(inode, &locked);
+                wb = unlocked_inode_to_wb_begin(inode, &cookie);
 
                 if (TestClearPageDirty(page))
                         account_page_cleaned(page, mapping, wb);
 
-                unlocked_inode_to_wb_end(inode, locked);
+                unlocked_inode_to_wb_end(inode, &cookie);
                 unlock_page_memcg(page);
         } else {
                 ClearPageDirty(page);
@@ -2653,7 +2653,7 @@ int clear_page_dirty_for_io(struct page
         if (mapping && mapping_cap_account_dirty(mapping)) {
                 struct inode *inode = mapping->host;
                 struct bdi_writeback *wb;
-                bool locked;
+                struct wb_lock_cookie cookie = {};
 
                 /*
                  * Yes, Virginia, this is indeed insane.
@@ -2690,14 +2690,14 @@ int clear_page_dirty_for_io(struct page
                  * always locked coming in here, so we get the desired
                  * exclusion.
                  */
-                wb = unlocked_inode_to_wb_begin(inode, &locked);
+                wb = unlocked_inode_to_wb_begin(inode, &cookie);
                 if (TestClearPageDirty(page)) {
                         dec_lruvec_page_state(page, NR_FILE_DIRTY);
                         dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
                         dec_wb_stat(wb, WB_RECLAIMABLE);
                         ret = 1;
                 }
-                unlocked_inode_to_wb_end(inode, locked);
+                unlocked_inode_to_wb_end(inode, &cookie);
                 return ret;
         }
         return TestClearPageDirty(page);