From 4bc034d35377196c854236133b07730a777c4aba Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 29 Mar 2019 10:46:16 -0700
Subject: Revert "MD: fix lock contention for flush bios"

From: NeilBrown <neilb@suse.com>

commit 4bc034d35377196c854236133b07730a777c4aba upstream.

This reverts commit 5a409b4f56d50b212334f338cb8465d65550cd85.

This patch has two problems.

1/ it makes multiple calls to submit_bio() from inside a make_request_fn.
   The bios thus submitted will be queued on current->bio_list and not
   submitted immediately.  As the bios are allocated from a mempool,
   this can theoretically result in a deadlock - the entire pool of
   requests could end up parked on various ->bio_list queues while a
   subsequent mempool_alloc blocks waiting for one of them to be
   released.
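
   An illustrative sketch of the hazard (simplified; not code from the
   tree): the reverted commit allocates a flush_info from an 8-entry
   mempool for every flush, so eight in-flight flushes can each call
   submit_bio() from inside the make_request_fn.  Those bios only sit
   on current->bio_list, so none of them completes, no flush_info is
   freed, and a ninth caller can then block forever:

	/* all NR_FLUSH_INFOS (8) entries are already handed out and
	 * their bios are parked on current->bio_list; nothing will
	 * free an entry, so this allocation never returns
	 */
	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);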

2/ It aims to handle the case where there are many concurrent flush
   requests.  It does so by submitting many requests in parallel - all
   of which are identical, so most of them do nothing useful.
   It would be more efficient to send just one lower-level request but
   allow it to satisfy multiple upper-level requests.
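
   For reference, the serialized scheme this revert restores looks like
   the following (condensed from the diff below; the batched
   one-request-satisfies-many scheme suggested above is not itself part
   of this revert).  Only one flush bio is in flight per mddev, and
   later flushers wait on sb_wait:

	void md_flush_request(struct mddev *mddev, struct bio *bio)
	{
		/* one flush in flight per mddev; later callers wait */
		spin_lock_irq(&mddev->lock);
		wait_event_lock_irq(mddev->sb_wait,
				    !mddev->flush_bio,
				    mddev->lock);
		mddev->flush_bio = bio;
		spin_unlock_irq(&mddev->lock);

		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	}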

Fixes: 5a409b4f56d5 ("MD: fix lock contention for flush bios")
Cc: <stable@vger.kernel.org> # v4.19+
Tested-by: Xiao Ni <xni@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 drivers/md/md.c | 159 +++++++++++++++++++-------------------------------------
 drivers/md/md.h | 22 ++-----
 2 files changed, 62 insertions(+), 119 deletions(-)

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -132,24 +132,6 @@ static inline int speed_max(struct mddev
 		mddev->sync_speed_max : sysctl_speed_limit_max;
 }
 
-static void * flush_info_alloc(gfp_t gfp_flags, void *data)
-{
-	return kzalloc(sizeof(struct flush_info), gfp_flags);
-}
-static void flush_info_free(void *flush_info, void *data)
-{
-	kfree(flush_info);
-}
-
-static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
-{
-	return kzalloc(sizeof(struct flush_bio), gfp_flags);
-}
-static void flush_bio_free(void *flush_bio, void *data)
-{
-	kfree(flush_bio);
-}
-
 static struct ctl_table_header *raid_table_header;
 
 static struct ctl_table raid_table[] = {
@@ -423,54 +405,30 @@ static int md_congested(void *data, int
 /*
  * Generic flush handling for md
  */
-static void submit_flushes(struct work_struct *ws)
-{
-	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
-	struct mddev *mddev = fi->mddev;
-	struct bio *bio = fi->bio;
-
-	bio->bi_opf &= ~REQ_PREFLUSH;
-	md_handle_request(mddev, bio);
-
-	mempool_free(fi, mddev->flush_pool);
-}
 
-static void md_end_flush(struct bio *fbio)
+static void md_end_flush(struct bio *bio)
 {
-	struct flush_bio *fb = fbio->bi_private;
-	struct md_rdev *rdev = fb->rdev;
-	struct flush_info *fi = fb->fi;
-	struct bio *bio = fi->bio;
-	struct mddev *mddev = fi->mddev;
+	struct md_rdev *rdev = bio->bi_private;
+	struct mddev *mddev = rdev->mddev;
 
 	rdev_dec_pending(rdev, mddev);
 
-	if (atomic_dec_and_test(&fi->flush_pending)) {
-		if (bio->bi_iter.bi_size == 0) {
-			/* an empty barrier - all done */
-			bio_endio(bio);
-			mempool_free(fi, mddev->flush_pool);
-		} else {
-			INIT_WORK(&fi->flush_work, submit_flushes);
-			queue_work(md_wq, &fi->flush_work);
-		}
+	if (atomic_dec_and_test(&mddev->flush_pending)) {
+		/* The pre-request flush has finished */
+		queue_work(md_wq, &mddev->flush_work);
 	}
-
-	mempool_free(fb, mddev->flush_bio_pool);
-	bio_put(fbio);
+	bio_put(bio);
 }
 
-void md_flush_request(struct mddev *mddev, struct bio *bio)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
 {
+	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
 	struct md_rdev *rdev;
-	struct flush_info *fi;
-
-	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
-
-	fi->bio = bio;
-	fi->mddev = mddev;
-	atomic_set(&fi->flush_pending, 1);
 
+	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+	atomic_set(&mddev->flush_pending, 1);
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
@@ -480,40 +438,59 @@ void md_flush_request(struct mddev *mdde
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
-			struct flush_bio *fb;
 			atomic_inc(&rdev->nr_pending);
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
-
-			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
-			fb->fi = fi;
-			fb->rdev = rdev;
-
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-			bio_set_dev(bi, rdev->bdev);
 			bi->bi_end_io = md_end_flush;
-			bi->bi_private = fb;
+			bi->bi_private = rdev;
+			bio_set_dev(bi, rdev->bdev);
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
-			atomic_inc(&fi->flush_pending);
+			atomic_inc(&mddev->flush_pending);
 			submit_bio(bi);
-
 			rcu_read_lock();
 			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
+	if (atomic_dec_and_test(&mddev->flush_pending))
+		queue_work(md_wq, &mddev->flush_work);
+}
 
-	if (atomic_dec_and_test(&fi->flush_pending)) {
-		if (bio->bi_iter.bi_size == 0) {
-			/* an empty barrier - all done */
-			bio_endio(bio);
-			mempool_free(fi, mddev->flush_pool);
-		} else {
-			INIT_WORK(&fi->flush_work, submit_flushes);
-			queue_work(md_wq, &fi->flush_work);
-		}
+static void md_submit_flush_data(struct work_struct *ws)
+{
+	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
+	struct bio *bio = mddev->flush_bio;
+
+	/*
+	 * must reset flush_bio before calling into md_handle_request to avoid a
+	 * deadlock, because other bios passed md_handle_request suspend check
+	 * could wait for this and below md_handle_request could wait for those
+	 * bios because of suspend check
+	 */
+	mddev->flush_bio = NULL;
+	wake_up(&mddev->sb_wait);
+
+	if (bio->bi_iter.bi_size == 0) {
+		/* an empty barrier - all done */
+		bio_endio(bio);
+	} else {
+		bio->bi_opf &= ~REQ_PREFLUSH;
+		md_handle_request(mddev, bio);
 	}
 }
+
+void md_flush_request(struct mddev *mddev, struct bio *bio)
+{
+	spin_lock_irq(&mddev->lock);
+	wait_event_lock_irq(mddev->sb_wait,
+			    !mddev->flush_bio,
+			    mddev->lock);
+	mddev->flush_bio = bio;
+	spin_unlock_irq(&mddev->lock);
+
+	INIT_WORK(&mddev->flush_work, submit_flushes);
+	queue_work(md_wq, &mddev->flush_work);
+}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -560,6 +537,7 @@ void mddev_init(struct mddev *mddev)
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
 	spin_lock_init(&mddev->lock);
+	atomic_set(&mddev->flush_pending, 0);
 	init_waitqueue_head(&mddev->sb_wait);
 	init_waitqueue_head(&mddev->recovery_wait);
 	mddev->reshape_position = MaxSector;
@@ -5511,22 +5489,6 @@ int md_run(struct mddev *mddev)
 		if (err)
 			return err;
 	}
-	if (mddev->flush_pool == NULL) {
-		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
-						flush_info_free, mddev);
-		if (!mddev->flush_pool) {
-			err = -ENOMEM;
-			goto abort;
-		}
-	}
-	if (mddev->flush_bio_pool == NULL) {
-		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
-						flush_bio_free, mddev);
-		if (!mddev->flush_bio_pool) {
-			err = -ENOMEM;
-			goto abort;
-		}
-	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5686,11 +5648,8 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
-	mempool_destroy(mddev->flush_bio_pool);
-	mddev->flush_bio_pool = NULL;
-	mempool_destroy(mddev->flush_pool);
-	mddev->flush_pool = NULL;
-
+	bioset_exit(&mddev->bio_set);
+	bioset_exit(&mddev->sync_set);
 	return err;
 }
 EXPORT_SYMBOL_GPL(md_run);
@@ -5894,14 +5853,6 @@ static void __md_stop(struct mddev *mdde
 	mddev->to_remove = &md_redundancy_group;
 	module_put(pers->owner);
 	clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-	if (mddev->flush_bio_pool) {
-		mempool_destroy(mddev->flush_bio_pool);
-		mddev->flush_bio_pool = NULL;
-	}
-	if (mddev->flush_pool) {
-		mempool_destroy(mddev->flush_pool);
-		mddev->flush_pool = NULL;
-	}
 }
 
 void md_stop(struct mddev *mddev)
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,19 +252,6 @@ enum mddev_sb_flags {
 	MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
 };
 
-#define NR_FLUSH_INFOS 8
-#define NR_FLUSH_BIOS 64
-struct flush_info {
-	struct bio *bio;
-	struct mddev *mddev;
-	struct work_struct flush_work;
-	atomic_t flush_pending;
-};
-struct flush_bio {
-	struct flush_info *fi;
-	struct md_rdev *rdev;
-};
-
 struct mddev {
 	void *private;
 	struct md_personality *pers;
@@ -470,8 +457,13 @@ struct mddev {
 					 * metadata and bitmap writes
 					 */
 
-	mempool_t *flush_pool;
-	mempool_t *flush_bio_pool;
+	/* Generic flush handling.
+	 * The last to finish preflush schedules a worker to submit
+	 * the rest of the request (without the REQ_PREFLUSH flag).
+	 */
+	struct bio *flush_bio;
+	atomic_t flush_pending;
+	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info *cluster_info;