From 265bef01cb2f6201e79a80ffccb78b606ff60c33 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sun, 9 May 2021 16:02:44 +0200
Subject: [PATCH] 5.4-stable patches

added patches:
      dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
      fuse-fix-write-deadlock.patch
      md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
---
 ...ast-raid4-5-6-table-reload-sequences.patch | 134 +++++++++++++++
 queue-5.4/fuse-fix-write-deadlock.patch       | 162 ++++++++++++++++++
 ...e-when-ending-a-failed-write-request.patch |  35 ++++
 queue-5.4/series                              |   3 +
 4 files changed, 334 insertions(+)
 create mode 100644 queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
 create mode 100644 queue-5.4/fuse-fix-write-deadlock.patch
 create mode 100644 queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch

diff --git a/queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch b/queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
new file mode 100644
index 00000000000..337106366ad
--- /dev/null
+++ b/queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
@@ -0,0 +1,134 @@
+From f99a8e4373eeacb279bc9696937a55adbff7a28a Mon Sep 17 00:00:00 2001
+From: Heinz Mauelshagen
+Date: Wed, 21 Apr 2021 23:32:36 +0200
+Subject: dm raid: fix inconclusive reshape layout on fast raid4/5/6 table reload sequences
+
+From: Heinz Mauelshagen
+
+commit f99a8e4373eeacb279bc9696937a55adbff7a28a upstream.
+
+If fast table reloads occur during an ongoing reshape of raid4/5/6
+devices, the target may race reading a superblock against the MD resync
+thread, causing an inconclusive reshape state to be read in its
+constructor.
+
+The lvm2 test lvconvert-raid-reshape-stripes-load-reload.sh can cause
+a BUG_ON() to trigger in md_run(), e.g.:
+"kernel BUG at drivers/md/raid5.c:7567!".
+
+Scenario triggering the bug:
+
+1. The MD sync thread calls end_reshape() from raid5_sync_request()
+   when done reshaping.  However, end_reshape() _only_ updates the
+   reshape position to MaxSector while keeping the changed layout
+   configuration (i.e. any delta disks, chunk sector or RAID
+   algorithm changes).  That inconclusive configuration is stored in
+   the superblock.
+
+2. dm-raid constructs a mapping, loading the inconsistent superblock
+   from step 1 before step 3 has finished resetting the reshape state
+   completely, and calls md_run(), which leads to the mentioned BUG_ON()
+   in raid5.c.
+
+3. The MD RAID personality's finish_reshape() is called, which resets
+   the reshape information on chunk sectors, delta disks, etc.  This
+   explains why the bug is rarely seen on multi-core machines: MD's
+   finish_reshape() superblock update races with the dm-raid
+   constructor's superblock load in step 2.
+
+The fix identifies inconclusive superblock content in the dm-raid
+constructor and resets it before calling md_run(), factoring the
+identifying checks out into rs_is_layout_change(), shared by the
+existing rs_reshape_requested() and the new
+rs_reset_inconclusive_reshape().  Also enhance a comment and remove an
+empty line.
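+
+In short, the constructor-side reset amounts to the following
+(a condensed sketch of the hunks below, no behavior beyond what the
+patch itself adds):
+
+	/* Catch layout changes left behind by a finished reshape. */
+	if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
+		rs_set_cur(rs);			/* keep the current layout */
+		rs->md.delta_disks = 0;
+		rs->md.reshape_backwards = 0;
+	}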
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Heinz Mauelshagen
+Signed-off-by: Mike Snitzer
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/md/dm-raid.c | 34 ++++++++++++++++++++++++++++------
+ 1 file changed, 28 insertions(+), 6 deletions(-)
+
+--- a/drivers/md/dm-raid.c
++++ b/drivers/md/dm-raid.c
+@@ -1892,6 +1892,14 @@ static bool rs_takeover_requested(struct
+ 	return rs->md.new_level != rs->md.level;
+ }
+
++/* True if layout is set to reshape. */
++static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev)
++{
++	return (use_mddev ? rs->md.delta_disks : rs->delta_disks) ||
++	       rs->md.new_layout != rs->md.layout ||
++	       rs->md.new_chunk_sectors != rs->md.chunk_sectors;
++}
++
+ /* True if @rs is requested to reshape by ctr */
+ static bool rs_reshape_requested(struct raid_set *rs)
+ {
+@@ -1904,9 +1912,7 @@ static bool rs_reshape_requested(struct
+ 	if (rs_is_raid0(rs))
+ 		return false;
+
+-	change = mddev->new_layout != mddev->layout ||
+-		 mddev->new_chunk_sectors != mddev->chunk_sectors ||
+-		 rs->delta_disks;
++	change = rs_is_layout_change(rs, false);
+
+ 	/* Historical case to support raid1 reshape without delta disks */
+ 	if (rs_is_raid1(rs)) {
+@@ -2843,7 +2849,7 @@ static sector_t _get_reshape_sectors(str
+ }
+
+ /*
+- *
++ * Reshape:
+  * - change raid layout
+  * - change chunk size
+  * - add disks
+@@ -2953,6 +2959,20 @@ static int rs_setup_reshape(struct raid_
+ }
+
+ /*
++ * If the md resync thread has updated superblock with max reshape position
++ * at the end of a reshape but not (yet) reset the layout configuration
++ * changes -> reset the latter.
++ */
++static void rs_reset_inconclusive_reshape(struct raid_set *rs)
++{
++	if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
++		rs_set_cur(rs);
++		rs->md.delta_disks = 0;
++		rs->md.reshape_backwards = 0;
++	}
++}
++
++/*
+  * Enable/disable discard support on RAID set depending on
+  * RAID level and discard properties of underlying RAID members.
+  */
+@@ -3216,11 +3236,14 @@ static int raid_ctr(struct dm_target *ti
+ 	if (r)
+ 		goto bad;
+
++	/* Catch any inconclusive reshape superblock content. */
++	rs_reset_inconclusive_reshape(rs);
++
+ 	/* Start raid set read-only and assumed clean to change in raid_resume() */
+ 	rs->md.ro = 1;
+ 	rs->md.in_sync = 1;
+
+-	/* Keep array frozen */
++	/* Keep array frozen until resume. */
+ 	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
+
+ 	/* Has to be held on running the array */
+@@ -3234,7 +3257,6 @@ static int raid_ctr(struct dm_target *ti
+ 	}
+
+ 	r = md_start(&rs->md);
+-
+ 	if (r) {
+ 		ti->error = "Failed to start raid array";
+ 		mddev_unlock(&rs->md);
diff --git a/queue-5.4/fuse-fix-write-deadlock.patch b/queue-5.4/fuse-fix-write-deadlock.patch
new file mode 100644
index 00000000000..181e4ad4e6b
--- /dev/null
+++ b/queue-5.4/fuse-fix-write-deadlock.patch
@@ -0,0 +1,162 @@
+From 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 Mon Sep 17 00:00:00 2001
+From: Vivek Goyal
+Date: Wed, 21 Oct 2020 16:12:49 -0400
+Subject: fuse: fix write deadlock
+
+From: Vivek Goyal
+
+commit 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 upstream.
+
+There are two modes for write(2) and friends in fuse:
+
+a) write through (update page cache, send sync WRITE request to
+   userspace)
+
+b) buffered write (update page cache, async writeout later)
+
+The write through method kept all the page cache pages locked that were
+used for the request.  Keeping more than one page locked is deadlock
+prone and Qian Cai demonstrated this with trinity fuzzing.
+
+The reason for keeping the pages locked is that concurrent mapped reads
+shouldn't try to pull possibly stale data into the page cache.
+
+For full page writes, the easy way to fix this is to make the cached
+page be the authoritative source by marking the page PG_uptodate
+immediately.  After this the page can be safely unlocked, since
+mapped/cached reads will take the written data from the cache.
+
+Concurrent mapped writes will now cause data in the original WRITE
+request to be updated; this however doesn't cause any data
+inconsistency, and this scenario should be exceedingly rare anyway.
+
+If the WRITE request returns with an error in the above case, currently
+the page is not marked uptodate; this means that a concurrent read will
+always read consistent data.  After this patch the page is uptodate
+between writing to the cache and receiving the error: there is a window
+where a cached read will read the wrong data.  While theoretically this
+could be a regression, it is unlikely to be one in practice, since this
+is normal for buffered writes.
+
+In case of a partial page write to an already uptodate page, the
+locking is also unnecessary, with the above caveats.
+
+A partial write of a not-uptodate page still needs to be handled.  One
+way would be to read the complete page before doing the write.  This is
+not possible, since it might break filesystems that don't expect any
+READ requests when the file was opened O_WRONLY.
+
+The other solution is to serialize the synchronous write with reads
+from the partial pages.  The easiest way to do this is to keep the
+partial pages locked.  The problem is that a write() may involve two
+such pages (one head and one tail).  This patch fixes it by only
+locking the partial tail page.  If there's a partial head page as well,
+then split that off as a separate WRITE request.
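+
+Schematically, the new per-page rule in fuse_fill_write_pages() is
+(a condensed sketch of the hunk below, no behavior beyond what the
+patch itself adds):
+
+	/* After copying user data into a page cache page: */
+	if (tmp == PAGE_SIZE)
+		SetPageUptodate(page);	/* full page: cache is authoritative */
+	if (PageUptodate(page))
+		unlock_page(page);	/* safe to unlock early */
+	else {
+		ia->write.page_locked = true;	/* partial tail stays locked */
+		break;		/* ends page collection for this request */
+	}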
+
+Reported-by: Qian Cai
+Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/
+Fixes: ea9b9907b82a ("fuse: implement perform_write")
+Cc: <stable@vger.kernel.org> # v2.6.26
+Signed-off-by: Vivek Goyal
+Signed-off-by: Miklos Szeredi
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/fuse/file.c   | 41 +++++++++++++++++++++++++++++------------
+ fs/fuse/fuse_i.h |  1 +
+ 2 files changed, 30 insertions(+), 12 deletions(-)
+
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -1108,6 +1108,7 @@ static ssize_t fuse_send_write_pages(str
+ 	struct fuse_file *ff = file->private_data;
+ 	struct fuse_conn *fc = ff->fc;
+ 	unsigned int offset, i;
++	bool short_write;
+ 	int err;
+
+ 	for (i = 0; i < ap->num_pages; i++)
+@@ -1120,32 +1121,38 @@
+ 	if (!err && ia->write.out.size > count)
+ 		err = -EIO;
+
++	short_write = ia->write.out.size < count;
+ 	offset = ap->descs[0].offset;
+ 	count = ia->write.out.size;
+ 	for (i = 0; i < ap->num_pages; i++) {
+ 		struct page *page = ap->pages[i];
+
+-		if (!err && !offset && count >= PAGE_SIZE)
+-			SetPageUptodate(page);
+-
+-		if (count > PAGE_SIZE - offset)
+-			count -= PAGE_SIZE - offset;
+-		else
+-			count = 0;
+-		offset = 0;
+-
+-		unlock_page(page);
++		if (err) {
++			ClearPageUptodate(page);
++		} else {
++			if (count >= PAGE_SIZE - offset)
++				count -= PAGE_SIZE - offset;
++			else {
++				if (short_write)
++					ClearPageUptodate(page);
++				count = 0;
++			}
++			offset = 0;
++		}
++		if (ia->write.page_locked && (i == ap->num_pages - 1))
++			unlock_page(page);
+ 		put_page(page);
+ 	}
+
+ 	return err;
+ }
+
+-static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
++static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
+ 				     struct address_space *mapping,
+ 				     struct iov_iter *ii, loff_t pos,
+ 				     unsigned int max_pages)
+ {
++	struct fuse_args_pages *ap = &ia->ap;
+ 	struct fuse_conn *fc = get_fuse_conn(mapping->host);
+ 	unsigned offset = pos & (PAGE_SIZE - 1);
+ 	size_t count = 0;
+@@ -1198,6 +1205,16 @@ static ssize_t fuse_fill_write_pages(str
+ 		if (offset == PAGE_SIZE)
+ 			offset = 0;
+
++		/* If we copied full page, mark it uptodate */
++		if (tmp == PAGE_SIZE)
++			SetPageUptodate(page);
++
++		if (PageUptodate(page)) {
++			unlock_page(page);
++		} else {
++			ia->write.page_locked = true;
++			break;
++		}
+ 		if (!fc->big_writes)
+ 			break;
+ 	} while (iov_iter_count(ii) && count < fc->max_write &&
+@@ -1241,7 +1258,7 @@ static ssize_t fuse_perform_write(struct
+ 			break;
+ 		}
+
+-		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
++		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
+ 		if (count <= 0) {
+ 			err = count;
+ 		} else {
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -845,6 +845,7 @@ struct fuse_io_args {
+ 		struct {
+ 			struct fuse_write_in in;
+ 			struct fuse_write_out out;
++			bool page_locked;
+ 		} write;
+ 	};
+ 	struct fuse_args_pages ap;
diff --git a/queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch b/queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
new file mode 100644
index 00000000000..c4813b6dd7e
--- /dev/null
+++ b/queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
@@ -0,0 +1,35 @@
+From 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd Mon Sep 17 00:00:00 2001
+From: Paul Clements
+Date: Thu, 15 Apr 2021 17:17:57 -0400
+Subject: md/raid1: properly indicate failure when ending a failed write request
+
+From: Paul Clements
+
+commit 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd upstream.
+
+This patch addresses a data corruption bug in raid1 arrays using
+bitmaps.  Without this fix, the bitmap bits for the failed I/O end up
+being cleared.
+
+Since we are in the failure leg of raid1_end_write_request, the request
+either needs to be retried (R1BIO_WriteError) or failed
+(R1BIO_Degraded).
+
+Fixes: eeba6809d8d5 ("md/raid1: end bio when the device faulty")
+Cc: stable@vger.kernel.org # v5.2+
+Signed-off-by: Paul Clements
+Signed-off-by: Song Liu
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/md/raid1.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -458,6 +458,8 @@ static void raid1_end_write_request(stru
+ 		if (!test_bit(Faulty, &rdev->flags))
+ 			set_bit(R1BIO_WriteError, &r1_bio->state);
+ 		else {
++			/* Fail the request */
++			set_bit(R1BIO_Degraded, &r1_bio->state);
+ 			/* Finished with this branch */
+ 			r1_bio->bios[mirror] = NULL;
+ 			to_put = bio;
diff --git a/queue-5.4/series b/queue-5.4/series
index 99a7e529778..188d128df2f 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -151,3 +151,6 @@ intel_th-pci-add-alder-lake-m-support.patch
 tpm-efi-use-local-variable-for-calculating-final-log-size.patch
 tpm-vtpm_proxy-avoid-reading-host-log-when-using-a-virtual-device.patch
 crypto-rng-fix-crypto_rng_reset-refcounting-when-crypto_stats.patch
+md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
+dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
+fuse-fix-write-deadlock.patch
-- 
2.47.3