From 265bef01cb2f6201e79a80ffccb78b606ff60c33 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Sun, 9 May 2021 16:02:44 +0200
Subject: [PATCH] 5.4-stable patches

added patches:
      dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
      fuse-fix-write-deadlock.patch
      md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
---
 ...ast-raid4-5-6-table-reload-sequences.patch | 134 +++++++++++++++
 queue-5.4/fuse-fix-write-deadlock.patch       | 162 ++++++++++++++++++
 ...e-when-ending-a-failed-write-request.patch |  35 ++++
 queue-5.4/series                              |   3 +
 4 files changed, 334 insertions(+)
 create mode 100644 queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
 create mode 100644 queue-5.4/fuse-fix-write-deadlock.patch
 create mode 100644 queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch

diff --git a/queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch b/queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
new file mode 100644
index 00000000000..337106366ad
--- /dev/null
+++ b/queue-5.4/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
@@ -0,0 +1,134 @@
+From f99a8e4373eeacb279bc9696937a55adbff7a28a Mon Sep 17 00:00:00 2001
+From: Heinz Mauelshagen
+Date: Wed, 21 Apr 2021 23:32:36 +0200
+Subject: dm raid: fix inconclusive reshape layout on fast raid4/5/6 table reload sequences
+
+From: Heinz Mauelshagen
+
+commit f99a8e4373eeacb279bc9696937a55adbff7a28a upstream.
+
+If fast table reloads occur during an ongoing reshape of raid4/5/6
+devices, the target may race reading a superblock against the MD resync
+thread, causing an inconclusive reshape state to be read in its
+constructor.
+
+The lvm2 test lvconvert-raid-reshape-stripes-load-reload.sh can cause
+a BUG_ON() to trigger in md_run(), e.g.:
+"kernel BUG at drivers/md/raid5.c:7567!".
+
+Scenario triggering the bug:
+
+1. The MD sync thread calls end_reshape() from raid5_sync_request()
+   when done reshaping.  However, end_reshape() _only_ updates the
+   reshape position to MaxSector while keeping the changed layout
+   configuration (i.e. any delta disks, chunk sector or RAID
+   algorithm changes).  That inconclusive configuration is stored in
+   the superblock.
+
+2. dm-raid constructs a mapping, loading the inconsistent superblock
+   from step 1 before step 3 has finished resetting the reshape state
+   completely, and calls md_run(), which leads to the mentioned BUG_ON()
+   in raid5.c.
+
+3. The MD RAID personality's finish_reshape() is called, which resets
+   the reshape information on chunk sectors, delta disks, etc.  This
+   explains why the bug is rarely seen on multi-core machines: MD's
+   finish_reshape() superblock update races with the dm-raid
+   constructor's superblock load in step 2.
+
+The fix identifies inconclusive superblock content in the dm-raid
+constructor and resets it before calling md_run(), factoring the
+identifying checks out into rs_is_layout_change(), shared by the
+existing rs_reshape_requested() and the new
+rs_reset_inconclusive_reshape().  Also enhance a comment and remove an
+empty line.
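+
+In short, the constructor-side reset amounts to the following
+(a condensed sketch of the hunks below, no behavior beyond what the
+patch itself adds):
+
+	/* Catch layout changes left behind by a finished reshape. */
+	if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
+		rs_set_cur(rs);			/* keep the current layout */
+		rs->md.delta_disks = 0;
+		rs->md.reshape_backwards = 0;
+	}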
+
+Cc: stable@vger.kernel.org
+Signed-off-by: Heinz Mauelshagen
+Signed-off-by: Mike Snitzer
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/md/dm-raid.c | 34 ++++++++++++++++++++++++++++------
+ 1 file changed, 28 insertions(+), 6 deletions(-)
+
+--- a/drivers/md/dm-raid.c
++++ b/drivers/md/dm-raid.c
+@@ -1892,6 +1892,14 @@ static bool rs_takeover_requested(struct
+ 	return rs->md.new_level != rs->md.level;
+ }
+
++/* True if layout is set to reshape. */
++static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev)
++{
++	return (use_mddev ? rs->md.delta_disks : rs->delta_disks) ||
++	       rs->md.new_layout != rs->md.layout ||
++	       rs->md.new_chunk_sectors != rs->md.chunk_sectors;
++}
++
+ /* True if @rs is requested to reshape by ctr */
+ static bool rs_reshape_requested(struct raid_set *rs)
+ {
+@@ -1904,9 +1912,7 @@ static bool rs_reshape_requested(struct
+ 	if (rs_is_raid0(rs))
+ 		return false;
+
+-	change = mddev->new_layout != mddev->layout ||
+-		 mddev->new_chunk_sectors != mddev->chunk_sectors ||
+-		 rs->delta_disks;
++	change = rs_is_layout_change(rs, false);
+
+ 	/* Historical case to support raid1 reshape without delta disks */
+ 	if (rs_is_raid1(rs)) {
+@@ -2843,7 +2849,7 @@ static sector_t _get_reshape_sectors(str
+ }
+
+ /*
+- *
++ * Reshape:
+  * - change raid layout
+  * - change chunk size
+  * - add disks
+@@ -2953,6 +2959,20 @@ static int rs_setup_reshape(struct raid_
+ }
+
+ /*
++ * If the md resync thread has updated superblock with max reshape position
++ * at the end of a reshape but not (yet) reset the layout configuration
++ * changes -> reset the latter.
++ */
++static void rs_reset_inconclusive_reshape(struct raid_set *rs)
++{
++	if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) {
++		rs_set_cur(rs);
++		rs->md.delta_disks = 0;
++		rs->md.reshape_backwards = 0;
++	}
++}
++
++/*
+  * Enable/disable discard support on RAID set depending on
+  * RAID level and discard properties of underlying RAID members.
+  */
+@@ -3216,11 +3236,14 @@ static int raid_ctr(struct dm_target *ti
+ 	if (r)
+ 		goto bad;
+
++	/* Catch any inconclusive reshape superblock content. */
++	rs_reset_inconclusive_reshape(rs);
++
+ 	/* Start raid set read-only and assumed clean to change in raid_resume() */
+ 	rs->md.ro = 1;
+ 	rs->md.in_sync = 1;
+
+-	/* Keep array frozen */
++	/* Keep array frozen until resume. */
+ 	set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
+
+ 	/* Has to be held on running the array */
+@@ -3234,7 +3257,6 @@ static int raid_ctr(struct dm_target *ti
+ 	}
+
+ 	r = md_start(&rs->md);
+-
+ 	if (r) {
+ 		ti->error = "Failed to start raid array";
+ 		mddev_unlock(&rs->md);
diff --git a/queue-5.4/fuse-fix-write-deadlock.patch b/queue-5.4/fuse-fix-write-deadlock.patch
new file mode 100644
index 00000000000..181e4ad4e6b
--- /dev/null
+++ b/queue-5.4/fuse-fix-write-deadlock.patch
@@ -0,0 +1,162 @@
+From 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 Mon Sep 17 00:00:00 2001
+From: Vivek Goyal
+Date: Wed, 21 Oct 2020 16:12:49 -0400
+Subject: fuse: fix write deadlock
+
+From: Vivek Goyal
+
+commit 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 upstream.
+
+There are two modes for write(2) and friends in fuse:
+
+a) write through (update page cache, send sync WRITE request to
+   userspace)
+
+b) buffered write (update page cache, async writeout later)
+
+The write through method kept all the page cache pages locked that were
+used for the request.  Keeping more than one page locked is deadlock
+prone and Qian Cai demonstrated this with trinity fuzzing.
+
+The reason for keeping the pages locked is that concurrent mapped reads
+shouldn't try to pull possibly stale data into the page cache.
+
+For full page writes, the easy way to fix this is to make the cached
+page be the authoritative source by marking the page PG_uptodate
+immediately.  After this the page can be safely unlocked, since
+mapped/cached reads will take the written data from the cache.
+
+Concurrent mapped writes will now cause data in the original WRITE
+request to be updated; this however doesn't cause any data
+inconsistency, and this scenario should be exceedingly rare anyway.
+
+If the WRITE request returns with an error in the above case, currently
+the page is not marked uptodate; this means that a concurrent read will
+always read consistent data.  After this patch the page is uptodate
+between writing to the cache and receiving the error: there is a window
+where a cached read will read the wrong data.  While theoretically this
+could be a regression, it is unlikely to be one in practice, since this
+is normal for buffered writes.
+
+In case of a partial page write to an already uptodate page, the
+locking is also unnecessary, with the above caveats.
+
+A partial write of a not-uptodate page still needs to be handled.  One
+way would be to read the complete page before doing the write.  This is
+not possible, since it might break filesystems that don't expect any
+READ requests when the file was opened O_WRONLY.
+
+The other solution is to serialize the synchronous write with reads
+from the partial pages.  The easiest way to do this is to keep the
+partial pages locked.  The problem is that a write() may involve two
+such pages (one head and one tail).  This patch fixes it by only
+locking the partial tail page.  If there's a partial head page as well,
+then split that off as a separate WRITE request.
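+
+Schematically, the new per-page rule in fuse_fill_write_pages() is
+(a condensed sketch of the hunk below, no behavior beyond what the
+patch itself adds):
+
+	/* After copying user data into a page cache page: */
+	if (tmp == PAGE_SIZE)
+		SetPageUptodate(page);	/* full page: cache is authoritative */
+	if (PageUptodate(page))
+		unlock_page(page);	/* safe to unlock early */
+	else {
+		ia->write.page_locked = true;	/* partial tail stays locked */
+		break;		/* ends page collection for this request */
+	}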
+
+Reported-by: Qian Cai
+Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/
+Fixes: ea9b9907b82a ("fuse: implement perform_write")
+Cc: <stable@vger.kernel.org> # v2.6.26
+Signed-off-by: Vivek Goyal
+Signed-off-by: Miklos Szeredi
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/fuse/file.c   | 41 +++++++++++++++++++++++++++++------------
+ fs/fuse/fuse_i.h |  1 +
+ 2 files changed, 30 insertions(+), 12 deletions(-)
+
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -1108,6 +1108,7 @@ static ssize_t fuse_send_write_pages(str
+ 	struct fuse_file *ff = file->private_data;
+ 	struct fuse_conn *fc = ff->fc;
+ 	unsigned int offset, i;
++	bool short_write;
+ 	int err;
+
+ 	for (i = 0; i < ap->num_pages; i++)
+@@ -1120,32 +1121,38 @@
+ 	if (!err && ia->write.out.size > count)
+ 		err = -EIO;
+
++	short_write = ia->write.out.size < count;
+ 	offset = ap->descs[0].offset;
+ 	count = ia->write.out.size;
+ 	for (i = 0; i < ap->num_pages; i++) {
+ 		struct page *page = ap->pages[i];
+
+-		if (!err && !offset && count >= PAGE_SIZE)
+-			SetPageUptodate(page);
+-
+-		if (count > PAGE_SIZE - offset)
+-			count -= PAGE_SIZE - offset;
+-		else
+-			count = 0;
+-		offset = 0;
+-
+-		unlock_page(page);
++		if (err) {
++			ClearPageUptodate(page);
++		} else {
++			if (count >= PAGE_SIZE - offset)
++				count -= PAGE_SIZE - offset;
++			else {
++				if (short_write)
++					ClearPageUptodate(page);
++				count = 0;
++			}
++			offset = 0;
++		}
++		if (ia->write.page_locked && (i == ap->num_pages - 1))
++			unlock_page(page);
+ 		put_page(page);
+ 	}
+
+ 	return err;
+ }
+
+-static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
++static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia,
+ 				     struct address_space *mapping,
+ 				     struct iov_iter *ii, loff_t pos,
+ 				     unsigned int max_pages)
+ {
++	struct fuse_args_pages *ap = &ia->ap;
+ 	struct fuse_conn *fc = get_fuse_conn(mapping->host);
+ 	unsigned offset = pos & (PAGE_SIZE - 1);
+ 	size_t count = 0;
+@@ -1198,6 +1205,16 @@ static ssize_t fuse_fill_write_pages(str
+ 		if (offset == PAGE_SIZE)
+ 			offset = 0;
+
++		/* If we copied full page, mark it uptodate */
++		if (tmp == PAGE_SIZE)
++			SetPageUptodate(page);
++
++		if (PageUptodate(page)) {
++			unlock_page(page);
++		} else {
++			ia->write.page_locked = true;
++			break;
++		}
+ 		if (!fc->big_writes)
+ 			break;
+ 	} while (iov_iter_count(ii) && count < fc->max_write &&
+@@ -1241,7 +1258,7 @@ static ssize_t fuse_perform_write(struct
+ 			break;
+ 		}
+
+-		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
++		count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages);
+ 		if (count <= 0) {
+ 			err = count;
+ 		} else {
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -845,6 +845,7 @@ struct fuse_io_args {
+ 		struct {
+ 			struct fuse_write_in in;
+ 			struct fuse_write_out out;
++			bool page_locked;
+ 		} write;
+ 	};
+ 	struct fuse_args_pages ap;
diff --git a/queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch b/queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
new file mode 100644
index 00000000000..c4813b6dd7e
--- /dev/null
+++ b/queue-5.4/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
@@ -0,0 +1,35 @@
+From 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd Mon Sep 17 00:00:00 2001
+From: Paul Clements
+Date: Thu, 15 Apr 2021 17:17:57 -0400
+Subject: md/raid1: properly indicate failure when ending a failed write request
+
+From: Paul Clements
+
+commit 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd upstream.
+
+This patch addresses a data corruption bug in raid1 arrays using
+bitmaps.  Without this fix, the bitmap bits for the failed I/O end up
+being cleared.
+
+Since we are in the failure leg of raid1_end_write_request, the request
+either needs to be retried (R1BIO_WriteError) or failed
+(R1BIO_Degraded).
+
+Fixes: eeba6809d8d5 ("md/raid1: end bio when the device faulty")
+Cc: stable@vger.kernel.org # v5.2+
+Signed-off-by: Paul Clements
+Signed-off-by: Song Liu
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/md/raid1.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/md/raid1.c
++++ b/drivers/md/raid1.c
+@@ -458,6 +458,8 @@ static void raid1_end_write_request(stru
+ 		if (!test_bit(Faulty, &rdev->flags))
+ 			set_bit(R1BIO_WriteError, &r1_bio->state);
+ 		else {
++			/* Fail the request */
++			set_bit(R1BIO_Degraded, &r1_bio->state);
+ 			/* Finished with this branch */
+ 			r1_bio->bios[mirror] = NULL;
+ 			to_put = bio;
diff --git a/queue-5.4/series b/queue-5.4/series
index 99a7e529778..188d128df2f 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -151,3 +151,6 @@ intel_th-pci-add-alder-lake-m-support.patch
 tpm-efi-use-local-variable-for-calculating-final-log-size.patch
 tpm-vtpm_proxy-avoid-reading-host-log-when-using-a-virtual-device.patch
 crypto-rng-fix-crypto_rng_reset-refcounting-when-crypto_stats.patch
+md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch
+dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch
+fuse-fix-write-deadlock.patch
-- 
2.47.3