From cd1f8b2d6c3108036b145c09f41259ef67a80f66 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Date: Fri, 15 Jul 2022 16:34:22 +0200
Subject: [PATCH] 5.15-stable patches

added patches:
	xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
	xfs-drop-async-cache-flushes-from-cil-commits.patch
	xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
	xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
---
 queue-5.15/series                             |   4 +
 ...locks-when-reserving-free-block-pool.patch |  97 +++++
 ...async-cache-flushes-from-cil-commits.patch | 361 ++++++++++++++++++
 ...overy-when-there-are-no-live-extents.patch | 176 +++++++++
 ...ers-in-xlog_state_shutdown_callbacks.patch | 171 +++++++++
 5 files changed, 809 insertions(+)
 create mode 100644 queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
 create mode 100644 queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch
 create mode 100644 queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
 create mode 100644 queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch

diff --git a/queue-5.15/series b/queue-5.15/series
index 8cc9026057c..673de22c000 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -23,3 +23,7 @@ fs-remap-constrain-dedupe-of-eof-blocks.patch
 nilfs2-fix-incorrect-masking-of-permission-flags-for-symlinks.patch
 sh-convert-nommu-io-re-un-map-to-static-inline-functions.patch
 revert-evm-fix-memleak-in-init_desc.patch
+xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
+xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
+xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
+xfs-drop-async-cache-flushes-from-cil-commits.patch
diff --git a/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch b/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
new file mode 100644
index 00000000000..a531a8ed5aa
--- /dev/null
+++ b/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
@@ -0,0 +1,97 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:40 -0700
+Subject: xfs: don't include bnobt blocks when reserving free block pool
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Brian Foster <bfoster@redhat.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-3-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit c8c568259772751a14e969b7230990508de73d9d ]
+
+xfs_reserve_blocks controls the size of the user-visible free space
+reserve pool.  Given the difference between the current and requested
+pool sizes, it will try to reserve free space from fdblocks.  However,
+the amount requested from fdblocks is also constrained by the amount of
+space that we think xfs_mod_fdblocks will give us.  If we forget to
+subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will will
+return ENOSPC and we'll hang the kernel at mount due to the infinite
+loop.
+
+In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand
+out the "free space" used by the free space btrees, because some portion
+of the free space btrees hold in reserve space for future btree
+expansion.  Unfortunately, xfs_reserve_blocks' estimation of the number
+of blocks that it could request from xfs_mod_fdblocks was not updated to
+include m_allocbt_blks, so if space is extremely low, the caller hangs.
+
+Fix this by creating a function to estimate the number of blocks that
+can be reserved from fdblocks, which needs to exclude the set-aside and
+m_allocbt_blks.
+
+Found by running xfs/306 (which formats a single-AG 20MB filesystem)
+with an fstests configuration that specifies a 1k blocksize and a
+specially crafted log size that will consume 7/8 of the space (17920
+blocks, specifically) in that AG.
+
+Cc: Brian Foster <bfoster@redhat.com>
+Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c |    2 +-
+ fs/xfs/xfs_mount.c |    2 +-
+ fs/xfs/xfs_mount.h |   15 +++++++++++++++
+ 3 files changed, 17 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -434,7 +434,7 @@ xfs_reserve_blocks(
+ 	error = -ENOSPC;
+ 	do {
+ 		free = percpu_counter_sum(&mp->m_fdblocks) -
+-						mp->m_alloc_set_aside;
++						xfs_fdblocks_unavailable(mp);
+ 		if (free <= 0)
+ 			break;
+ 
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1132,7 +1132,7 @@ xfs_mod_fdblocks(
+ 	 * problems (i.e. transaction abort, pagecache discards, etc.) than
+ 	 * slightly premature -ENOSPC.
+ 	 */
+-	set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
++	set_aside = xfs_fdblocks_unavailable(mp);
+ 	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
+ 	if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+ 				     XFS_FDBLOCKS_BATCH) >= 0) {
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -478,6 +478,21 @@ extern void	xfs_unmountfs(xfs_mount_t *)
+  */
+ #define XFS_FDBLOCKS_BATCH	1024
+ 
++/*
++ * Estimate the amount of free space that is not available to userspace and is
++ * not explicitly reserved from the incore fdblocks.  This includes:
++ *
++ * - The minimum number of blocks needed to support splitting a bmap btree
++ * - The blocks currently in use by the freespace btrees because they record
++ *   the actual blocks that will fill per-AG metadata space reservations
++ */
++static inline uint64_t
++xfs_fdblocks_unavailable(
++	struct xfs_mount	*mp)
++{
++	return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
++}
++
+ extern int	xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+ 				 bool reserved);
+ extern int	xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
diff --git a/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch b/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch
new file mode 100644
index 00000000000..d6df38d61c1
--- /dev/null
+++ b/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch
@@ -0,0 +1,361 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:42 -0700
+Subject: xfs: drop async cache flushes from CIL commits.
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: Dave Chinner <dchinner@redhat.com>, Jan Kara <jack@suse.cz>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-5-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 919edbadebe17a67193533f531c2920c03e40fa4 ]
+
+Jan Kara reported a performance regression in dbench that he
+bisected down to commit bad77c375e8d ("xfs: CIL checkpoint
+flushes caches unconditionally").
+
+Whilst developing the journal flush/fua optimisations this cache was
+part of, it appeared to made a significant difference to
+performance. However, now that this patchset has settled and all the
+correctness issues fixed, there does not appear to be any
+significant performance benefit to asynchronous cache flushes.
+
+In fact, the opposite is true on some storage types and workloads,
+where additional cache flushes that can occur from fsync heavy
+workloads have measurable and significant impact on overall
+throughput.
+
+Local dbench testing shows little difference on dbench runs with
+sync vs async cache flushes on either fast or slow SSD storage, and
+no difference in streaming concurrent async transaction workloads
+like fs-mark.
+
+Fast NVME storage.
+
+>From `dbench -t 30`, CIL scale:
+
+clients		async			sync
+		BW	Latency		BW	Latency
+1		 935.18   0.855		 915.64   0.903
+8		2404.51   6.873		2341.77   6.511
+16		3003.42   6.460		2931.57   6.529
+32		3697.23   7.939		3596.28   7.894
+128		7237.43  15.495		7217.74  11.588
+512		5079.24  90.587		5167.08  95.822
+
+fsmark, 32 threads, create w/ 64 byte xattr w/32k logbsize
+
+	create		chown		unlink
+async   1m41s		1m16s		2m03s
+sync	1m40s		1m19s		1m54s
+
+Slower SATA SSD storage:
+
+>From `dbench -t 30`, CIL scale:
+
+clients		async			sync
+		BW	Latency		BW	Latency
+1		  78.59  15.792		  83.78  10.729
+8		 367.88  92.067		 404.63  59.943
+16		 564.51  72.524		 602.71  76.089
+32		 831.66 105.984		 870.26 110.482
+128		1659.76 102.969		1624.73  91.356
+512		2135.91 223.054		2603.07 161.160
+
+fsmark, 16 threads, create w/32k logbsize
+
+	create		unlink
+async   5m06s		4m15s
+sync	5m00s		4m22s
+
+And on Jan's test machine:
+
+                   5.18-rc8-vanilla       5.18-rc8-patched
+Amean     1        71.22 (   0.00%)       64.94 *   8.81%*
+Amean     2        93.03 (   0.00%)       84.80 *   8.85%*
+Amean     4       150.54 (   0.00%)      137.51 *   8.66%*
+Amean     8       252.53 (   0.00%)      242.24 *   4.08%*
+Amean     16      454.13 (   0.00%)      439.08 *   3.31%*
+Amean     32      835.24 (   0.00%)      829.74 *   0.66%*
+Amean     64     1740.59 (   0.00%)     1686.73 *   3.09%*
+
+Performance and cache flush behaviour is restored to pre-regression
+levels.
+
+As such, we can now consider the async cache flush mechanism an
+unnecessary exercise in premature optimisation and hence we can
+now remove it and the infrastructure it requires completely.
+
+Fixes: bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally")
+Reported-and-tested-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bio_io.c   |   35 -----------------------------------
+ fs/xfs/xfs_linux.h    |    2 --
+ fs/xfs/xfs_log.c      |   36 +++++++++++-------------------------
+ fs/xfs/xfs_log_cil.c  |   42 +++++++++++++-----------------------------
+ fs/xfs/xfs_log_priv.h |    3 +--
+ 5 files changed, 25 insertions(+), 93 deletions(-)
+
+--- a/fs/xfs/xfs_bio_io.c
++++ b/fs/xfs/xfs_bio_io.c
+@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(
+ 	return bio_max_segs(howmany(count, PAGE_SIZE));
+ }
+ 
+-static void
+-xfs_flush_bdev_async_endio(
+-	struct bio	*bio)
+-{
+-	complete(bio->bi_private);
+-}
+-
+-/*
+- * Submit a request for an async cache flush to run. If the request queue does
+- * not require flush operations, just skip it altogether. If the caller needs
+- * to wait for the flush completion at a later point in time, they must supply a
+- * valid completion. This will be signalled when the flush completes.  The
+- * caller never sees the bio that is issued here.
+- */
+-void
+-xfs_flush_bdev_async(
+-	struct bio		*bio,
+-	struct block_device	*bdev,
+-	struct completion	*done)
+-{
+-	struct request_queue	*q = bdev->bd_disk->queue;
+-
+-	if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+-		complete(done);
+-		return;
+-	}
+-
+-	bio_init(bio, NULL, 0);
+-	bio_set_dev(bio, bdev);
+-	bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+-	bio->bi_private = done;
+-	bio->bi_end_io = xfs_flush_bdev_async_endio;
+-
+-	submit_bio(bio);
+-}
+ int
+ xfs_rw_bdev(
+ 	struct block_device	*bdev,
+--- a/fs/xfs/xfs_linux.h
++++ b/fs/xfs/xfs_linux.h
+@@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64
+ 
+ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+ 		char *data, unsigned int op);
+-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
+-		struct completion *done);
+ 
+ #define ASSERT_ALWAYS(expr)	\
+ 	(likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -527,12 +527,6 @@ xlog_state_shutdown_callbacks(
+  * Flush iclog to disk if this is the last reference to the given iclog and the
+  * it is in the WANT_SYNC state.
+  *
+- * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+- * does not match, there may be metadata on disk that must be persisted before
+- * this iclog is written.  To satisfy that requirement, set the
+- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+- * log tail value.
+- *
+  * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+  * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+  * written to stable storage, and implies that a commit record is contained
+@@ -549,12 +543,10 @@ xlog_state_shutdown_callbacks(
+  * always capture the tail lsn on the iclog on the first NEED_FUA release
+  * regardless of the number of active reference counts on this iclog.
+  */
+-
+ int
+ xlog_state_release_iclog(
+ 	struct xlog		*log,
+-	struct xlog_in_core	*iclog,
+-	xfs_lsn_t		old_tail_lsn)
++	struct xlog_in_core	*iclog)
+ {
+ 	xfs_lsn_t		tail_lsn;
+ 	bool			last_ref;
+@@ -565,18 +557,14 @@ xlog_state_release_iclog(
+ 	/*
+ 	 * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ 	 * of the tail LSN into the iclog so we guarantee that the log tail does
+-	 * not move between deciding if a cache flush is required and writing
+-	 * the LSN into the iclog below.
++	 * not move between the first time we know that the iclog needs to be
++	 * made stable and when we eventually submit it.
+ 	 */
+-	if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
++	if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
++	     (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
++	    !iclog->ic_header.h_tail_lsn) {
+ 		tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+-
+-		if (old_tail_lsn && tail_lsn != old_tail_lsn)
+-			iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+-
+-		if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+-		    !iclog->ic_header.h_tail_lsn)
+-			iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
++		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ 	}
+ 
+ 	last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
+@@ -601,8 +589,6 @@ xlog_state_release_iclog(
+ 	}
+ 
+ 	iclog->ic_state = XLOG_STATE_SYNCING;
+-	if (!iclog->ic_header.h_tail_lsn)
+-		iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ 	xlog_verify_tail_lsn(log, iclog);
+ 	trace_xlog_iclog_syncing(iclog, _RET_IP_);
+ 
+@@ -875,7 +861,7 @@ xlog_force_iclog(
+ 	iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ 	if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ 		xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+-	return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
++	return xlog_state_release_iclog(iclog->ic_log, iclog);
+ }
+ 
+ /*
+@@ -2413,7 +2399,7 @@ xlog_write_copy_finish(
+ 		ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ 			xlog_is_shutdown(log));
+ release_iclog:
+-	error = xlog_state_release_iclog(log, iclog, 0);
++	error = xlog_state_release_iclog(log, iclog);
+ 	spin_unlock(&log->l_icloglock);
+ 	return error;
+ }
+@@ -2630,7 +2616,7 @@ next_lv:
+ 
+ 	spin_lock(&log->l_icloglock);
+ 	xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+-	error = xlog_state_release_iclog(log, iclog, 0);
++	error = xlog_state_release_iclog(log, iclog);
+ 	spin_unlock(&log->l_icloglock);
+ 
+ 	return error;
+@@ -3054,7 +3040,7 @@ restart:
+ 		 * reference to the iclog.
+ 		 */
+ 		if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
+-			error = xlog_state_release_iclog(log, iclog, 0);
++			error = xlog_state_release_iclog(log, iclog);
+ 		spin_unlock(&log->l_icloglock);
+ 		if (error)
+ 			return error;
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -681,11 +681,21 @@ xlog_cil_set_ctx_write_state(
+ 		 * The LSN we need to pass to the log items on transaction
+ 		 * commit is the LSN reported by the first log vector write, not
+ 		 * the commit lsn. If we use the commit record lsn then we can
+-		 * move the tail beyond the grant write head.
++		 * move the grant write head beyond the tail LSN and overwrite
++		 * it.
+ 		 */
+ 		ctx->start_lsn = lsn;
+ 		wake_up_all(&cil->xc_start_wait);
+ 		spin_unlock(&cil->xc_push_lock);
++
++		/*
++		 * Make sure the metadata we are about to overwrite in the log
++		 * has been flushed to stable storage before this iclog is
++		 * issued.
++		 */
++		spin_lock(&cil->xc_log->l_icloglock);
++		iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
++		spin_unlock(&cil->xc_log->l_icloglock);
+ 		return;
+ 	}
+ 
+@@ -864,10 +874,7 @@ xlog_cil_push_work(
+ 	struct xfs_trans_header thdr;
+ 	struct xfs_log_iovec	lhdr;
+ 	struct xfs_log_vec	lvhdr = { NULL };
+-	xfs_lsn_t		preflush_tail_lsn;
+ 	xfs_csn_t		push_seq;
+-	struct bio		bio;
+-	DECLARE_COMPLETION_ONSTACK(bdev_flush);
+ 	bool			push_commit_stable;
+ 
+ 	new_ctx = xlog_cil_ctx_alloc();
+@@ -938,23 +945,6 @@ xlog_cil_push_work(
+ 	spin_unlock(&cil->xc_push_lock);
+ 
+ 	/*
+-	 * The CIL is stable at this point - nothing new will be added to it
+-	 * because we hold the flush lock exclusively. Hence we can now issue
+-	 * a cache flush to ensure all the completed metadata in the journal we
+-	 * are about to overwrite is on stable storage.
+-	 *
+-	 * Because we are issuing this cache flush before we've written the
+-	 * tail lsn to the iclog, we can have metadata IO completions move the
+-	 * tail forwards between the completion of this flush and the iclog
+-	 * being written. In this case, we need to re-issue the cache flush
+-	 * before the iclog write. To detect whether the log tail moves, sample
+-	 * the tail LSN *before* we issue the flush.
+-	 */
+-	preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
+-	xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
+-				&bdev_flush);
+-
+-	/*
+ 	 * Pull all the log vectors off the items in the CIL, and remove the
+ 	 * items from the CIL. We don't need the CIL lock here because it's only
+ 	 * needed on the transaction commit side which is currently locked out
+@@ -1030,12 +1020,6 @@ xlog_cil_push_work(
+ 	lvhdr.lv_iovecp = &lhdr;
+ 	lvhdr.lv_next = ctx->lv_chain;
+ 
+-	/*
+-	 * Before we format and submit the first iclog, we have to ensure that
+-	 * the metadata writeback ordering cache flush is complete.
+-	 */
+-	wait_for_completion(&bdev_flush);
+-
+ 	error = xlog_cil_write_chain(ctx, &lvhdr);
+ 	if (error)
+ 		goto out_abort_free_ticket;
+@@ -1094,7 +1078,7 @@ xlog_cil_push_work(
+ 	if (push_commit_stable &&
+ 	    ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
+ 		xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
+-	xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
++	xlog_state_release_iclog(log, ctx->commit_iclog);
+ 
+ 	/* Not safe to reference ctx now! */
+ 
+@@ -1115,7 +1099,7 @@ out_abort_free_ticket:
+ 		return;
+ 	}
+ 	spin_lock(&log->l_icloglock);
+-	xlog_state_release_iclog(log, ctx->commit_iclog, 0);
++	xlog_state_release_iclog(log, ctx->commit_iclog);
+ 	/* Not safe to reference ctx now! */
+ 	spin_unlock(&log->l_icloglock);
+ }
+--- a/fs/xfs/xfs_log_priv.h
++++ b/fs/xfs/xfs_log_priv.h
+@@ -524,8 +524,7 @@ void	xfs_log_ticket_regrant(struct xlog
+ 
+ void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
+ 		int eventual_size);
+-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+-		xfs_lsn_t log_tail_lsn);
++int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+ 
+ /*
+  * When we crack an atomic LSN, we sample it first so that the value will not
diff --git a/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch b/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
new file mode 100644
index 00000000000..3bf777c3d20
--- /dev/null
+++ b/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
@@ -0,0 +1,176 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:39 -0700
+Subject: xfs: only run COW extent recovery when there are no live extents
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandan.babu@oracle.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-2-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 7993f1a431bc5271369d359941485a9340658ac3 ]
+
+As part of multiple customer escalations due to file data corruption
+after copy on write operations, I wrote some fstests that use fsstress
+to hammer on COW to shake things loose.  Regrettably, I caught some
+filesystem shutdowns due to incorrect rmap operations with the following
+loop:
+
+mount <filesystem>				# (0)
+fsstress <run only readonly ops> &		# (1)
+while true; do
+	fsstress <run all ops>
+	mount -o remount,ro			# (2)
+	fsstress <run only readonly ops>
+	mount -o remount,rw			# (3)
+done
+
+When (2) happens, notice that (1) is still running.  xfs_remount_ro will
+call xfs_blockgc_stop to walk the inode cache to free all the COW
+extents, but the blockgc mechanism races with (1)'s reader threads to
+take IOLOCKs and loses, which means that it doesn't clean them all out.
+Call such a file (A).
+
+When (3) happens, xfs_remount_rw calls xfs_reflink_recover_cow, which
+walks the ondisk refcount btree and frees any COW extent that it finds.
+This function does not check the inode cache, which means that incore
+COW forks of inode (A) is now inconsistent with the ondisk metadata.  If
+one of those former COW extents are allocated and mapped into another
+file (B) and someone triggers a COW to the stale reservation in (A), A's
+dirty data will be written into (B) and once that's done, those blocks
+will be transferred to (A)'s data fork without bumping the refcount.
+
+The results are catastrophic -- file (B) and the refcount btree are now
+corrupt.  In the first patch, we fixed the race condition in (2) so that
+(A) will always flush the COW fork.  In this second patch, we move the
+_recover_cow call to the initial mount call in (0) for safety.
+
+As mentioned previously, xfs_reflink_recover_cow walks the refcount
+btree looking for COW staging extents, and frees them.  This was
+intended to be run at mount time (when we know there are no live inodes)
+to clean up any leftover staging events that may have been left behind
+during an unclean shutdown.  As a time "optimization" for readonly
+mounts, we deferred this to the ro->rw transition, not realizing that
+any failure to clean all COW forks during a rw->ro transition would
+result in catastrophic corruption.
+
+Therefore, remove this optimization and only run the recovery routine
+when we're guaranteed not to have any COW staging extents anywhere,
+which means we always run this at mount time.  While we're at it, move
+the callsite to xfs_log_mount_finish because any refcount btree
+expansion (however unlikely given that we're removing records from the
+right side of the index) must be fed by a per-AG reservation, which
+doesn't exist in its current location.
+
+Fixes: 174edb0e46e5 ("xfs: store in-progress CoW allocations in the refcount btree")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Chandan Babu R <chandan.babu@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |   24 +++++++++++++++++++++++-
+ fs/xfs/xfs_mount.c       |   10 ----------
+ fs/xfs/xfs_reflink.c     |    5 ++++-
+ fs/xfs/xfs_super.c       |    9 ---------
+ 4 files changed, 27 insertions(+), 21 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -27,7 +27,7 @@
+ #include "xfs_buf_item.h"
+ #include "xfs_ag.h"
+ #include "xfs_quota.h"
+-
++#include "xfs_reflink.h"
+ 
+ #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
+ 
+@@ -3502,6 +3502,28 @@ xlog_recover_finish(
+ 
+ 	xlog_recover_process_iunlinks(log);
+ 	xlog_recover_check_summary(log);
++
++	/*
++	 * Recover any CoW staging blocks that are still referenced by the
++	 * ondisk refcount metadata.  During mount there cannot be any live
++	 * staging extents as we have not permitted any user modifications.
++	 * Therefore, it is safe to free them all right now, even on a
++	 * read-only mount.
++	 */
++	error = xfs_reflink_recover_cow(log->l_mp);
++	if (error) {
++		xfs_alert(log->l_mp,
++	"Failed to recover leftover CoW staging extents, err %d.",
++				error);
++		/*
++		 * If we get an error here, make sure the log is shut down
++		 * but return zero so that any log items committed since the
++		 * end of intents processing can be pushed through the CIL
++		 * and AIL.
++		 */
++		xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
++	}
++
+ 	return 0;
+ }
+ 
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -922,15 +922,6 @@ xfs_mountfs(
+ 			xfs_warn(mp,
+ 	"Unable to allocate reserve blocks. Continuing without reserve pool.");
+ 
+-		/* Recover any CoW blocks that never got remapped. */
+-		error = xfs_reflink_recover_cow(mp);
+-		if (error) {
+-			xfs_err(mp,
+-	"Error %d recovering leftover CoW allocations.", error);
+-			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+-			goto out_quota;
+-		}
+-
+ 		/* Reserve AG blocks for future btree expansion. */
+ 		error = xfs_fs_reserve_ag_blocks(mp);
+ 		if (error && error != -ENOSPC)
+@@ -941,7 +932,6 @@ xfs_mountfs(
+ 
+  out_agresv:
+ 	xfs_fs_unreserve_ag_blocks(mp);
+- out_quota:
+ 	xfs_qm_unmount_quotas(mp);
+  out_rtunmount:
+ 	xfs_rtunmount_inodes(mp);
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -749,7 +749,10 @@ xfs_reflink_end_cow(
+ }
+ 
+ /*
+- * Free leftover CoW reservations that didn't get cleaned out.
++ * Free all CoW staging blocks that are still referenced by the ondisk refcount
++ * metadata.  The ondisk metadata does not track which inode created the
++ * staging extent, so callers must ensure that there are no cached inodes with
++ * live CoW staging extents.
+  */
+ int
+ xfs_reflink_recover_cow(
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1742,15 +1742,6 @@ xfs_remount_rw(
+ 	 */
+ 	xfs_restore_resvblks(mp);
+ 	xfs_log_work_queue(mp);
+-
+-	/* Recover any CoW blocks that never got remapped. */
+-	error = xfs_reflink_recover_cow(mp);
+-	if (error) {
+-		xfs_err(mp,
+-			"Error %d recovering leftover CoW allocations.", error);
+-		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+-		return error;
+-	}
+ 	xfs_blockgc_start(mp);
+ 
+ 	/* Create the per-AG metadata reservation pool .*/
diff --git a/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch b/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
new file mode 100644
index 00000000000..e87a0adea11
--- /dev/null
+++ b/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
@@ -0,0 +1,171 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:41 -0700
+Subject: xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: Dave Chinner <dchinner@redhat.com>, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-4-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit cd6f79d1fb324968a3bae92f82eeb7d28ca1fd22 ]
+
+Brian reported a null pointer dereference failure during unmount in
+xfs/006. He tracked the problem down to the AIL being torn down
+before a log shutdown had completed and removed all the items from
+the AIL. The failure occurred in this path while unmount was
+proceeding in another task:
+
+ xfs_trans_ail_delete+0x102/0x130 [xfs]
+ xfs_buf_item_done+0x22/0x30 [xfs]
+ xfs_buf_ioend+0x73/0x4d0 [xfs]
+ xfs_trans_committed_bulk+0x17e/0x2f0 [xfs]
+ xlog_cil_committed+0x2a9/0x300 [xfs]
+ xlog_cil_process_committed+0x69/0x80 [xfs]
+ xlog_state_shutdown_callbacks+0xce/0xf0 [xfs]
+ xlog_force_shutdown+0xdf/0x150 [xfs]
+ xfs_do_force_shutdown+0x5f/0x150 [xfs]
+ xlog_ioend_work+0x71/0x80 [xfs]
+ process_one_work+0x1c5/0x390
+ worker_thread+0x30/0x350
+ kthread+0xd7/0x100
+ ret_from_fork+0x1f/0x30
+
+This is processing an EIO error to a log write, and it's
+triggering a force shutdown. This causes the log to be shut down,
+and then it is running attached iclog callbacks from the shutdown
+context. That means the fs and log has already been marked as
+xfs_is_shutdown/xlog_is_shutdown and so high level code will abort
+(e.g. xfs_trans_commit(), xfs_log_force(), etc) with an error
+because of shutdown.
+
+The umount would have been blocked waiting for a log force
+completion inside xfs_log_cover() -> xfs_sync_sb(). The first thing
+for this situation to occur is for xfs_sync_sb() to exit without
+waiting for the iclog buffer to be comitted to disk. The
+above trace is the completion routine for the iclog buffer, and
+it is shutting down the filesystem.
+
+xlog_state_shutdown_callbacks() does this:
+
+{
+        struct xlog_in_core     *iclog;
+        LIST_HEAD(cb_list);
+
+        spin_lock(&log->l_icloglock);
+        iclog = log->l_iclog;
+        do {
+                if (atomic_read(&iclog->ic_refcnt)) {
+                        /* Reference holder will re-run iclog callbacks. */
+                        continue;
+                }
+                list_splice_init(&iclog->ic_callbacks, &cb_list);
+>>>>>>           wake_up_all(&iclog->ic_write_wait);
+>>>>>>           wake_up_all(&iclog->ic_force_wait);
+        } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+        wake_up_all(&log->l_flush_wait);
+        spin_unlock(&log->l_icloglock);
+
+>>>>>>  xlog_cil_process_committed(&cb_list);
+}
+
+This wakes any thread waiting on IO completion of the iclog (in this
+case the umount log force) before shutdown processes all the pending
+callbacks.  That means the xfs_sync_sb() waiting on a sync
+transaction in xfs_log_force() on iclog->ic_force_wait will get
+woken before the callbacks attached to that iclog are run. This
+results in xfs_sync_sb() returning an error, and so unmount unblocks
+and continues to run whilst the log shutdown is still in progress.
+
+Normally this is just fine because the force waiter has nothing to
+do with AIL operations. But in the case of this unmount path, the
+log force waiter goes on to tear down the AIL because the log is now
+shut down and so nothing ever blocks it again from the wait point in
+xfs_log_cover().
+
+Hence it's a race to see who gets to the AIL first - the unmount
+code or xlog_cil_process_committed() killing the superblock buffer.
+
+To fix this, we just have to change the order of processing in
+xlog_state_shutdown_callbacks() to run the callbacks before it wakes
+any task waiting on completion of the iclog.
+
+Reported-by: Brian Foster <bfoster@redhat.com>
+Fixes: aad7272a9208 ("xfs: separate out log shutdown callback processing")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c |   22 +++++++++++++---------
+ 1 file changed, 13 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -487,7 +487,10 @@ out_error:
+  * Run all the pending iclog callbacks and wake log force waiters and iclog
+  * space waiters so they can process the newly set shutdown state. We really
+  * don't care what order we process callbacks here because the log is shut down
+- * and so state cannot change on disk anymore.
++ * and so state cannot change on disk anymore. However, we cannot wake waiters
++ * until the callbacks have been processed because we may be in unmount and
++ * we must ensure that all AIL operations the callbacks perform have completed
++ * before we tear down the AIL.
+  *
+  * We avoid processing actively referenced iclogs so that we don't run callbacks
+  * while the iclog owner might still be preparing the iclog for IO submssion.
+@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks(
+ 	struct xlog_in_core	*iclog;
+ 	LIST_HEAD(cb_list);
+ 
+-	spin_lock(&log->l_icloglock);
+ 	iclog = log->l_iclog;
+ 	do {
+ 		if (atomic_read(&iclog->ic_refcnt)) {
+@@ -509,14 +511,16 @@ xlog_state_shutdown_callbacks(
+ 			continue;
+ 		}
+ 		list_splice_init(&iclog->ic_callbacks, &cb_list);
++		spin_unlock(&log->l_icloglock);
++
++		xlog_cil_process_committed(&cb_list);
++
++		spin_lock(&log->l_icloglock);
+ 		wake_up_all(&iclog->ic_write_wait);
+ 		wake_up_all(&iclog->ic_force_wait);
+ 	} while ((iclog = iclog->ic_next) != log->l_iclog);
+ 
+ 	wake_up_all(&log->l_flush_wait);
+-	spin_unlock(&log->l_icloglock);
+-
+-	xlog_cil_process_committed(&cb_list);
+ }
+ 
+ /*
+@@ -583,11 +587,8 @@ xlog_state_release_iclog(
+ 		 * pending iclog callbacks that were waiting on the release of
+ 		 * this iclog.
+ 		 */
+-		if (last_ref) {
+-			spin_unlock(&log->l_icloglock);
++		if (last_ref)
+ 			xlog_state_shutdown_callbacks(log);
+-			spin_lock(&log->l_icloglock);
+-		}
+ 		return -EIO;
+ 	}
+ 
+@@ -3904,7 +3905,10 @@ xlog_force_shutdown(
+ 	wake_up_all(&log->l_cilp->xc_start_wait);
+ 	wake_up_all(&log->l_cilp->xc_commit_wait);
+ 	spin_unlock(&log->l_cilp->xc_push_lock);
++
++	spin_lock(&log->l_icloglock);
+ 	xlog_state_shutdown_callbacks(log);
++	spin_unlock(&log->l_icloglock);
+ 
+ 	return log_error;
+ }
-- 
2.47.3