nilfs2-fix-incorrect-masking-of-permission-flags-for-symlinks.patch
sh-convert-nommu-io-re-un-map-to-static-inline-functions.patch
revert-evm-fix-memleak-in-init_desc.patch
+xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
+xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
+xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
+xfs-drop-async-cache-flushes-from-cil-commits.patch
--- /dev/null
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:40 -0700
+Subject: xfs: don't include bnobt blocks when reserving free block pool
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Brian Foster <bfoster@redhat.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-3-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit c8c568259772751a14e969b7230990508de73d9d ]
+
+xfs_reserve_blocks controls the size of the user-visible free space
+reserve pool. Given the difference between the current and requested
+pool sizes, it will try to reserve free space from fdblocks. However,
+the amount requested from fdblocks is also constrained by the amount of
+space that we think xfs_mod_fdblocks will give us. If we forget to
+subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will
+return ENOSPC and we'll hang the kernel at mount due to the infinite
+loop.
+
+In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand
+out the "free space" used by the free space btrees, because some portion
+of the free space btrees holds in reserve space for future btree
+expansion. Unfortunately, xfs_reserve_blocks' estimation of the number
+of blocks that it could request from xfs_mod_fdblocks was not updated to
+include m_allocbt_blks, so if space is extremely low, the caller hangs.
+
+Fix this by creating a function to estimate the number of blocks that
+can be reserved from fdblocks, which needs to exclude the set-aside and
+m_allocbt_blks.
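+
+As a rough illustration (a userspace model with made-up numbers, not
+kernel code; only the mp->m_fdblocks, m_alloc_set_aside and
+m_allocbt_blks names are taken from the patch below), this sketch mimics
+the reservation retry loop with the old and the new estimate of
+unavailable blocks. With the old estimate the loop keeps issuing a
+request that can never succeed:
+
+/* Illustrative userspace model only, not XFS code. */
+#include <stdio.h>
+#include <stdint.h>
+
+static int64_t fdblocks = 100;			/* stands in for mp->m_fdblocks */
+static const int64_t set_aside = 30;		/* stands in for mp->m_alloc_set_aside */
+static const int64_t allocbt_blks = 20;		/* stands in for mp->m_allocbt_blks */
+
+/* Like xfs_mod_fdblocks(): never dip below the set-aside plus bnobt blocks. */
+static int mod_fdblocks(int64_t delta)
+{
+	if (fdblocks + delta < set_aside + allocbt_blks)
+		return -1;			/* -ENOSPC */
+	fdblocks += delta;
+	return 0;
+}
+
+/* Like the xfs_reserve_blocks() loop, capped so the broken case terminates. */
+static void reserve(int64_t request, int64_t unavailable)
+{
+	int64_t reserved = 0;
+	int tries;
+
+	for (tries = 0; tries < 10; tries++) {
+		int64_t free = fdblocks - unavailable;
+		int64_t ask = request - reserved;
+
+		if (free <= 0 || ask <= 0)
+			break;
+		if (ask > free)
+			ask = free;
+		if (mod_fdblocks(-ask) == 0)
+			reserved += ask;
+		/* on failure, retry with the same (over)estimate */
+	}
+	printf("unavailable=%lld: reserved %lld of %lld after %d iteration(s)\n",
+	       (long long)unavailable, (long long)reserved,
+	       (long long)request, tries);
+}
+
+int main(void)
+{
+	reserve(60, set_aside);			/* old estimate: 0 reserved, loop spins */
+	fdblocks = 100;
+	reserve(60, set_aside + allocbt_blks);	/* new estimate: reserves 50, then stops */
+	return 0;
+}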
+
+Found by running xfs/306 (which formats a single-AG 20MB filesystem)
+with an fstests configuration that specifies a 1k blocksize and a
+specially crafted log size that will consume 7/8 of the space (17920
+blocks, specifically) in that AG.
+
+Cc: Brian Foster <bfoster@redhat.com>
+Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c | 2 +-
+ fs/xfs/xfs_mount.c | 2 +-
+ fs/xfs/xfs_mount.h | 15 +++++++++++++++
+ 3 files changed, 17 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -434,7 +434,7 @@ xfs_reserve_blocks(
+ error = -ENOSPC;
+ do {
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+- mp->m_alloc_set_aside;
++ xfs_fdblocks_unavailable(mp);
+ if (free <= 0)
+ break;
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1132,7 +1132,7 @@ xfs_mod_fdblocks(
+ * problems (i.e. transaction abort, pagecache discards, etc.) than
+ * slightly premature -ENOSPC.
+ */
+- set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
++ set_aside = xfs_fdblocks_unavailable(mp);
+ percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
+ if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+ XFS_FDBLOCKS_BATCH) >= 0) {
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -478,6 +478,21 @@ extern void xfs_unmountfs(xfs_mount_t *)
+ */
+ #define XFS_FDBLOCKS_BATCH 1024
+
++/*
++ * Estimate the amount of free space that is not available to userspace and is
++ * not explicitly reserved from the incore fdblocks. This includes:
++ *
++ * - The minimum number of blocks needed to support splitting a bmap btree
++ * - The blocks currently in use by the freespace btrees because they record
++ * the actual blocks that will fill per-AG metadata space reservations
++ */
++static inline uint64_t
++xfs_fdblocks_unavailable(
++ struct xfs_mount *mp)
++{
++ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
++}
++
+ extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+ bool reserved);
+ extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
--- /dev/null
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:42 -0700
+Subject: xfs: drop async cache flushes from CIL commits.
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: Dave Chinner <dchinner@redhat.com>, Jan Kara <jack@suse.cz>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-5-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 919edbadebe17a67193533f531c2920c03e40fa4 ]
+
+Jan Kara reported a performance regression in dbench that he
+bisected down to commit bad77c375e8d ("xfs: CIL checkpoint
+flushes caches unconditionally").
+
+Whilst developing the journal flush/fua optimisations this cache flush
+was part of, it appeared to make a significant difference to
+performance. However, now that this patchset has settled and all the
+correctness issues fixed, there does not appear to be any
+significant performance benefit to asynchronous cache flushes.
+
+In fact, the opposite is true on some storage types and workloads,
+where additional cache flushes that can occur from fsync heavy
+workloads have measurable and significant impact on overall
+throughput.
+
+Local dbench testing shows little difference on dbench runs with
+sync vs async cache flushes on either fast or slow SSD storage, and
+no difference in streaming concurrent async transaction workloads
+like fs-mark.
+
+Fast NVMe storage:
+
+From `dbench -t 30`, CIL scale:
+
+clients      async                  sync
+             BW        Latency      BW        Latency
+  1           935.18     0.855       915.64     0.903
+  8          2404.51     6.873      2341.77     6.511
+ 16          3003.42     6.460      2931.57     6.529
+ 32          3697.23     7.939      3596.28     7.894
+128          7237.43    15.495      7217.74    11.588
+512          5079.24    90.587      5167.08    95.822
+
+fsmark, 32 threads, create w/ 64 byte xattr w/32k logbsize
+
+         create   chown    unlink
+async    1m41s    1m16s    2m03s
+sync     1m40s    1m19s    1m54s
+
+Slower SATA SSD storage:
+
+From `dbench -t 30`, CIL scale:
+
+clients      async                  sync
+             BW        Latency      BW        Latency
+  1            78.59    15.792        83.78    10.729
+  8           367.88    92.067       404.63    59.943
+ 16           564.51    72.524       602.71    76.089
+ 32           831.66   105.984       870.26   110.482
+128          1659.76   102.969      1624.73    91.356
+512          2135.91   223.054      2603.07   161.160
+
+fsmark, 16 threads, create w/32k logbsize
+
+         create   unlink
+async    5m06s    4m15s
+sync     5m00s    4m22s
+
+And on Jan's test machine:
+
+                   5.18-rc8-vanilla      5.18-rc8-patched
+Amean     1          71.22 (  0.00%)        64.94 *  8.81%*
+Amean     2          93.03 (  0.00%)        84.80 *  8.85%*
+Amean     4         150.54 (  0.00%)       137.51 *  8.66%*
+Amean     8         252.53 (  0.00%)       242.24 *  4.08%*
+Amean    16         454.13 (  0.00%)       439.08 *  3.31%*
+Amean    32         835.24 (  0.00%)       829.74 *  0.66%*
+Amean    64        1740.59 (  0.00%)      1686.73 *  3.09%*
+
+Performance and cache flush behaviour are restored to pre-regression
+levels.
+
+As such, we can now consider the async cache flush mechanism an
+unnecessary exercise in premature optimisation and hence we can
+now remove it and the infrastructure it requires completely.
+
+Fixes: bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally")
+Reported-and-tested-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bio_io.c | 35 -----------------------------------
+ fs/xfs/xfs_linux.h | 2 --
+ fs/xfs/xfs_log.c | 36 +++++++++++-------------------------
+ fs/xfs/xfs_log_cil.c | 42 +++++++++++++-----------------------------
+ fs/xfs/xfs_log_priv.h | 3 +--
+ 5 files changed, 25 insertions(+), 93 deletions(-)
+
+--- a/fs/xfs/xfs_bio_io.c
++++ b/fs/xfs/xfs_bio_io.c
+@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(
+ return bio_max_segs(howmany(count, PAGE_SIZE));
+ }
+
+-static void
+-xfs_flush_bdev_async_endio(
+- struct bio *bio)
+-{
+- complete(bio->bi_private);
+-}
+-
+-/*
+- * Submit a request for an async cache flush to run. If the request queue does
+- * not require flush operations, just skip it altogether. If the caller needs
+- * to wait for the flush completion at a later point in time, they must supply a
+- * valid completion. This will be signalled when the flush completes. The
+- * caller never sees the bio that is issued here.
+- */
+-void
+-xfs_flush_bdev_async(
+- struct bio *bio,
+- struct block_device *bdev,
+- struct completion *done)
+-{
+- struct request_queue *q = bdev->bd_disk->queue;
+-
+- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+- complete(done);
+- return;
+- }
+-
+- bio_init(bio, NULL, 0);
+- bio_set_dev(bio, bdev);
+- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+- bio->bi_private = done;
+- bio->bi_end_io = xfs_flush_bdev_async_endio;
+-
+- submit_bio(bio);
+-}
+ int
+ xfs_rw_bdev(
+ struct block_device *bdev,
+--- a/fs/xfs/xfs_linux.h
++++ b/fs/xfs/xfs_linux.h
+@@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64
+
+ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+ char *data, unsigned int op);
+-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
+- struct completion *done);
+
+ #define ASSERT_ALWAYS(expr) \
+ (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -527,12 +527,6 @@ xlog_state_shutdown_callbacks(
+ * Flush iclog to disk if this is the last reference to the given iclog and the
+ * it is in the WANT_SYNC state.
+ *
+- * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+- * does not match, there may be metadata on disk that must be persisted before
+- * this iclog is written. To satisfy that requirement, set the
+- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+- * log tail value.
+- *
+ * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+ * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+ * written to stable storage, and implies that a commit record is contained
+@@ -549,12 +543,10 @@ xlog_state_shutdown_callbacks(
+ * always capture the tail lsn on the iclog on the first NEED_FUA release
+ * regardless of the number of active reference counts on this iclog.
+ */
+-
+ int
+ xlog_state_release_iclog(
+ struct xlog *log,
+- struct xlog_in_core *iclog,
+- xfs_lsn_t old_tail_lsn)
++ struct xlog_in_core *iclog)
+ {
+ xfs_lsn_t tail_lsn;
+ bool last_ref;
+@@ -565,18 +557,14 @@ xlog_state_release_iclog(
+ /*
+ * Grabbing the current log tail needs to be atomic w.r.t. the writing
+ * of the tail LSN into the iclog so we guarantee that the log tail does
+- * not move between deciding if a cache flush is required and writing
+- * the LSN into the iclog below.
++ * not move between the first time we know that the iclog needs to be
++ * made stable and when we eventually submit it.
+ */
+- if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
++ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
++ (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
++ !iclog->ic_header.h_tail_lsn) {
+ tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+-
+- if (old_tail_lsn && tail_lsn != old_tail_lsn)
+- iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+-
+- if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+- !iclog->ic_header.h_tail_lsn)
+- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
++ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ }
+
+ last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
+@@ -601,8 +589,6 @@ xlog_state_release_iclog(
+ }
+
+ iclog->ic_state = XLOG_STATE_SYNCING;
+- if (!iclog->ic_header.h_tail_lsn)
+- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+ xlog_verify_tail_lsn(log, iclog);
+ trace_xlog_iclog_syncing(iclog, _RET_IP_);
+
+@@ -875,7 +861,7 @@ xlog_force_iclog(
+ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+ if (iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+- return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
++ return xlog_state_release_iclog(iclog->ic_log, iclog);
+ }
+
+ /*
+@@ -2413,7 +2399,7 @@ xlog_write_copy_finish(
+ ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+ xlog_is_shutdown(log));
+ release_iclog:
+- error = xlog_state_release_iclog(log, iclog, 0);
++ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ return error;
+ }
+@@ -2630,7 +2616,7 @@ next_lv:
+
+ spin_lock(&log->l_icloglock);
+ xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+- error = xlog_state_release_iclog(log, iclog, 0);
++ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+
+ return error;
+@@ -3054,7 +3040,7 @@ restart:
+ * reference to the iclog.
+ */
+ if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
+- error = xlog_state_release_iclog(log, iclog, 0);
++ error = xlog_state_release_iclog(log, iclog);
+ spin_unlock(&log->l_icloglock);
+ if (error)
+ return error;
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -681,11 +681,21 @@ xlog_cil_set_ctx_write_state(
+ * The LSN we need to pass to the log items on transaction
+ * commit is the LSN reported by the first log vector write, not
+ * the commit lsn. If we use the commit record lsn then we can
+- * move the tail beyond the grant write head.
++ * move the grant write head beyond the tail LSN and overwrite
++ * it.
+ */
+ ctx->start_lsn = lsn;
+ wake_up_all(&cil->xc_start_wait);
+ spin_unlock(&cil->xc_push_lock);
++
++ /*
++ * Make sure the metadata we are about to overwrite in the log
++ * has been flushed to stable storage before this iclog is
++ * issued.
++ */
++ spin_lock(&cil->xc_log->l_icloglock);
++ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
++ spin_unlock(&cil->xc_log->l_icloglock);
+ return;
+ }
+
+@@ -864,10 +874,7 @@ xlog_cil_push_work(
+ struct xfs_trans_header thdr;
+ struct xfs_log_iovec lhdr;
+ struct xfs_log_vec lvhdr = { NULL };
+- xfs_lsn_t preflush_tail_lsn;
+ xfs_csn_t push_seq;
+- struct bio bio;
+- DECLARE_COMPLETION_ONSTACK(bdev_flush);
+ bool push_commit_stable;
+
+ new_ctx = xlog_cil_ctx_alloc();
+@@ -938,23 +945,6 @@ xlog_cil_push_work(
+ spin_unlock(&cil->xc_push_lock);
+
+ /*
+- * The CIL is stable at this point - nothing new will be added to it
+- * because we hold the flush lock exclusively. Hence we can now issue
+- * a cache flush to ensure all the completed metadata in the journal we
+- * are about to overwrite is on stable storage.
+- *
+- * Because we are issuing this cache flush before we've written the
+- * tail lsn to the iclog, we can have metadata IO completions move the
+- * tail forwards between the completion of this flush and the iclog
+- * being written. In this case, we need to re-issue the cache flush
+- * before the iclog write. To detect whether the log tail moves, sample
+- * the tail LSN *before* we issue the flush.
+- */
+- preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
+- xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
+- &bdev_flush);
+-
+- /*
+ * Pull all the log vectors off the items in the CIL, and remove the
+ * items from the CIL. We don't need the CIL lock here because it's only
+ * needed on the transaction commit side which is currently locked out
+@@ -1030,12 +1020,6 @@ xlog_cil_push_work(
+ lvhdr.lv_iovecp = &lhdr;
+ lvhdr.lv_next = ctx->lv_chain;
+
+- /*
+- * Before we format and submit the first iclog, we have to ensure that
+- * the metadata writeback ordering cache flush is complete.
+- */
+- wait_for_completion(&bdev_flush);
+-
+ error = xlog_cil_write_chain(ctx, &lvhdr);
+ if (error)
+ goto out_abort_free_ticket;
+@@ -1094,7 +1078,7 @@ xlog_cil_push_work(
+ if (push_commit_stable &&
+ ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
+ xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
+- xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
++ xlog_state_release_iclog(log, ctx->commit_iclog);
+
+ /* Not safe to reference ctx now! */
+
+@@ -1115,7 +1099,7 @@ out_abort_free_ticket:
+ return;
+ }
+ spin_lock(&log->l_icloglock);
+- xlog_state_release_iclog(log, ctx->commit_iclog, 0);
++ xlog_state_release_iclog(log, ctx->commit_iclog);
+ /* Not safe to reference ctx now! */
+ spin_unlock(&log->l_icloglock);
+ }
+--- a/fs/xfs/xfs_log_priv.h
++++ b/fs/xfs/xfs_log_priv.h
+@@ -524,8 +524,7 @@ void xfs_log_ticket_regrant(struct xlog
+
+ void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
+ int eventual_size);
+-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+- xfs_lsn_t log_tail_lsn);
++int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+
+ /*
+ * When we crack an atomic LSN, we sample it first so that the value will not
--- /dev/null
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:39 -0700
+Subject: xfs: only run COW extent recovery when there are no live extents
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandan.babu@oracle.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-2-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 7993f1a431bc5271369d359941485a9340658ac3 ]
+
+As part of multiple customer escalations due to file data corruption
+after copy on write operations, I wrote some fstests that use fsstress
+to hammer on COW to shake things loose. Regrettably, I caught some
+filesystem shutdowns due to incorrect rmap operations with the following
+loop:
+
+mount <filesystem> # (0)
+fsstress <run only readonly ops> & # (1)
+while true; do
+ fsstress <run all ops>
+ mount -o remount,ro # (2)
+ fsstress <run only readonly ops>
+ mount -o remount,rw # (3)
+done
+
+When (2) happens, notice that (1) is still running. xfs_remount_ro will
+call xfs_blockgc_stop to walk the inode cache to free all the COW
+extents, but the blockgc mechanism races with (1)'s reader threads to
+take IOLOCKs and loses, which means that it doesn't clean them all out.
+Call such a file (A).
+
+When (3) happens, xfs_remount_rw calls xfs_reflink_recover_cow, which
+walks the ondisk refcount btree and frees any COW extent that it finds.
+This function does not check the inode cache, which means that the
+incore COW fork of inode (A) is now inconsistent with the ondisk
+metadata. If one of those former COW extents is allocated and mapped
+into another file (B) and someone triggers a COW to the stale
+reservation in (A), A's dirty data will be written into (B) and once
+that's done, those blocks will be transferred to (A)'s data fork without
+bumping the refcount.
+
+The results are catastrophic -- file (B) and the refcount btree are now
+corrupt. In the first patch, we fixed the race condition in (2) so that
+(A) will always flush the COW fork. In this second patch, we move the
+_recover_cow call to the initial mount call in (0) for safety.
+
+As mentioned previously, xfs_reflink_recover_cow walks the refcount
+btree looking for COW staging extents, and frees them. This was
+intended to be run at mount time (when we know there are no live inodes)
+to clean up any leftover staging extents that may have been left behind
+during an unclean shutdown. As a time "optimization" for readonly
+mounts, we deferred this to the ro->rw transition, not realizing that
+any failure to clean all COW forks during a rw->ro transition would
+result in catastrophic corruption.
+
+Therefore, remove this optimization and only run the recovery routine
+when we're guaranteed not to have any COW staging extents anywhere,
+which means we always run this at mount time. While we're at it, move
+the callsite to xfs_log_mount_finish because any refcount btree
+expansion (however unlikely given that we're removing records from the
+right side of the index) must be fed by a per-AG reservation, which
+doesn't exist in its current location.
+
+Fixes: 174edb0e46e5 ("xfs: store in-progress CoW allocations in the refcount btree")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Chandan Babu R <chandan.babu@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c | 24 +++++++++++++++++++++++-
+ fs/xfs/xfs_mount.c | 10 ----------
+ fs/xfs/xfs_reflink.c | 5 ++++-
+ fs/xfs/xfs_super.c | 9 ---------
+ 4 files changed, 27 insertions(+), 21 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -27,7 +27,7 @@
+ #include "xfs_buf_item.h"
+ #include "xfs_ag.h"
+ #include "xfs_quota.h"
+-
++#include "xfs_reflink.h"
+
+ #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
+
+@@ -3502,6 +3502,28 @@ xlog_recover_finish(
+
+ xlog_recover_process_iunlinks(log);
+ xlog_recover_check_summary(log);
++
++ /*
++ * Recover any CoW staging blocks that are still referenced by the
++ * ondisk refcount metadata. During mount there cannot be any live
++ * staging extents as we have not permitted any user modifications.
++ * Therefore, it is safe to free them all right now, even on a
++ * read-only mount.
++ */
++ error = xfs_reflink_recover_cow(log->l_mp);
++ if (error) {
++ xfs_alert(log->l_mp,
++ "Failed to recover leftover CoW staging extents, err %d.",
++ error);
++ /*
++ * If we get an error here, make sure the log is shut down
++ * but return zero so that any log items committed since the
++ * end of intents processing can be pushed through the CIL
++ * and AIL.
++ */
++ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
++ }
++
+ return 0;
+ }
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -922,15 +922,6 @@ xfs_mountfs(
+ xfs_warn(mp,
+ "Unable to allocate reserve blocks. Continuing without reserve pool.");
+
+- /* Recover any CoW blocks that never got remapped. */
+- error = xfs_reflink_recover_cow(mp);
+- if (error) {
+- xfs_err(mp,
+- "Error %d recovering leftover CoW allocations.", error);
+- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+- goto out_quota;
+- }
+-
+ /* Reserve AG blocks for future btree expansion. */
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error != -ENOSPC)
+@@ -941,7 +932,6 @@ xfs_mountfs(
+
+ out_agresv:
+ xfs_fs_unreserve_ag_blocks(mp);
+- out_quota:
+ xfs_qm_unmount_quotas(mp);
+ out_rtunmount:
+ xfs_rtunmount_inodes(mp);
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -749,7 +749,10 @@ xfs_reflink_end_cow(
+ }
+
+ /*
+- * Free leftover CoW reservations that didn't get cleaned out.
++ * Free all CoW staging blocks that are still referenced by the ondisk refcount
++ * metadata. The ondisk metadata does not track which inode created the
++ * staging extent, so callers must ensure that there are no cached inodes with
++ * live CoW staging extents.
+ */
+ int
+ xfs_reflink_recover_cow(
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1742,15 +1742,6 @@ xfs_remount_rw(
+ */
+ xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
+-
+- /* Recover any CoW blocks that never got remapped. */
+- error = xfs_reflink_recover_cow(mp);
+- if (error) {
+- xfs_err(mp,
+- "Error %d recovering leftover CoW allocations.", error);
+- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+- return error;
+- }
+ xfs_blockgc_start(mp);
+
+ /* Create the per-AG metadata reservation pool .*/
--- /dev/null
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:41 -0700
+Subject: xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: Dave Chinner <dchinner@redhat.com>, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-4-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit cd6f79d1fb324968a3bae92f82eeb7d28ca1fd22 ]
+
+Brian reported a null pointer dereference failure during unmount in
+xfs/006. He tracked the problem down to the AIL being torn down
+before a log shutdown had completed and removed all the items from
+the AIL. The failure occurred in this path while unmount was
+proceeding in another task:
+
+ xfs_trans_ail_delete+0x102/0x130 [xfs]
+ xfs_buf_item_done+0x22/0x30 [xfs]
+ xfs_buf_ioend+0x73/0x4d0 [xfs]
+ xfs_trans_committed_bulk+0x17e/0x2f0 [xfs]
+ xlog_cil_committed+0x2a9/0x300 [xfs]
+ xlog_cil_process_committed+0x69/0x80 [xfs]
+ xlog_state_shutdown_callbacks+0xce/0xf0 [xfs]
+ xlog_force_shutdown+0xdf/0x150 [xfs]
+ xfs_do_force_shutdown+0x5f/0x150 [xfs]
+ xlog_ioend_work+0x71/0x80 [xfs]
+ process_one_work+0x1c5/0x390
+ worker_thread+0x30/0x350
+ kthread+0xd7/0x100
+ ret_from_fork+0x1f/0x30
+
+This is processing an EIO error to a log write, and it's
+triggering a force shutdown. This causes the log to be shut down,
+and then it is running attached iclog callbacks from the shutdown
+context. That means the fs and log has already been marked as
+xfs_is_shutdown/xlog_is_shutdown and so high level code will abort
+(e.g. xfs_trans_commit(), xfs_log_force(), etc) with an error
+because of shutdown.
+
+The umount would have been blocked waiting for a log force
+completion inside xfs_log_cover() -> xfs_sync_sb(). For this situation
+to occur, xfs_sync_sb() must first exit without waiting for the iclog
+buffer to be committed to disk. The
+above trace is the completion routine for the iclog buffer, and
+it is shutting down the filesystem.
+
+xlog_state_shutdown_callbacks() does this:
+
+{
+ struct xlog_in_core *iclog;
+ LIST_HEAD(cb_list);
+
+ spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ do {
+ if (atomic_read(&iclog->ic_refcnt)) {
+ /* Reference holder will re-run iclog callbacks. */
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
+>>>>>> wake_up_all(&iclog->ic_write_wait);
+>>>>>> wake_up_all(&iclog->ic_force_wait);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+ wake_up_all(&log->l_flush_wait);
+ spin_unlock(&log->l_icloglock);
+
+>>>>>> xlog_cil_process_committed(&cb_list);
+}
+
+This wakes any thread waiting on IO completion of the iclog (in this
+case the umount log force) before shutdown processes all the pending
+callbacks. That means the xfs_sync_sb() waiting on a sync
+transaction in xfs_log_force() on iclog->ic_force_wait will get
+woken before the callbacks attached to that iclog are run. This
+results in xfs_sync_sb() returning an error, and so unmount unblocks
+and continues to run whilst the log shutdown is still in progress.
+
+Normally this is just fine because the force waiter has nothing to
+do with AIL operations. But in the case of this unmount path, the
+log force waiter goes on to tear down the AIL because the log is now
+shut down and so nothing ever blocks it again from the wait point in
+xfs_log_cover().
+
+Hence it's a race to see who gets to the AIL first - the unmount
+code or xlog_cil_process_committed() killing the superblock buffer.
+
+To fix this, we just have to change the order of processing in
+xlog_state_shutdown_callbacks() to run the callbacks before it wakes
+any task waiting on completion of the iclog.
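+
+The ordering rule generalises beyond XFS. The sketch below is a plain
+userspace pthreads illustration, not XFS code, and the shutdown_worker
+and pending_callbacks names are invented for it: the worker finishes the
+callback-style work a waiter depends on before it signals completion, so
+the waiter can only tear down shared state after that work is done.
+
+/* Illustrative only; compile and link with -pthread. */
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+
+static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
+static bool done;
+static int pending_callbacks = 3;	/* stands in for iclog->ic_callbacks */
+
+static void *shutdown_worker(void *arg)
+{
+	(void)arg;
+
+	/* Run the callbacks first (the fixed ordering)... */
+	while (pending_callbacks > 0)
+		pending_callbacks--;	/* e.g. remove an item from the AIL */
+
+	/* ...and only then wake anything waiting on this completion. */
+	pthread_mutex_lock(&lock);
+	done = true;
+	pthread_cond_broadcast(&done_cond);
+	pthread_mutex_unlock(&lock);
+	return NULL;
+}
+
+int main(void)
+{
+	pthread_t worker;
+
+	pthread_create(&worker, NULL, shutdown_worker, NULL);
+
+	/* Stands in for unmount blocking in xfs_log_cover(). */
+	pthread_mutex_lock(&lock);
+	while (!done)
+		pthread_cond_wait(&done_cond, &lock);
+	pthread_mutex_unlock(&lock);
+
+	/* Safe to tear down shared state: all callbacks have already run. */
+	printf("callbacks still pending at teardown: %d\n", pending_callbacks);
+	pthread_join(worker, NULL);
+	return 0;
+}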
+
+Reported-by: Brian Foster <bfoster@redhat.com>
+Fixes: aad7272a9208 ("xfs: separate out log shutdown callback processing")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 22 +++++++++++++---------
+ 1 file changed, 13 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -487,7 +487,10 @@ out_error:
+ * Run all the pending iclog callbacks and wake log force waiters and iclog
+ * space waiters so they can process the newly set shutdown state. We really
+ * don't care what order we process callbacks here because the log is shut down
+- * and so state cannot change on disk anymore.
++ * and so state cannot change on disk anymore. However, we cannot wake waiters
++ * until the callbacks have been processed because we may be in unmount and
++ * we must ensure that all AIL operations the callbacks perform have completed
++ * before we tear down the AIL.
+ *
+ * We avoid processing actively referenced iclogs so that we don't run callbacks
+ * while the iclog owner might still be preparing the iclog for IO submssion.
+@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks(
+ struct xlog_in_core *iclog;
+ LIST_HEAD(cb_list);
+
+- spin_lock(&log->l_icloglock);
+ iclog = log->l_iclog;
+ do {
+ if (atomic_read(&iclog->ic_refcnt)) {
+@@ -509,14 +511,16 @@ xlog_state_shutdown_callbacks(
+ continue;
+ }
+ list_splice_init(&iclog->ic_callbacks, &cb_list);
++ spin_unlock(&log->l_icloglock);
++
++ xlog_cil_process_committed(&cb_list);
++
++ spin_lock(&log->l_icloglock);
+ wake_up_all(&iclog->ic_write_wait);
+ wake_up_all(&iclog->ic_force_wait);
+ } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+ wake_up_all(&log->l_flush_wait);
+- spin_unlock(&log->l_icloglock);
+-
+- xlog_cil_process_committed(&cb_list);
+ }
+
+ /*
+@@ -583,11 +587,8 @@ xlog_state_release_iclog(
+ * pending iclog callbacks that were waiting on the release of
+ * this iclog.
+ */
+- if (last_ref) {
+- spin_unlock(&log->l_icloglock);
++ if (last_ref)
+ xlog_state_shutdown_callbacks(log);
+- spin_lock(&log->l_icloglock);
+- }
+ return -EIO;
+ }
+
+@@ -3904,7 +3905,10 @@ xlog_force_shutdown(
+ wake_up_all(&log->l_cilp->xc_start_wait);
+ wake_up_all(&log->l_cilp->xc_commit_wait);
+ spin_unlock(&log->l_cilp->xc_push_lock);
++
++ spin_lock(&log->l_icloglock);
+ xlog_state_shutdown_callbacks(log);
++ spin_unlock(&log->l_icloglock);
+
+ return log_error;
+ }