From cd1f8b2d6c3108036b145c09f41259ef67a80f66 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 15 Jul 2022 16:34:22 +0200 Subject: [PATCH] 5.15-stable patches added patches: xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch xfs-drop-async-cache-flushes-from-cil-commits.patch xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch --- queue-5.15/series | 4 + ...locks-when-reserving-free-block-pool.patch | 97 +++++ ...async-cache-flushes-from-cil-commits.patch | 361 ++++++++++++++++++ ...overy-when-there-are-no-live-extents.patch | 176 +++++++++ ...ers-in-xlog_state_shutdown_callbacks.patch | 171 +++++++++ 5 files changed, 809 insertions(+) create mode 100644 queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch create mode 100644 queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch create mode 100644 queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch create mode 100644 queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch diff --git a/queue-5.15/series b/queue-5.15/series index 8cc9026057c..673de22c000 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -23,3 +23,7 @@ fs-remap-constrain-dedupe-of-eof-blocks.patch nilfs2-fix-incorrect-masking-of-permission-flags-for-symlinks.patch sh-convert-nommu-io-re-un-map-to-static-inline-functions.patch revert-evm-fix-memleak-in-init_desc.patch +xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch +xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch +xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch +xfs-drop-async-cache-flushes-from-cil-commits.patch diff --git a/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch b/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch new file mode 100644 index 00000000000..a531a8ed5aa --- /dev/null +++ b/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch @@ -0,0 +1,97 @@ +From foo@baz Fri Jul 15 04:33:42 PM CEST 2022 +From: Leah Rumancik +Date: Thu, 14 Jul 2022 15:23:40 -0700 +Subject: xfs: don't include bnobt blocks when reserving free block pool +To: stable@vger.kernel.org, linux-xfs@vger.kernel.org +Cc: "Darrick J. Wong" , Brian Foster , Dave Chinner , Leah Rumancik +Message-ID: <20220714222342.4013916-3-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit c8c568259772751a14e969b7230990508de73d9d ] + +xfs_reserve_blocks controls the size of the user-visible free space +reserve pool. Given the difference between the current and requested +pool sizes, it will try to reserve free space from fdblocks. However, +the amount requested from fdblocks is also constrained by the amount of +space that we think xfs_mod_fdblocks will give us. If we forget to +subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will will +return ENOSPC and we'll hang the kernel at mount due to the infinite +loop. + +In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand +out the "free space" used by the free space btrees, because some portion +of the free space btrees hold in reserve space for future btree +expansion. Unfortunately, xfs_reserve_blocks' estimation of the number +of blocks that it could request from xfs_mod_fdblocks was not updated to +include m_allocbt_blks, so if space is extremely low, the caller hangs. + +Fix this by creating a function to estimate the number of blocks that +can be reserved from fdblocks, which needs to exclude the set-aside and +m_allocbt_blks. + +Found by running xfs/306 (which formats a single-AG 20MB filesystem) +with an fstests configuration that specifies a 1k blocksize and a +specially crafted log size that will consume 7/8 of the space (17920 +blocks, specifically) in that AG. + +Cc: Brian Foster +Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation") +Signed-off-by: Darrick J. Wong +Reviewed-by: Brian Foster +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_fsops.c | 2 +- + fs/xfs/xfs_mount.c | 2 +- + fs/xfs/xfs_mount.h | 15 +++++++++++++++ + 3 files changed, 17 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -434,7 +434,7 @@ xfs_reserve_blocks( + error = -ENOSPC; + do { + free = percpu_counter_sum(&mp->m_fdblocks) - +- mp->m_alloc_set_aside; ++ xfs_fdblocks_unavailable(mp); + if (free <= 0) + break; + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -1132,7 +1132,7 @@ xfs_mod_fdblocks( + * problems (i.e. transaction abort, pagecache discards, etc.) than + * slightly premature -ENOSPC. + */ +- set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); ++ set_aside = xfs_fdblocks_unavailable(mp); + percpu_counter_add_batch(&mp->m_fdblocks, delta, batch); + if (__percpu_counter_compare(&mp->m_fdblocks, set_aside, + XFS_FDBLOCKS_BATCH) >= 0) { +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -478,6 +478,21 @@ extern void xfs_unmountfs(xfs_mount_t *) + */ + #define XFS_FDBLOCKS_BATCH 1024 + ++/* ++ * Estimate the amount of free space that is not available to userspace and is ++ * not explicitly reserved from the incore fdblocks. This includes: ++ * ++ * - The minimum number of blocks needed to support splitting a bmap btree ++ * - The blocks currently in use by the freespace btrees because they record ++ * the actual blocks that will fill per-AG metadata space reservations ++ */ ++static inline uint64_t ++xfs_fdblocks_unavailable( ++ struct xfs_mount *mp) ++{ ++ return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); ++} ++ + extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); + extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); diff --git a/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch b/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch new file mode 100644 index 00000000000..d6df38d61c1 --- /dev/null +++ b/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch @@ -0,0 +1,361 @@ +From foo@baz Fri Jul 15 04:33:42 PM CEST 2022 +From: Leah Rumancik +Date: Thu, 14 Jul 2022 15:23:42 -0700 +Subject: xfs: drop async cache flushes from CIL commits. +To: stable@vger.kernel.org, linux-xfs@vger.kernel.org +Cc: Dave Chinner , Jan Kara , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20220714222342.4013916-5-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 919edbadebe17a67193533f531c2920c03e40fa4 ] + +Jan Kara reported a performance regression in dbench that he +bisected down to commit bad77c375e8d ("xfs: CIL checkpoint +flushes caches unconditionally"). + +Whilst developing the journal flush/fua optimisations this cache was +part of, it appeared to made a significant difference to +performance. However, now that this patchset has settled and all the +correctness issues fixed, there does not appear to be any +significant performance benefit to asynchronous cache flushes. + +In fact, the opposite is true on some storage types and workloads, +where additional cache flushes that can occur from fsync heavy +workloads have measurable and significant impact on overall +throughput. + +Local dbench testing shows little difference on dbench runs with +sync vs async cache flushes on either fast or slow SSD storage, and +no difference in streaming concurrent async transaction workloads +like fs-mark. + +Fast NVME storage. + +>From `dbench -t 30`, CIL scale: + +clients async sync + BW Latency BW Latency +1 935.18 0.855 915.64 0.903 +8 2404.51 6.873 2341.77 6.511 +16 3003.42 6.460 2931.57 6.529 +32 3697.23 7.939 3596.28 7.894 +128 7237.43 15.495 7217.74 11.588 +512 5079.24 90.587 5167.08 95.822 + +fsmark, 32 threads, create w/ 64 byte xattr w/32k logbsize + + create chown unlink +async 1m41s 1m16s 2m03s +sync 1m40s 1m19s 1m54s + +Slower SATA SSD storage: + +>From `dbench -t 30`, CIL scale: + +clients async sync + BW Latency BW Latency +1 78.59 15.792 83.78 10.729 +8 367.88 92.067 404.63 59.943 +16 564.51 72.524 602.71 76.089 +32 831.66 105.984 870.26 110.482 +128 1659.76 102.969 1624.73 91.356 +512 2135.91 223.054 2603.07 161.160 + +fsmark, 16 threads, create w/32k logbsize + + create unlink +async 5m06s 4m15s +sync 5m00s 4m22s + +And on Jan's test machine: + + 5.18-rc8-vanilla 5.18-rc8-patched +Amean 1 71.22 ( 0.00%) 64.94 * 8.81%* +Amean 2 93.03 ( 0.00%) 84.80 * 8.85%* +Amean 4 150.54 ( 0.00%) 137.51 * 8.66%* +Amean 8 252.53 ( 0.00%) 242.24 * 4.08%* +Amean 16 454.13 ( 0.00%) 439.08 * 3.31%* +Amean 32 835.24 ( 0.00%) 829.74 * 0.66%* +Amean 64 1740.59 ( 0.00%) 1686.73 * 3.09%* + +Performance and cache flush behaviour is restored to pre-regression +levels. + +As such, we can now consider the async cache flush mechanism an +unnecessary exercise in premature optimisation and hence we can +now remove it and the infrastructure it requires completely. + +Fixes: bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally") +Reported-and-tested-by: Jan Kara +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bio_io.c | 35 ----------------------------------- + fs/xfs/xfs_linux.h | 2 -- + fs/xfs/xfs_log.c | 36 +++++++++++------------------------- + fs/xfs/xfs_log_cil.c | 42 +++++++++++++----------------------------- + fs/xfs/xfs_log_priv.h | 3 +-- + 5 files changed, 25 insertions(+), 93 deletions(-) + +--- a/fs/xfs/xfs_bio_io.c ++++ b/fs/xfs/xfs_bio_io.c +@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs( + return bio_max_segs(howmany(count, PAGE_SIZE)); + } + +-static void +-xfs_flush_bdev_async_endio( +- struct bio *bio) +-{ +- complete(bio->bi_private); +-} +- +-/* +- * Submit a request for an async cache flush to run. If the request queue does +- * not require flush operations, just skip it altogether. If the caller needs +- * to wait for the flush completion at a later point in time, they must supply a +- * valid completion. This will be signalled when the flush completes. The +- * caller never sees the bio that is issued here. +- */ +-void +-xfs_flush_bdev_async( +- struct bio *bio, +- struct block_device *bdev, +- struct completion *done) +-{ +- struct request_queue *q = bdev->bd_disk->queue; +- +- if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { +- complete(done); +- return; +- } +- +- bio_init(bio, NULL, 0); +- bio_set_dev(bio, bdev); +- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; +- bio->bi_private = done; +- bio->bi_end_io = xfs_flush_bdev_async_endio; +- +- submit_bio(bio); +-} + int + xfs_rw_bdev( + struct block_device *bdev, +--- a/fs/xfs/xfs_linux.h ++++ b/fs/xfs/xfs_linux.h +@@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64 + + int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count, + char *data, unsigned int op); +-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev, +- struct completion *done); + + #define ASSERT_ALWAYS(expr) \ + (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__)) +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -527,12 +527,6 @@ xlog_state_shutdown_callbacks( + * Flush iclog to disk if this is the last reference to the given iclog and the + * it is in the WANT_SYNC state. + * +- * If the caller passes in a non-zero @old_tail_lsn and the current log tail +- * does not match, there may be metadata on disk that must be persisted before +- * this iclog is written. To satisfy that requirement, set the +- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new +- * log tail value. +- * + * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the + * log tail is updated correctly. NEED_FUA indicates that the iclog will be + * written to stable storage, and implies that a commit record is contained +@@ -549,12 +543,10 @@ xlog_state_shutdown_callbacks( + * always capture the tail lsn on the iclog on the first NEED_FUA release + * regardless of the number of active reference counts on this iclog. + */ +- + int + xlog_state_release_iclog( + struct xlog *log, +- struct xlog_in_core *iclog, +- xfs_lsn_t old_tail_lsn) ++ struct xlog_in_core *iclog) + { + xfs_lsn_t tail_lsn; + bool last_ref; +@@ -565,18 +557,14 @@ xlog_state_release_iclog( + /* + * Grabbing the current log tail needs to be atomic w.r.t. the writing + * of the tail LSN into the iclog so we guarantee that the log tail does +- * not move between deciding if a cache flush is required and writing +- * the LSN into the iclog below. ++ * not move between the first time we know that the iclog needs to be ++ * made stable and when we eventually submit it. + */ +- if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) { ++ if ((iclog->ic_state == XLOG_STATE_WANT_SYNC || ++ (iclog->ic_flags & XLOG_ICL_NEED_FUA)) && ++ !iclog->ic_header.h_tail_lsn) { + tail_lsn = xlog_assign_tail_lsn(log->l_mp); +- +- if (old_tail_lsn && tail_lsn != old_tail_lsn) +- iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; +- +- if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) && +- !iclog->ic_header.h_tail_lsn) +- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); ++ iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + } + + last_ref = atomic_dec_and_test(&iclog->ic_refcnt); +@@ -601,8 +589,6 @@ xlog_state_release_iclog( + } + + iclog->ic_state = XLOG_STATE_SYNCING; +- if (!iclog->ic_header.h_tail_lsn) +- iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn); + xlog_verify_tail_lsn(log, iclog); + trace_xlog_iclog_syncing(iclog, _RET_IP_); + +@@ -875,7 +861,7 @@ xlog_force_iclog( + iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA; + if (iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(iclog->ic_log, iclog, 0); +- return xlog_state_release_iclog(iclog->ic_log, iclog, 0); ++ return xlog_state_release_iclog(iclog->ic_log, iclog); + } + + /* +@@ -2413,7 +2399,7 @@ xlog_write_copy_finish( + ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC || + xlog_is_shutdown(log)); + release_iclog: +- error = xlog_state_release_iclog(log, iclog, 0); ++ error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + return error; + } +@@ -2630,7 +2616,7 @@ next_lv: + + spin_lock(&log->l_icloglock); + xlog_state_finish_copy(log, iclog, record_cnt, data_cnt); +- error = xlog_state_release_iclog(log, iclog, 0); ++ error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + + return error; +@@ -3054,7 +3040,7 @@ restart: + * reference to the iclog. + */ + if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1)) +- error = xlog_state_release_iclog(log, iclog, 0); ++ error = xlog_state_release_iclog(log, iclog); + spin_unlock(&log->l_icloglock); + if (error) + return error; +--- a/fs/xfs/xfs_log_cil.c ++++ b/fs/xfs/xfs_log_cil.c +@@ -681,11 +681,21 @@ xlog_cil_set_ctx_write_state( + * The LSN we need to pass to the log items on transaction + * commit is the LSN reported by the first log vector write, not + * the commit lsn. If we use the commit record lsn then we can +- * move the tail beyond the grant write head. ++ * move the grant write head beyond the tail LSN and overwrite ++ * it. + */ + ctx->start_lsn = lsn; + wake_up_all(&cil->xc_start_wait); + spin_unlock(&cil->xc_push_lock); ++ ++ /* ++ * Make sure the metadata we are about to overwrite in the log ++ * has been flushed to stable storage before this iclog is ++ * issued. ++ */ ++ spin_lock(&cil->xc_log->l_icloglock); ++ iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; ++ spin_unlock(&cil->xc_log->l_icloglock); + return; + } + +@@ -864,10 +874,7 @@ xlog_cil_push_work( + struct xfs_trans_header thdr; + struct xfs_log_iovec lhdr; + struct xfs_log_vec lvhdr = { NULL }; +- xfs_lsn_t preflush_tail_lsn; + xfs_csn_t push_seq; +- struct bio bio; +- DECLARE_COMPLETION_ONSTACK(bdev_flush); + bool push_commit_stable; + + new_ctx = xlog_cil_ctx_alloc(); +@@ -938,23 +945,6 @@ xlog_cil_push_work( + spin_unlock(&cil->xc_push_lock); + + /* +- * The CIL is stable at this point - nothing new will be added to it +- * because we hold the flush lock exclusively. Hence we can now issue +- * a cache flush to ensure all the completed metadata in the journal we +- * are about to overwrite is on stable storage. +- * +- * Because we are issuing this cache flush before we've written the +- * tail lsn to the iclog, we can have metadata IO completions move the +- * tail forwards between the completion of this flush and the iclog +- * being written. In this case, we need to re-issue the cache flush +- * before the iclog write. To detect whether the log tail moves, sample +- * the tail LSN *before* we issue the flush. +- */ +- preflush_tail_lsn = atomic64_read(&log->l_tail_lsn); +- xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev, +- &bdev_flush); +- +- /* + * Pull all the log vectors off the items in the CIL, and remove the + * items from the CIL. We don't need the CIL lock here because it's only + * needed on the transaction commit side which is currently locked out +@@ -1030,12 +1020,6 @@ xlog_cil_push_work( + lvhdr.lv_iovecp = &lhdr; + lvhdr.lv_next = ctx->lv_chain; + +- /* +- * Before we format and submit the first iclog, we have to ensure that +- * the metadata writeback ordering cache flush is complete. +- */ +- wait_for_completion(&bdev_flush); +- + error = xlog_cil_write_chain(ctx, &lvhdr); + if (error) + goto out_abort_free_ticket; +@@ -1094,7 +1078,7 @@ xlog_cil_push_work( + if (push_commit_stable && + ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) + xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); +- xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn); ++ xlog_state_release_iclog(log, ctx->commit_iclog); + + /* Not safe to reference ctx now! */ + +@@ -1115,7 +1099,7 @@ out_abort_free_ticket: + return; + } + spin_lock(&log->l_icloglock); +- xlog_state_release_iclog(log, ctx->commit_iclog, 0); ++ xlog_state_release_iclog(log, ctx->commit_iclog); + /* Not safe to reference ctx now! */ + spin_unlock(&log->l_icloglock); + } +--- a/fs/xfs/xfs_log_priv.h ++++ b/fs/xfs/xfs_log_priv.h +@@ -524,8 +524,7 @@ void xfs_log_ticket_regrant(struct xlog + + void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog, + int eventual_size); +-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog, +- xfs_lsn_t log_tail_lsn); ++int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog); + + /* + * When we crack an atomic LSN, we sample it first so that the value will not diff --git a/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch b/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch new file mode 100644 index 00000000000..3bf777c3d20 --- /dev/null +++ b/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch @@ -0,0 +1,176 @@ +From foo@baz Fri Jul 15 04:33:42 PM CEST 2022 +From: Leah Rumancik +Date: Thu, 14 Jul 2022 15:23:39 -0700 +Subject: xfs: only run COW extent recovery when there are no live extents +To: stable@vger.kernel.org, linux-xfs@vger.kernel.org +Cc: "Darrick J. Wong" , Chandan Babu R , Dave Chinner , Leah Rumancik +Message-ID: <20220714222342.4013916-2-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 7993f1a431bc5271369d359941485a9340658ac3 ] + +As part of multiple customer escalations due to file data corruption +after copy on write operations, I wrote some fstests that use fsstress +to hammer on COW to shake things loose. Regrettably, I caught some +filesystem shutdowns due to incorrect rmap operations with the following +loop: + +mount # (0) +fsstress & # (1) +while true; do + fsstress + mount -o remount,ro # (2) + fsstress + mount -o remount,rw # (3) +done + +When (2) happens, notice that (1) is still running. xfs_remount_ro will +call xfs_blockgc_stop to walk the inode cache to free all the COW +extents, but the blockgc mechanism races with (1)'s reader threads to +take IOLOCKs and loses, which means that it doesn't clean them all out. +Call such a file (A). + +When (3) happens, xfs_remount_rw calls xfs_reflink_recover_cow, which +walks the ondisk refcount btree and frees any COW extent that it finds. +This function does not check the inode cache, which means that incore +COW forks of inode (A) is now inconsistent with the ondisk metadata. If +one of those former COW extents are allocated and mapped into another +file (B) and someone triggers a COW to the stale reservation in (A), A's +dirty data will be written into (B) and once that's done, those blocks +will be transferred to (A)'s data fork without bumping the refcount. + +The results are catastrophic -- file (B) and the refcount btree are now +corrupt. In the first patch, we fixed the race condition in (2) so that +(A) will always flush the COW fork. In this second patch, we move the +_recover_cow call to the initial mount call in (0) for safety. + +As mentioned previously, xfs_reflink_recover_cow walks the refcount +btree looking for COW staging extents, and frees them. This was +intended to be run at mount time (when we know there are no live inodes) +to clean up any leftover staging events that may have been left behind +during an unclean shutdown. As a time "optimization" for readonly +mounts, we deferred this to the ro->rw transition, not realizing that +any failure to clean all COW forks during a rw->ro transition would +result in catastrophic corruption. + +Therefore, remove this optimization and only run the recovery routine +when we're guaranteed not to have any COW staging extents anywhere, +which means we always run this at mount time. While we're at it, move +the callsite to xfs_log_mount_finish because any refcount btree +expansion (however unlikely given that we're removing records from the +right side of the index) must be fed by a per-AG reservation, which +doesn't exist in its current location. + +Fixes: 174edb0e46e5 ("xfs: store in-progress CoW allocations in the refcount btree") +Signed-off-by: Darrick J. Wong +Reviewed-by: Chandan Babu R +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 24 +++++++++++++++++++++++- + fs/xfs/xfs_mount.c | 10 ---------- + fs/xfs/xfs_reflink.c | 5 ++++- + fs/xfs/xfs_super.c | 9 --------- + 4 files changed, 27 insertions(+), 21 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -27,7 +27,7 @@ + #include "xfs_buf_item.h" + #include "xfs_ag.h" + #include "xfs_quota.h" +- ++#include "xfs_reflink.h" + + #define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + +@@ -3502,6 +3502,28 @@ xlog_recover_finish( + + xlog_recover_process_iunlinks(log); + xlog_recover_check_summary(log); ++ ++ /* ++ * Recover any CoW staging blocks that are still referenced by the ++ * ondisk refcount metadata. During mount there cannot be any live ++ * staging extents as we have not permitted any user modifications. ++ * Therefore, it is safe to free them all right now, even on a ++ * read-only mount. ++ */ ++ error = xfs_reflink_recover_cow(log->l_mp); ++ if (error) { ++ xfs_alert(log->l_mp, ++ "Failed to recover leftover CoW staging extents, err %d.", ++ error); ++ /* ++ * If we get an error here, make sure the log is shut down ++ * but return zero so that any log items committed since the ++ * end of intents processing can be pushed through the CIL ++ * and AIL. ++ */ ++ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR); ++ } ++ + return 0; + } + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -922,15 +922,6 @@ xfs_mountfs( + xfs_warn(mp, + "Unable to allocate reserve blocks. Continuing without reserve pool."); + +- /* Recover any CoW blocks that never got remapped. */ +- error = xfs_reflink_recover_cow(mp); +- if (error) { +- xfs_err(mp, +- "Error %d recovering leftover CoW allocations.", error); +- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +- goto out_quota; +- } +- + /* Reserve AG blocks for future btree expansion. */ + error = xfs_fs_reserve_ag_blocks(mp); + if (error && error != -ENOSPC) +@@ -941,7 +932,6 @@ xfs_mountfs( + + out_agresv: + xfs_fs_unreserve_ag_blocks(mp); +- out_quota: + xfs_qm_unmount_quotas(mp); + out_rtunmount: + xfs_rtunmount_inodes(mp); +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -749,7 +749,10 @@ xfs_reflink_end_cow( + } + + /* +- * Free leftover CoW reservations that didn't get cleaned out. ++ * Free all CoW staging blocks that are still referenced by the ondisk refcount ++ * metadata. The ondisk metadata does not track which inode created the ++ * staging extent, so callers must ensure that there are no cached inodes with ++ * live CoW staging extents. + */ + int + xfs_reflink_recover_cow( +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1742,15 +1742,6 @@ xfs_remount_rw( + */ + xfs_restore_resvblks(mp); + xfs_log_work_queue(mp); +- +- /* Recover any CoW blocks that never got remapped. */ +- error = xfs_reflink_recover_cow(mp); +- if (error) { +- xfs_err(mp, +- "Error %d recovering leftover CoW allocations.", error); +- xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); +- return error; +- } + xfs_blockgc_start(mp); + + /* Create the per-AG metadata reservation pool .*/ diff --git a/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch b/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch new file mode 100644 index 00000000000..e87a0adea11 --- /dev/null +++ b/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch @@ -0,0 +1,171 @@ +From foo@baz Fri Jul 15 04:33:42 PM CEST 2022 +From: Leah Rumancik +Date: Thu, 14 Jul 2022 15:23:41 -0700 +Subject: xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks +To: stable@vger.kernel.org, linux-xfs@vger.kernel.org +Cc: Dave Chinner , Brian Foster , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20220714222342.4013916-4-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit cd6f79d1fb324968a3bae92f82eeb7d28ca1fd22 ] + +Brian reported a null pointer dereference failure during unmount in +xfs/006. He tracked the problem down to the AIL being torn down +before a log shutdown had completed and removed all the items from +the AIL. The failure occurred in this path while unmount was +proceeding in another task: + + xfs_trans_ail_delete+0x102/0x130 [xfs] + xfs_buf_item_done+0x22/0x30 [xfs] + xfs_buf_ioend+0x73/0x4d0 [xfs] + xfs_trans_committed_bulk+0x17e/0x2f0 [xfs] + xlog_cil_committed+0x2a9/0x300 [xfs] + xlog_cil_process_committed+0x69/0x80 [xfs] + xlog_state_shutdown_callbacks+0xce/0xf0 [xfs] + xlog_force_shutdown+0xdf/0x150 [xfs] + xfs_do_force_shutdown+0x5f/0x150 [xfs] + xlog_ioend_work+0x71/0x80 [xfs] + process_one_work+0x1c5/0x390 + worker_thread+0x30/0x350 + kthread+0xd7/0x100 + ret_from_fork+0x1f/0x30 + +This is processing an EIO error to a log write, and it's +triggering a force shutdown. This causes the log to be shut down, +and then it is running attached iclog callbacks from the shutdown +context. That means the fs and log has already been marked as +xfs_is_shutdown/xlog_is_shutdown and so high level code will abort +(e.g. xfs_trans_commit(), xfs_log_force(), etc) with an error +because of shutdown. + +The umount would have been blocked waiting for a log force +completion inside xfs_log_cover() -> xfs_sync_sb(). The first thing +for this situation to occur is for xfs_sync_sb() to exit without +waiting for the iclog buffer to be comitted to disk. The +above trace is the completion routine for the iclog buffer, and +it is shutting down the filesystem. + +xlog_state_shutdown_callbacks() does this: + +{ + struct xlog_in_core *iclog; + LIST_HEAD(cb_list); + + spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + if (atomic_read(&iclog->ic_refcnt)) { + /* Reference holder will re-run iclog callbacks. */ + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); +>>>>>> wake_up_all(&iclog->ic_write_wait); +>>>>>> wake_up_all(&iclog->ic_force_wait); + } while ((iclog = iclog->ic_next) != log->l_iclog); + + wake_up_all(&log->l_flush_wait); + spin_unlock(&log->l_icloglock); + +>>>>>> xlog_cil_process_committed(&cb_list); +} + +This wakes any thread waiting on IO completion of the iclog (in this +case the umount log force) before shutdown processes all the pending +callbacks. That means the xfs_sync_sb() waiting on a sync +transaction in xfs_log_force() on iclog->ic_force_wait will get +woken before the callbacks attached to that iclog are run. This +results in xfs_sync_sb() returning an error, and so unmount unblocks +and continues to run whilst the log shutdown is still in progress. + +Normally this is just fine because the force waiter has nothing to +do with AIL operations. But in the case of this unmount path, the +log force waiter goes on to tear down the AIL because the log is now +shut down and so nothing ever blocks it again from the wait point in +xfs_log_cover(). + +Hence it's a race to see who gets to the AIL first - the unmount +code or xlog_cil_process_committed() killing the superblock buffer. + +To fix this, we just have to change the order of processing in +xlog_state_shutdown_callbacks() to run the callbacks before it wakes +any task waiting on completion of the iclog. + +Reported-by: Brian Foster +Fixes: aad7272a9208 ("xfs: separate out log shutdown callback processing") +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 22 +++++++++++++--------- + 1 file changed, 13 insertions(+), 9 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -487,7 +487,10 @@ out_error: + * Run all the pending iclog callbacks and wake log force waiters and iclog + * space waiters so they can process the newly set shutdown state. We really + * don't care what order we process callbacks here because the log is shut down +- * and so state cannot change on disk anymore. ++ * and so state cannot change on disk anymore. However, we cannot wake waiters ++ * until the callbacks have been processed because we may be in unmount and ++ * we must ensure that all AIL operations the callbacks perform have completed ++ * before we tear down the AIL. + * + * We avoid processing actively referenced iclogs so that we don't run callbacks + * while the iclog owner might still be preparing the iclog for IO submssion. +@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks( + struct xlog_in_core *iclog; + LIST_HEAD(cb_list); + +- spin_lock(&log->l_icloglock); + iclog = log->l_iclog; + do { + if (atomic_read(&iclog->ic_refcnt)) { +@@ -509,14 +511,16 @@ xlog_state_shutdown_callbacks( + continue; + } + list_splice_init(&iclog->ic_callbacks, &cb_list); ++ spin_unlock(&log->l_icloglock); ++ ++ xlog_cil_process_committed(&cb_list); ++ ++ spin_lock(&log->l_icloglock); + wake_up_all(&iclog->ic_write_wait); + wake_up_all(&iclog->ic_force_wait); + } while ((iclog = iclog->ic_next) != log->l_iclog); + + wake_up_all(&log->l_flush_wait); +- spin_unlock(&log->l_icloglock); +- +- xlog_cil_process_committed(&cb_list); + } + + /* +@@ -583,11 +587,8 @@ xlog_state_release_iclog( + * pending iclog callbacks that were waiting on the release of + * this iclog. + */ +- if (last_ref) { +- spin_unlock(&log->l_icloglock); ++ if (last_ref) + xlog_state_shutdown_callbacks(log); +- spin_lock(&log->l_icloglock); +- } + return -EIO; + } + +@@ -3904,7 +3905,10 @@ xlog_force_shutdown( + wake_up_all(&log->l_cilp->xc_start_wait); + wake_up_all(&log->l_cilp->xc_commit_wait); + spin_unlock(&log->l_cilp->xc_push_lock); ++ ++ spin_lock(&log->l_icloglock); + xlog_state_shutdown_callbacks(log); ++ spin_unlock(&log->l_icloglock); + + return log_error; + } -- 2.47.3