5.15-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Fri, 15 Jul 2022 14:34:22 +0000 (16:34 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Fri, 15 Jul 2022 14:34:22 +0000 (16:34 +0200)
added patches:
xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
xfs-drop-async-cache-flushes-from-cil-commits.patch
xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch

queue-5.15/series
queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch [new file with mode: 0644]
queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch [new file with mode: 0644]
queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch [new file with mode: 0644]
queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch [new file with mode: 0644]

diff --git a/queue-5.15/series b/queue-5.15/series
index 8cc9026057c48937cb7a01c294bebe157f62fa4d..673de22c000deeeb436ec8ab498de8e3d88503a7 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -23,3 +23,7 @@ fs-remap-constrain-dedupe-of-eof-blocks.patch
 nilfs2-fix-incorrect-masking-of-permission-flags-for-symlinks.patch
 sh-convert-nommu-io-re-un-map-to-static-inline-functions.patch
 revert-evm-fix-memleak-in-init_desc.patch
+xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
+xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
+xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
+xfs-drop-async-cache-flushes-from-cil-commits.patch
diff --git a/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch b/queue-5.15/xfs-don-t-include-bnobt-blocks-when-reserving-free-block-pool.patch
new file mode 100644
index 0000000..a531a8e
--- /dev/null
@@ -0,0 +1,97 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:40 -0700
+Subject: xfs: don't include bnobt blocks when reserving free block pool
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Brian Foster <bfoster@redhat.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-3-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit c8c568259772751a14e969b7230990508de73d9d ]
+
+xfs_reserve_blocks controls the size of the user-visible free space
+reserve pool.  Given the difference between the current and requested
+pool sizes, it will try to reserve free space from fdblocks.  However,
+the amount requested from fdblocks is also constrained by the amount of
+space that we think xfs_mod_fdblocks will give us.  If we forget to
+subtract m_allocbt_blks before calling xfs_mod_fdblocks, it will
+return ENOSPC and we'll hang the kernel at mount due to the infinite
+loop.
+
+In commit fd43cf600cf6, we decided that xfs_mod_fdblocks should not hand
+out the "free space" used by the free space btrees, because some portion
+of the free space btrees holds space in reserve for future btree
+expansion.  Unfortunately, xfs_reserve_blocks' estimation of the number
+of blocks that it could request from xfs_mod_fdblocks was not updated to
+include m_allocbt_blks, so if space is extremely low, the caller hangs.
+
+Fix this by creating a function to estimate the number of blocks that
+can be reserved from fdblocks, which needs to exclude the set-aside and
+m_allocbt_blks.
+
+Found by running xfs/306 (which formats a single-AG 20MB filesystem)
+with an fstests configuration that specifies a 1k blocksize and a
+specially crafted log size that will consume 7/8 of the space (17920
+blocks, specifically) in that AG.
+
+Cc: Brian Foster <bfoster@redhat.com>
+Fixes: fd43cf600cf6 ("xfs: set aside allocation btree blocks from block reservation")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Brian Foster <bfoster@redhat.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c |    2 +-
+ fs/xfs/xfs_mount.c |    2 +-
+ fs/xfs/xfs_mount.h |   15 +++++++++++++++
+ 3 files changed, 17 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -434,7 +434,7 @@ xfs_reserve_blocks(
+       error = -ENOSPC;
+       do {
+               free = percpu_counter_sum(&mp->m_fdblocks) -
+-                                              mp->m_alloc_set_aside;
++                                              xfs_fdblocks_unavailable(mp);
+               if (free <= 0)
+                       break;
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -1132,7 +1132,7 @@ xfs_mod_fdblocks(
+        * problems (i.e. transaction abort, pagecache discards, etc.) than
+        * slightly premature -ENOSPC.
+        */
+-      set_aside = mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
++      set_aside = xfs_fdblocks_unavailable(mp);
+       percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
+       if (__percpu_counter_compare(&mp->m_fdblocks, set_aside,
+                                    XFS_FDBLOCKS_BATCH) >= 0) {
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -478,6 +478,21 @@ extern void       xfs_unmountfs(xfs_mount_t *)
+  */
+ #define XFS_FDBLOCKS_BATCH    1024
++/*
++ * Estimate the amount of free space that is not available to userspace and is
++ * not explicitly reserved from the incore fdblocks.  This includes:
++ *
++ * - The minimum number of blocks needed to support splitting a bmap btree
++ * - The blocks currently in use by the freespace btrees because they record
++ *   the actual blocks that will fill per-AG metadata space reservations
++ */
++static inline uint64_t
++xfs_fdblocks_unavailable(
++      struct xfs_mount        *mp)
++{
++      return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks);
++}
++
+ extern int    xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
+                                bool reserved);
+ extern int    xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
diff --git a/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch b/queue-5.15/xfs-drop-async-cache-flushes-from-cil-commits.patch
new file mode 100644
index 0000000..d6df38d
--- /dev/null
@@ -0,0 +1,361 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:42 -0700
+Subject: xfs: drop async cache flushes from CIL commits.
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: Dave Chinner <dchinner@redhat.com>, Jan Kara <jack@suse.cz>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-5-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 919edbadebe17a67193533f531c2920c03e40fa4 ]
+
+Jan Kara reported a performance regression in dbench that he
+bisected down to commit bad77c375e8d ("xfs: CIL checkpoint
+flushes caches unconditionally").
+
+Whilst developing the journal flush/fua optimisations this cache flush
+was part of, it appeared to make a significant difference to
+performance. However, now that this patchset has settled and all the
+correctness issues have been fixed, there does not appear to be any
+significant performance benefit to asynchronous cache flushes.
+
+In fact, the opposite is true on some storage types and workloads,
+where additional cache flushes that can occur from fsync heavy
+workloads have measurable and significant impact on overall
+throughput.
+
+Local dbench testing shows little difference on dbench runs with
+sync vs async cache flushes on either fast or slow SSD storage, and
+no difference in streaming concurrent async transaction workloads
+like fs-mark.
+
+Fast NVME storage.
+
+From `dbench -t 30`, CIL scale:
+
+clients                async                   sync
+               BW      Latency         BW      Latency
+1               935.18   0.855          915.64   0.903
+8              2404.51   6.873         2341.77   6.511
+16             3003.42   6.460         2931.57   6.529
+32             3697.23   7.939         3596.28   7.894
+128            7237.43  15.495         7217.74  11.588
+512            5079.24  90.587         5167.08  95.822
+
+fsmark, 32 threads, create w/ 64 byte xattr w/32k logbsize
+
+       create          chown           unlink
+async   1m41s          1m16s           2m03s
+sync   1m40s           1m19s           1m54s
+
+Slower SATA SSD storage:
+
+From `dbench -t 30`, CIL scale:
+
+clients                async                   sync
+               BW      Latency         BW      Latency
+1                78.59  15.792           83.78  10.729
+8               367.88  92.067          404.63  59.943
+16              564.51  72.524          602.71  76.089
+32              831.66 105.984          870.26 110.482
+128            1659.76 102.969         1624.73  91.356
+512            2135.91 223.054         2603.07 161.160
+
+fsmark, 16 threads, create w/32k logbsize
+
+       create          unlink
+async   5m06s          4m15s
+sync   5m00s           4m22s
+
+And on Jan's test machine:
+
+                   5.18-rc8-vanilla       5.18-rc8-patched
+Amean     1        71.22 (   0.00%)       64.94 *   8.81%*
+Amean     2        93.03 (   0.00%)       84.80 *   8.85%*
+Amean     4       150.54 (   0.00%)      137.51 *   8.66%*
+Amean     8       252.53 (   0.00%)      242.24 *   4.08%*
+Amean     16      454.13 (   0.00%)      439.08 *   3.31%*
+Amean     32      835.24 (   0.00%)      829.74 *   0.66%*
+Amean     64     1740.59 (   0.00%)     1686.73 *   3.09%*
+
+Performance and cache flush behaviour are restored to pre-regression
+levels.
+
+As such, we can now consider the async cache flush mechanism an
+unnecessary exercise in premature optimisation and hence we can
+now remove it and the infrastructure it requires completely.
+
+Fixes: bad77c375e8d ("xfs: CIL checkpoint flushes caches unconditionally")
+Reported-and-tested-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bio_io.c   |   35 -----------------------------------
+ fs/xfs/xfs_linux.h    |    2 --
+ fs/xfs/xfs_log.c      |   36 +++++++++++-------------------------
+ fs/xfs/xfs_log_cil.c  |   42 +++++++++++++-----------------------------
+ fs/xfs/xfs_log_priv.h |    3 +--
+ 5 files changed, 25 insertions(+), 93 deletions(-)
+
+--- a/fs/xfs/xfs_bio_io.c
++++ b/fs/xfs/xfs_bio_io.c
+@@ -9,41 +9,6 @@ static inline unsigned int bio_max_vecs(
+       return bio_max_segs(howmany(count, PAGE_SIZE));
+ }
+-static void
+-xfs_flush_bdev_async_endio(
+-      struct bio      *bio)
+-{
+-      complete(bio->bi_private);
+-}
+-
+-/*
+- * Submit a request for an async cache flush to run. If the request queue does
+- * not require flush operations, just skip it altogether. If the caller needs
+- * to wait for the flush completion at a later point in time, they must supply a
+- * valid completion. This will be signalled when the flush completes.  The
+- * caller never sees the bio that is issued here.
+- */
+-void
+-xfs_flush_bdev_async(
+-      struct bio              *bio,
+-      struct block_device     *bdev,
+-      struct completion       *done)
+-{
+-      struct request_queue    *q = bdev->bd_disk->queue;
+-
+-      if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+-              complete(done);
+-              return;
+-      }
+-
+-      bio_init(bio, NULL, 0);
+-      bio_set_dev(bio, bdev);
+-      bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
+-      bio->bi_private = done;
+-      bio->bi_end_io = xfs_flush_bdev_async_endio;
+-
+-      submit_bio(bio);
+-}
+ int
+ xfs_rw_bdev(
+       struct block_device     *bdev,
+--- a/fs/xfs/xfs_linux.h
++++ b/fs/xfs/xfs_linux.h
+@@ -197,8 +197,6 @@ static inline uint64_t howmany_64(uint64
+ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
+               char *data, unsigned int op);
+-void xfs_flush_bdev_async(struct bio *bio, struct block_device *bdev,
+-              struct completion *done);
+ #define ASSERT_ALWAYS(expr)   \
+       (likely(expr) ? (void)0 : assfail(NULL, #expr, __FILE__, __LINE__))
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -527,12 +527,6 @@ xlog_state_shutdown_callbacks(
+  * Flush iclog to disk if this is the last reference to the given iclog and the
+  * it is in the WANT_SYNC state.
+  *
+- * If the caller passes in a non-zero @old_tail_lsn and the current log tail
+- * does not match, there may be metadata on disk that must be persisted before
+- * this iclog is written.  To satisfy that requirement, set the
+- * XLOG_ICL_NEED_FLUSH flag as a condition for writing this iclog with the new
+- * log tail value.
+- *
+  * If XLOG_ICL_NEED_FUA is already set on the iclog, we need to ensure that the
+  * log tail is updated correctly. NEED_FUA indicates that the iclog will be
+  * written to stable storage, and implies that a commit record is contained
+@@ -549,12 +543,10 @@ xlog_state_shutdown_callbacks(
+  * always capture the tail lsn on the iclog on the first NEED_FUA release
+  * regardless of the number of active reference counts on this iclog.
+  */
+-
+ int
+ xlog_state_release_iclog(
+       struct xlog             *log,
+-      struct xlog_in_core     *iclog,
+-      xfs_lsn_t               old_tail_lsn)
++      struct xlog_in_core     *iclog)
+ {
+       xfs_lsn_t               tail_lsn;
+       bool                    last_ref;
+@@ -565,18 +557,14 @@ xlog_state_release_iclog(
+       /*
+        * Grabbing the current log tail needs to be atomic w.r.t. the writing
+        * of the tail LSN into the iclog so we guarantee that the log tail does
+-       * not move between deciding if a cache flush is required and writing
+-       * the LSN into the iclog below.
++       * not move between the first time we know that the iclog needs to be
++       * made stable and when we eventually submit it.
+        */
+-      if (old_tail_lsn || iclog->ic_state == XLOG_STATE_WANT_SYNC) {
++      if ((iclog->ic_state == XLOG_STATE_WANT_SYNC ||
++           (iclog->ic_flags & XLOG_ICL_NEED_FUA)) &&
++          !iclog->ic_header.h_tail_lsn) {
+               tail_lsn = xlog_assign_tail_lsn(log->l_mp);
+-
+-              if (old_tail_lsn && tail_lsn != old_tail_lsn)
+-                      iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
+-
+-              if ((iclog->ic_flags & XLOG_ICL_NEED_FUA) &&
+-                  !iclog->ic_header.h_tail_lsn)
+-                      iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
++              iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+       }
+       last_ref = atomic_dec_and_test(&iclog->ic_refcnt);
+@@ -601,8 +589,6 @@ xlog_state_release_iclog(
+       }
+       iclog->ic_state = XLOG_STATE_SYNCING;
+-      if (!iclog->ic_header.h_tail_lsn)
+-              iclog->ic_header.h_tail_lsn = cpu_to_be64(tail_lsn);
+       xlog_verify_tail_lsn(log, iclog);
+       trace_xlog_iclog_syncing(iclog, _RET_IP_);
+@@ -875,7 +861,7 @@ xlog_force_iclog(
+       iclog->ic_flags |= XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA;
+       if (iclog->ic_state == XLOG_STATE_ACTIVE)
+               xlog_state_switch_iclogs(iclog->ic_log, iclog, 0);
+-      return xlog_state_release_iclog(iclog->ic_log, iclog, 0);
++      return xlog_state_release_iclog(iclog->ic_log, iclog);
+ }
+ /*
+@@ -2413,7 +2399,7 @@ xlog_write_copy_finish(
+               ASSERT(iclog->ic_state == XLOG_STATE_WANT_SYNC ||
+                       xlog_is_shutdown(log));
+ release_iclog:
+-      error = xlog_state_release_iclog(log, iclog, 0);
++      error = xlog_state_release_iclog(log, iclog);
+       spin_unlock(&log->l_icloglock);
+       return error;
+ }
+@@ -2630,7 +2616,7 @@ next_lv:
+       spin_lock(&log->l_icloglock);
+       xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
+-      error = xlog_state_release_iclog(log, iclog, 0);
++      error = xlog_state_release_iclog(log, iclog);
+       spin_unlock(&log->l_icloglock);
+       return error;
+@@ -3054,7 +3040,7 @@ restart:
+                * reference to the iclog.
+                */
+               if (!atomic_add_unless(&iclog->ic_refcnt, -1, 1))
+-                      error = xlog_state_release_iclog(log, iclog, 0);
++                      error = xlog_state_release_iclog(log, iclog);
+               spin_unlock(&log->l_icloglock);
+               if (error)
+                       return error;
+--- a/fs/xfs/xfs_log_cil.c
++++ b/fs/xfs/xfs_log_cil.c
+@@ -681,11 +681,21 @@ xlog_cil_set_ctx_write_state(
+                * The LSN we need to pass to the log items on transaction
+                * commit is the LSN reported by the first log vector write, not
+                * the commit lsn. If we use the commit record lsn then we can
+-               * move the tail beyond the grant write head.
++               * move the grant write head beyond the tail LSN and overwrite
++               * it.
+                */
+               ctx->start_lsn = lsn;
+               wake_up_all(&cil->xc_start_wait);
+               spin_unlock(&cil->xc_push_lock);
++
++              /*
++               * Make sure the metadata we are about to overwrite in the log
++               * has been flushed to stable storage before this iclog is
++               * issued.
++               */
++              spin_lock(&cil->xc_log->l_icloglock);
++              iclog->ic_flags |= XLOG_ICL_NEED_FLUSH;
++              spin_unlock(&cil->xc_log->l_icloglock);
+               return;
+       }
+@@ -864,10 +874,7 @@ xlog_cil_push_work(
+       struct xfs_trans_header thdr;
+       struct xfs_log_iovec    lhdr;
+       struct xfs_log_vec      lvhdr = { NULL };
+-      xfs_lsn_t               preflush_tail_lsn;
+       xfs_csn_t               push_seq;
+-      struct bio              bio;
+-      DECLARE_COMPLETION_ONSTACK(bdev_flush);
+       bool                    push_commit_stable;
+       new_ctx = xlog_cil_ctx_alloc();
+@@ -938,23 +945,6 @@ xlog_cil_push_work(
+       spin_unlock(&cil->xc_push_lock);
+       /*
+-       * The CIL is stable at this point - nothing new will be added to it
+-       * because we hold the flush lock exclusively. Hence we can now issue
+-       * a cache flush to ensure all the completed metadata in the journal we
+-       * are about to overwrite is on stable storage.
+-       *
+-       * Because we are issuing this cache flush before we've written the
+-       * tail lsn to the iclog, we can have metadata IO completions move the
+-       * tail forwards between the completion of this flush and the iclog
+-       * being written. In this case, we need to re-issue the cache flush
+-       * before the iclog write. To detect whether the log tail moves, sample
+-       * the tail LSN *before* we issue the flush.
+-       */
+-      preflush_tail_lsn = atomic64_read(&log->l_tail_lsn);
+-      xfs_flush_bdev_async(&bio, log->l_mp->m_ddev_targp->bt_bdev,
+-                              &bdev_flush);
+-
+-      /*
+        * Pull all the log vectors off the items in the CIL, and remove the
+        * items from the CIL. We don't need the CIL lock here because it's only
+        * needed on the transaction commit side which is currently locked out
+@@ -1030,12 +1020,6 @@ xlog_cil_push_work(
+       lvhdr.lv_iovecp = &lhdr;
+       lvhdr.lv_next = ctx->lv_chain;
+-      /*
+-       * Before we format and submit the first iclog, we have to ensure that
+-       * the metadata writeback ordering cache flush is complete.
+-       */
+-      wait_for_completion(&bdev_flush);
+-
+       error = xlog_cil_write_chain(ctx, &lvhdr);
+       if (error)
+               goto out_abort_free_ticket;
+@@ -1094,7 +1078,7 @@ xlog_cil_push_work(
+       if (push_commit_stable &&
+           ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE)
+               xlog_state_switch_iclogs(log, ctx->commit_iclog, 0);
+-      xlog_state_release_iclog(log, ctx->commit_iclog, preflush_tail_lsn);
++      xlog_state_release_iclog(log, ctx->commit_iclog);
+       /* Not safe to reference ctx now! */
+@@ -1115,7 +1099,7 @@ out_abort_free_ticket:
+               return;
+       }
+       spin_lock(&log->l_icloglock);
+-      xlog_state_release_iclog(log, ctx->commit_iclog, 0);
++      xlog_state_release_iclog(log, ctx->commit_iclog);
+       /* Not safe to reference ctx now! */
+       spin_unlock(&log->l_icloglock);
+ }
+--- a/fs/xfs/xfs_log_priv.h
++++ b/fs/xfs/xfs_log_priv.h
+@@ -524,8 +524,7 @@ void       xfs_log_ticket_regrant(struct xlog
+ void xlog_state_switch_iclogs(struct xlog *log, struct xlog_in_core *iclog,
+               int eventual_size);
+-int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog,
+-              xfs_lsn_t log_tail_lsn);
++int xlog_state_release_iclog(struct xlog *log, struct xlog_in_core *iclog);
+ /*
+  * When we crack an atomic LSN, we sample it first so that the value will not
diff --git a/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch b/queue-5.15/xfs-only-run-cow-extent-recovery-when-there-are-no-live-extents.patch
new file mode 100644
index 0000000..3bf777c
--- /dev/null
@@ -0,0 +1,176 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:39 -0700
+Subject: xfs: only run COW extent recovery when there are no live extents
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandan.babu@oracle.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-2-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 7993f1a431bc5271369d359941485a9340658ac3 ]
+
+As part of multiple customer escalations due to file data corruption
+after copy on write operations, I wrote some fstests that use fsstress
+to hammer on COW to shake things loose.  Regrettably, I caught some
+filesystem shutdowns due to incorrect rmap operations with the following
+loop:
+
+mount <filesystem>                             # (0)
+fsstress <run only readonly ops> &             # (1)
+while true; do
+       fsstress <run all ops>
+       mount -o remount,ro                     # (2)
+       fsstress <run only readonly ops>
+       mount -o remount,rw                     # (3)
+done
+
+When (2) happens, notice that (1) is still running.  xfs_remount_ro will
+call xfs_blockgc_stop to walk the inode cache to free all the COW
+extents, but the blockgc mechanism races with (1)'s reader threads to
+take IOLOCKs and loses, which means that it doesn't clean them all out.
+Call such a file (A).
+
+When (3) happens, xfs_remount_rw calls xfs_reflink_recover_cow, which
+walks the ondisk refcount btree and frees any COW extent that it finds.
+This function does not check the inode cache, which means that the incore
+COW fork of inode (A) is now inconsistent with the ondisk metadata.  If
+one of those former COW extents is allocated and mapped into another
+file (B) and someone triggers a COW to the stale reservation in (A), (A)'s
+dirty data will be written into (B) and once that's done, those blocks
+will be transferred to (A)'s data fork without bumping the refcount.
+
+The results are catastrophic -- file (B) and the refcount btree are now
+corrupt.  In the first patch, we fixed the race condition in (2) so that
+(A) will always flush the COW fork.  In this second patch, we move the
+_recover_cow call to the initial mount call in (0) for safety.
+
+As mentioned previously, xfs_reflink_recover_cow walks the refcount
+btree looking for COW staging extents, and frees them.  This was
+intended to be run at mount time (when we know there are no live inodes)
+to clean up any leftover staging events that may have been left behind
+during an unclean shutdown.  As a time "optimization" for readonly
+mounts, we deferred this to the ro->rw transition, not realizing that
+any failure to clean all COW forks during a rw->ro transition would
+result in catastrophic corruption.
+
+Therefore, remove this optimization and only run the recovery routine
+when we're guaranteed not to have any COW staging extents anywhere,
+which means we always run this at mount time.  While we're at it, move
+the callsite to xfs_log_mount_finish because any refcount btree
+expansion (however unlikely given that we're removing records from the
+right side of the index) must be fed by a per-AG reservation, which
+doesn't exist in its current location.
+
+Fixes: 174edb0e46e5 ("xfs: store in-progress CoW allocations in the refcount btree")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Chandan Babu R <chandan.babu@oracle.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |   24 +++++++++++++++++++++++-
+ fs/xfs/xfs_mount.c       |   10 ----------
+ fs/xfs/xfs_reflink.c     |    5 ++++-
+ fs/xfs/xfs_super.c       |    9 ---------
+ 4 files changed, 27 insertions(+), 21 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -27,7 +27,7 @@
+ #include "xfs_buf_item.h"
+ #include "xfs_ag.h"
+ #include "xfs_quota.h"
+-
++#include "xfs_reflink.h"
+ #define BLK_AVG(blk1, blk2)   ((blk1+blk2) >> 1)
+@@ -3502,6 +3502,28 @@ xlog_recover_finish(
+       xlog_recover_process_iunlinks(log);
+       xlog_recover_check_summary(log);
++
++      /*
++       * Recover any CoW staging blocks that are still referenced by the
++       * ondisk refcount metadata.  During mount there cannot be any live
++       * staging extents as we have not permitted any user modifications.
++       * Therefore, it is safe to free them all right now, even on a
++       * read-only mount.
++       */
++      error = xfs_reflink_recover_cow(log->l_mp);
++      if (error) {
++              xfs_alert(log->l_mp,
++      "Failed to recover leftover CoW staging extents, err %d.",
++                              error);
++              /*
++               * If we get an error here, make sure the log is shut down
++               * but return zero so that any log items committed since the
++               * end of intents processing can be pushed through the CIL
++               * and AIL.
++               */
++              xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
++      }
++
+       return 0;
+ }
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -922,15 +922,6 @@ xfs_mountfs(
+                       xfs_warn(mp,
+       "Unable to allocate reserve blocks. Continuing without reserve pool.");
+-              /* Recover any CoW blocks that never got remapped. */
+-              error = xfs_reflink_recover_cow(mp);
+-              if (error) {
+-                      xfs_err(mp,
+-      "Error %d recovering leftover CoW allocations.", error);
+-                      xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+-                      goto out_quota;
+-              }
+-
+               /* Reserve AG blocks for future btree expansion. */
+               error = xfs_fs_reserve_ag_blocks(mp);
+               if (error && error != -ENOSPC)
+@@ -941,7 +932,6 @@ xfs_mountfs(
+  out_agresv:
+       xfs_fs_unreserve_ag_blocks(mp);
+- out_quota:
+       xfs_qm_unmount_quotas(mp);
+  out_rtunmount:
+       xfs_rtunmount_inodes(mp);
+--- a/fs/xfs/xfs_reflink.c
++++ b/fs/xfs/xfs_reflink.c
+@@ -749,7 +749,10 @@ xfs_reflink_end_cow(
+ }
+ /*
+- * Free leftover CoW reservations that didn't get cleaned out.
++ * Free all CoW staging blocks that are still referenced by the ondisk refcount
++ * metadata.  The ondisk metadata does not track which inode created the
++ * staging extent, so callers must ensure that there are no cached inodes with
++ * live CoW staging extents.
+  */
+ int
+ xfs_reflink_recover_cow(
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1742,15 +1742,6 @@ xfs_remount_rw(
+        */
+       xfs_restore_resvblks(mp);
+       xfs_log_work_queue(mp);
+-
+-      /* Recover any CoW blocks that never got remapped. */
+-      error = xfs_reflink_recover_cow(mp);
+-      if (error) {
+-              xfs_err(mp,
+-                      "Error %d recovering leftover CoW allocations.", error);
+-              xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+-              return error;
+-      }
+       xfs_blockgc_start(mp);
+       /* Create the per-AG metadata reservation pool .*/
diff --git a/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch b/queue-5.15/xfs-run-callbacks-before-waking-waiters-in-xlog_state_shutdown_callbacks.patch
new file mode 100644
index 0000000..e87a0ad
--- /dev/null
@@ -0,0 +1,171 @@
+From foo@baz Fri Jul 15 04:33:42 PM CEST 2022
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Thu, 14 Jul 2022 15:23:41 -0700
+Subject: xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks
+To: stable@vger.kernel.org, linux-xfs@vger.kernel.org
+Cc: Dave Chinner <dchinner@redhat.com>, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20220714222342.4013916-4-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit cd6f79d1fb324968a3bae92f82eeb7d28ca1fd22 ]
+
+Brian reported a null pointer dereference failure during unmount in
+xfs/006. He tracked the problem down to the AIL being torn down
+before a log shutdown had completed and removed all the items from
+the AIL. The failure occurred in this path while unmount was
+proceeding in another task:
+
+ xfs_trans_ail_delete+0x102/0x130 [xfs]
+ xfs_buf_item_done+0x22/0x30 [xfs]
+ xfs_buf_ioend+0x73/0x4d0 [xfs]
+ xfs_trans_committed_bulk+0x17e/0x2f0 [xfs]
+ xlog_cil_committed+0x2a9/0x300 [xfs]
+ xlog_cil_process_committed+0x69/0x80 [xfs]
+ xlog_state_shutdown_callbacks+0xce/0xf0 [xfs]
+ xlog_force_shutdown+0xdf/0x150 [xfs]
+ xfs_do_force_shutdown+0x5f/0x150 [xfs]
+ xlog_ioend_work+0x71/0x80 [xfs]
+ process_one_work+0x1c5/0x390
+ worker_thread+0x30/0x350
+ kthread+0xd7/0x100
+ ret_from_fork+0x1f/0x30
+
+This is processing an EIO error to a log write, and it's
+triggering a force shutdown. This causes the log to be shut down,
+and then it is running attached iclog callbacks from the shutdown
+context. That means the fs and log have already been marked as
+xfs_is_shutdown/xlog_is_shutdown and so high level code will abort
+(e.g. xfs_trans_commit(), xfs_log_force(), etc) with an error
+because of shutdown.
+
+The umount would have been blocked waiting for a log force
+completion inside xfs_log_cover() -> xfs_sync_sb(). For this situation
+to occur, xfs_sync_sb() must exit without waiting for the iclog buffer
+to be committed to disk. The
+above trace is the completion routine for the iclog buffer, and
+it is shutting down the filesystem.
+
+xlog_state_shutdown_callbacks() does this:
+
+{
+        struct xlog_in_core     *iclog;
+        LIST_HEAD(cb_list);
+
+        spin_lock(&log->l_icloglock);
+        iclog = log->l_iclog;
+        do {
+                if (atomic_read(&iclog->ic_refcnt)) {
+                        /* Reference holder will re-run iclog callbacks. */
+                        continue;
+                }
+                list_splice_init(&iclog->ic_callbacks, &cb_list);
+>>>>>>           wake_up_all(&iclog->ic_write_wait);
+>>>>>>           wake_up_all(&iclog->ic_force_wait);
+        } while ((iclog = iclog->ic_next) != log->l_iclog);
+
+        wake_up_all(&log->l_flush_wait);
+        spin_unlock(&log->l_icloglock);
+
+>>>>>>  xlog_cil_process_committed(&cb_list);
+}
+
+This wakes any thread waiting on IO completion of the iclog (in this
+case the umount log force) before shutdown processes all the pending
+callbacks.  That means the xfs_sync_sb() waiting on a sync
+transaction in xfs_log_force() on iclog->ic_force_wait will get
+woken before the callbacks attached to that iclog are run. This
+results in xfs_sync_sb() returning an error, and so unmount unblocks
+and continues to run whilst the log shutdown is still in progress.
+
+Normally this is just fine because the force waiter has nothing to
+do with AIL operations. But in the case of this unmount path, the
+log force waiter goes on to tear down the AIL because the log is now
+shut down and so nothing ever blocks it again from the wait point in
+xfs_log_cover().
+
+Hence it's a race to see who gets to the AIL first - the unmount
+code or xlog_cil_process_committed() killing the superblock buffer.
+
+To fix this, we just have to change the order of processing in
+xlog_state_shutdown_callbacks() to run the callbacks before it wakes
+any task waiting on completion of the iclog.
+
+Reported-by: Brian Foster <bfoster@redhat.com>
+Fixes: aad7272a9208 ("xfs: separate out log shutdown callback processing")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c |   22 +++++++++++++---------
+ 1 file changed, 13 insertions(+), 9 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -487,7 +487,10 @@ out_error:
+  * Run all the pending iclog callbacks and wake log force waiters and iclog
+  * space waiters so they can process the newly set shutdown state. We really
+  * don't care what order we process callbacks here because the log is shut down
+- * and so state cannot change on disk anymore.
++ * and so state cannot change on disk anymore. However, we cannot wake waiters
++ * until the callbacks have been processed because we may be in unmount and
++ * we must ensure that all AIL operations the callbacks perform have completed
++ * before we tear down the AIL.
+  *
+  * We avoid processing actively referenced iclogs so that we don't run callbacks
+  * while the iclog owner might still be preparing the iclog for IO submssion.
+@@ -501,7 +504,6 @@ xlog_state_shutdown_callbacks(
+       struct xlog_in_core     *iclog;
+       LIST_HEAD(cb_list);
+-      spin_lock(&log->l_icloglock);
+       iclog = log->l_iclog;
+       do {
+               if (atomic_read(&iclog->ic_refcnt)) {
+@@ -509,14 +511,16 @@ xlog_state_shutdown_callbacks(
+                       continue;
+               }
+               list_splice_init(&iclog->ic_callbacks, &cb_list);
++              spin_unlock(&log->l_icloglock);
++
++              xlog_cil_process_committed(&cb_list);
++
++              spin_lock(&log->l_icloglock);
+               wake_up_all(&iclog->ic_write_wait);
+               wake_up_all(&iclog->ic_force_wait);
+       } while ((iclog = iclog->ic_next) != log->l_iclog);
+       wake_up_all(&log->l_flush_wait);
+-      spin_unlock(&log->l_icloglock);
+-
+-      xlog_cil_process_committed(&cb_list);
+ }
+ /*
+@@ -583,11 +587,8 @@ xlog_state_release_iclog(
+                * pending iclog callbacks that were waiting on the release of
+                * this iclog.
+                */
+-              if (last_ref) {
+-                      spin_unlock(&log->l_icloglock);
++              if (last_ref)
+                       xlog_state_shutdown_callbacks(log);
+-                      spin_lock(&log->l_icloglock);
+-              }
+               return -EIO;
+       }
+@@ -3904,7 +3905,10 @@ xlog_force_shutdown(
+       wake_up_all(&log->l_cilp->xc_start_wait);
+       wake_up_all(&log->l_cilp->xc_commit_wait);
+       spin_unlock(&log->l_cilp->xc_push_lock);
++
++      spin_lock(&log->l_icloglock);
+       xlog_state_shutdown_callbacks(log);
++      spin_unlock(&log->l_icloglock);
+       return log_error;
+ }