--- /dev/null
+From 7052622fccb1efb850c6b55de477f65d03525a30 Mon Sep 17 00:00:00 2001
+From: Dan Carpenter <dan.carpenter@linaro.org>
+Date: Sat, 14 Sep 2024 12:56:51 +0300
+Subject: netfilter: nft_socket: Fix a NULL vs IS_ERR() bug in nft_socket_cgroup_subtree_level()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+commit 7052622fccb1efb850c6b55de477f65d03525a30 upstream.
+
+The cgroup_get_from_path() function never returns NULL, it returns error
+pointers. Update the error handling to match.
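+
+As a reminder of how the error-pointer convention works, a simplified
+user-space sketch of the helpers from include/linux/err.h (illustration
+only, not the kernel definitions) looks roughly like:
+
+  #define MAX_ERRNO 4095
+  #define ERR_PTR(err)  ((void *)(long)(err))
+  #define PTR_ERR(ptr)  ((long)(ptr))
+  #define IS_ERR(ptr)   ((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
+
+  /* Callers of such functions must check IS_ERR() and report PTR_ERR();
+   * a NULL check can never trigger, which is what this patch corrects.
+   */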
+
+Fixes: 7f3287db6543 ("netfilter: nft_socket: make cgroupsv2 matching work with namespaces")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Acked-by: Florian Westphal <fw@strlen.de>
+Acked-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Link: https://patch.msgid.link/bbc0c4e0-05cc-4f44-8797-2f4b3920a820@stanley.mountain
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nft_socket.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/netfilter/nft_socket.c
++++ b/net/netfilter/nft_socket.c
+@@ -61,8 +61,8 @@ static noinline int nft_socket_cgroup_su
+ struct cgroup *cgrp = cgroup_get_from_path("/");
+ int level;
+
+- if (!cgrp)
+- return -ENOENT;
++ if (IS_ERR(cgrp))
++ return PTR_ERR(cgrp);
+
+ level = cgrp->level;
+
--- /dev/null
+From 7f3287db654395f9c5ddd246325ff7889f550286 Mon Sep 17 00:00:00 2001
+From: Florian Westphal <fw@strlen.de>
+Date: Sat, 7 Sep 2024 16:07:49 +0200
+Subject: netfilter: nft_socket: make cgroupsv2 matching work with namespaces
+
+From: Florian Westphal <fw@strlen.de>
+
+commit 7f3287db654395f9c5ddd246325ff7889f550286 upstream.
+
+When running in a container environment, /sys/fs/cgroup/ might not be
+the real root node of the sk-attached cgroup.
+
+Example:
+
+In container:
+% stat /sys//fs/cgroup/
+Device: 0,21 Inode: 2214 ..
+% stat /sys/fs/cgroup/foo
+Device: 0,21 Inode: 2264 ..
+
+The expectation would be for:
+
+ nft add rule .. socket cgroupv2 level 1 "foo" counter
+
+to match traffic from a process that got added to "foo" via
+"echo $pid > /sys/fs/cgroup/foo/cgroup.procs".
+
+However, 'level 3' is needed to make this work.
+
+Seen from initial namespace, the complete hierarchy is:
+
+% stat /sys/fs/cgroup/system.slice/docker-.../foo
+ Device: 0,21 Inode: 2264 ..
+
+i.e. hierarchy is
+0 1 2 3
+/ -> system.slice -> docker-1... -> foo
+
+... but the container doesn't know that its "/" is the "docker-1.."
+cgroup. The current code will retrieve the 'system.slice' cgroup node
+and store its kn->id in the destination register, so the comparison with
+2264 (the "foo" cgroup id) will not match.
+
+Fetch "/" cgroup from ->init() and add its level to the level we try to
+extract. cgroup root-level is 0 for the init-namespace or the level
+of the ancestor that is exposed as the cgroup root inside the container.
+
+In the above case, cgrp->level of "/" resolved in the container is 2
+(docker-1...scope/) and request for 'level 1' will get adjusted
+to fetch the actual level (3).
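+
+A tiny arithmetic sketch of the adjustment (illustration only; the
+variable names mirror the patch, the numbers are from the example above):
+
+  unsigned int level_user = 1;  /* level requested inside the container */
+  int subtree_level = 2;        /* cgrp->level of "/" in the container  */
+  unsigned int level = level_user + subtree_level;  /* 3, i.e. "foo"    */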
+
+v2: use CONFIG_SOCK_CGROUP_DATA, eval function depends on it.
+ (kernel test robot)
+
+Cc: cgroups@vger.kernel.org
+Fixes: e0bb96db96f8 ("netfilter: nft_socket: add support for cgroupsv2")
+Reported-by: Nadia Pinaeva <n.m.pinaeva@gmail.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netfilter/nft_socket.c | 41 ++++++++++++++++++++++++++++++++++++++---
+ 1 file changed, 38 insertions(+), 3 deletions(-)
+
+--- a/net/netfilter/nft_socket.c
++++ b/net/netfilter/nft_socket.c
+@@ -9,7 +9,8 @@
+
+ struct nft_socket {
+ enum nft_socket_keys key:8;
+- u8 level;
++ u8 level; /* cgroupv2 level to extract */
++ u8 level_user; /* cgroupv2 level provided by userspace */
+ u8 len;
+ union {
+ u8 dreg;
+@@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, st
+ memcpy(dest, &cgid, sizeof(u64));
+ return true;
+ }
++
++/* process context only, uses current->nsproxy. */
++static noinline int nft_socket_cgroup_subtree_level(void)
++{
++ struct cgroup *cgrp = cgroup_get_from_path("/");
++ int level;
++
++ if (!cgrp)
++ return -ENOENT;
++
++ level = cgrp->level;
++
++ cgroup_put(cgrp);
++
++ if (WARN_ON_ONCE(level > 255))
++ return -ERANGE;
++
++ if (WARN_ON_ONCE(level < 0))
++ return -EINVAL;
++
++ return level;
++}
+ #endif
+
+ static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt)
+@@ -174,9 +197,10 @@ static int nft_socket_init(const struct
+ case NFT_SOCKET_MARK:
+ len = sizeof(u32);
+ break;
+-#ifdef CONFIG_CGROUPS
++#ifdef CONFIG_SOCK_CGROUP_DATA
+ case NFT_SOCKET_CGROUPV2: {
+ unsigned int level;
++ int err;
+
+ if (!tb[NFTA_SOCKET_LEVEL])
+ return -EINVAL;
+@@ -185,6 +209,17 @@ static int nft_socket_init(const struct
+ if (level > 255)
+ return -EOPNOTSUPP;
+
++ err = nft_socket_cgroup_subtree_level();
++ if (err < 0)
++ return err;
++
++ priv->level_user = level;
++
++ level += err;
++ /* Implies a giant cgroup tree */
++ if (WARN_ON_ONCE(level > 255))
++ return -EOPNOTSUPP;
++
+ priv->level = level;
+ len = sizeof(u64);
+ break;
+@@ -209,7 +244,7 @@ static int nft_socket_dump(struct sk_buf
+ if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg))
+ return -1;
+ if (priv->key == NFT_SOCKET_CGROUPV2 &&
+- nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level)))
++ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user)))
+ return -1;
+ return 0;
+ }
spi-spidev-add-missing-spi_device_id-for-jg10309-01.patch
ocfs2-add-bounds-checking-to-ocfs2_xattr_find_entry.patch
ocfs2-strict-bound-check-before-memcmp-in-ocfs2_xatt.patch
+xfs-dquot-shrinker-doesn-t-check-for-xfs_dqflag_freeing.patch
+xfs-fix-deadlock-on-xfs_inodegc_worker.patch
+xfs-fix-extent-busy-updating.patch
+xfs-don-t-use-bmbt-btree-split-workers-for-io-completion.patch
+xfs-fix-low-space-alloc-deadlock.patch
+xfs-prefer-free-inodes-at-enospc-over-chunk-allocation.patch
+xfs-block-reservation-too-large-for-minleft-allocation.patch
+xfs-fix-uninitialized-variable-access.patch
+xfs-quotacheck-failure-can-race-with-background-inode-inactivation.patch
+xfs-fix-bug_on-in-xfs_getbmap.patch
+xfs-buffer-pins-need-to-hold-a-buffer-reference.patch
+xfs-defered-work-could-create-precommits.patch
+xfs-fix-agf-vs-inode-cluster-buffer-deadlock.patch
+xfs-collect-errors-from-inodegc-for-unlinked-inode-recovery.patch
+xfs-fix-ag-count-overflow-during-growfs.patch
+xfs-remove-warn-when-dquot-cache-insertion-fails.patch
+xfs-fix-the-calculation-for-end-and-length.patch
+xfs-load-uncached-unlinked-inodes-into-memory-on-demand.patch
+xfs-fix-negative-array-access-in-xfs_getbmap.patch
+xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch
+xfs-correct-calculation-for-agend-and-blockcount.patch
+xfs-use-i_prev_unlinked-to-distinguish-inodes-that-are-not-on-the-unlinked-list.patch
+xfs-reload-entire-unlinked-bucket-lists.patch
+xfs-make-inode-unlinked-bucket-recovery-work-with-quotacheck.patch
+xfs-fix-reloading-entire-unlinked-bucket-lists.patch
+xfs-set-bnobt-cntbt-numrecs-correctly-when-formatting-new-ags.patch
+xfs-journal-geometry-is-not-properly-bounds-checked.patch
+netfilter-nft_socket-make-cgroupsv2-matching-work-with-namespaces.patch
+netfilter-nft_socket-fix-a-null-vs-is_err-bug-in-nft_socket_cgroup_subtree_level.patch
--- /dev/null
+From stable+bounces-77022-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:43 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:32 -0700
+Subject: xfs: block reservation too large for minleft allocation
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Allison Henderson <allison.henderson@oracle.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-8-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit d5753847b216db0e553e8065aa825cfe497ad143 ]
+
+When we enter xfs_bmbt_alloc_block() without having first allocated
+a data extent (i.e. tp->t_firstblock == NULLFSBLOCK) because we
+are doing something like unwritten extent conversion, the transaction
+block reservation is used as the minleft value.
+
+This works for operations like unwritten extent conversion, but it
+assumes that the block reservation is only for a BMBT split. This is
+not always true, and sometimes results in larger than necessary
+minleft values being set. We only actually need enough space for a
+btree split, something we already handle correctly in
+xfs_bmapi_write() via the xfs_bmapi_minleft() calculation.
+
+We should use xfs_bmapi_minleft() in xfs_bmbt_alloc_block() to
+calculate the number of blocks a BMBT split on this inode is going to
+require, not use the transaction block reservation that contains the
+maximum number of blocks this transaction may consume in it...
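+
+For context, minleft tells the allocator to succeed only if at least that
+many blocks would remain free in the AG after the allocation. A rough
+sketch of the check (illustration only, not the real allocator code):
+
+  /* allocate 'len' blocks only if 'minleft' blocks stay free afterwards */
+  if (ag_free_blocks - len < minleft)
+          return -ENOSPC;       /* skip this AG / fail the allocation */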
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c | 2 +-
+ fs/xfs/libxfs/xfs_bmap.h | 2 ++
+ fs/xfs/libxfs/xfs_bmap_btree.c | 19 +++++++++----------
+ 3 files changed, 12 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4242,7 +4242,7 @@ xfs_bmapi_convert_unwritten(
+ return 0;
+ }
+
+-static inline xfs_extlen_t
++xfs_extlen_t
+ xfs_bmapi_minleft(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+--- a/fs/xfs/libxfs/xfs_bmap.h
++++ b/fs/xfs/libxfs/xfs_bmap.h
+@@ -220,6 +220,8 @@ int xfs_bmap_add_extent_unwritten_real(s
+ struct xfs_inode *ip, int whichfork,
+ struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp,
+ struct xfs_bmbt_irec *new, int *logflagsp);
++xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip,
++ int fork);
+
+ enum xfs_bmap_intent_type {
+ XFS_BMAP_MAP = 1,
+--- a/fs/xfs/libxfs/xfs_bmap_btree.c
++++ b/fs/xfs/libxfs/xfs_bmap_btree.c
+@@ -213,18 +213,16 @@ xfs_bmbt_alloc_block(
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+ args.type = XFS_ALLOCTYPE_START_BNO;
++
+ /*
+- * Make sure there is sufficient room left in the AG to
+- * complete a full tree split for an extent insert. If
+- * we are converting the middle part of an extent then
+- * we may need space for two tree splits.
+- *
+- * We are relying on the caller to make the correct block
+- * reservation for this operation to succeed. If the
+- * reservation amount is insufficient then we may fail a
+- * block allocation here and corrupt the filesystem.
++ * If we are coming here from something like unwritten extent
++ * conversion, there has been no data extent allocation already
++ * done, so we have to ensure that we attempt to locate the
++ * entire set of bmbt allocations in the same AG, as
++ * xfs_bmapi_write() would have reserved.
+ */
+- args.minleft = args.tp->t_blk_res;
++ args.minleft = xfs_bmapi_minleft(cur->bc_tp, cur->bc_ino.ip,
++ cur->bc_ino.whichfork);
+ } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+@@ -248,6 +246,7 @@ xfs_bmbt_alloc_block(
+ * successful activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
++ args.minleft = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ error = xfs_alloc_vextent(&args);
+ if (error)
--- /dev/null
+From stable+bounces-77026-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:53 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:36 -0700
+Subject: xfs: buffer pins need to hold a buffer reference
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, yangerkun <yangerkun@huawei.com>, "Darrick J. Wong" <djwong@kernel.org>, Christoph Hellwig <hch@lst.de>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-12-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 89a4bf0dc3857569a77061d3d5ea2ac85f7e13c6 ]
+
+When a buffer is unpinned by xfs_buf_item_unpin(), we need to access
+the buffer after we've dropped the buffer log item reference count.
+This opens a window where we can have two racing unpins for the
+buffer item (e.g. shutdown checkpoint context callback processing
+racing with journal IO iclog completion processing) and both attempt
+to access the buffer after dropping the BLI reference count. If we
+are unlucky, the "BLI freed" context wins the race and frees the
+buffer before the "BLI still active" case checks the buffer pin
+count.
+
+This results in a use after free that can only be triggered
+in active filesystem shutdown situations.
+
+To fix this, we need to ensure that buffer existence extends beyond
+the BLI reference count checks and until the unpin processing is
+complete. This implies that a buffer pin operation must also take a
+buffer reference to ensure that the buffer cannot be freed until the
+buffer unpin processing is complete.
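+
+A minimal sketch of the life cycle rule being applied here (user-space
+illustration only, not the XFS code): every pin takes its own object
+reference, and unpin drops that reference only after it has finished
+touching the object.
+
+  struct obj { int refs; };
+
+  static void obj_pin(struct obj *o)
+  {
+          o->refs++;            /* a reference is taken per pin ...     */
+  }
+
+  static void obj_unpin(struct obj *o)
+  {
+          /* ... so 'o' stays valid for all of the unpin processing ... */
+          if (--o->refs == 0)
+                  ;             /* ... and is only torn down right here */
+  }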
+
+Reported-by: yangerkun <yangerkun@huawei.com>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 88 ++++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 65 insertions(+), 23 deletions(-)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -452,10 +452,18 @@ xfs_buf_item_format(
+ * This is called to pin the buffer associated with the buf log item in memory
+ * so it cannot be written out.
+ *
+- * We also always take a reference to the buffer log item here so that the bli
+- * is held while the item is pinned in memory. This means that we can
+- * unconditionally drop the reference count a transaction holds when the
+- * transaction is completed.
++ * We take a reference to the buffer log item here so that the BLI life cycle
++ * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and
++ * inserted into the AIL.
++ *
++ * We also need to take a reference to the buffer itself as the BLI unpin
++ * processing requires accessing the buffer after the BLI has dropped the final
++ * BLI reference. See xfs_buf_item_unpin() for an explanation.
++ * If unpins race to drop the final BLI reference and only the
++ * BLI owns a reference to the buffer, then the loser of the race can have the
++ * buffer freed from under it (e.g. on shutdown). Taking a buffer reference per
++ * pin count ensures the life cycle of the buffer extends for as
++ * long as we hold the buffer pin reference in xfs_buf_item_unpin().
+ */
+ STATIC void
+ xfs_buf_item_pin(
+@@ -470,13 +478,30 @@ xfs_buf_item_pin(
+
+ trace_xfs_buf_item_pin(bip);
+
++ xfs_buf_hold(bip->bli_buf);
+ atomic_inc(&bip->bli_refcount);
+ atomic_inc(&bip->bli_buf->b_pin_count);
+ }
+
+ /*
+- * This is called to unpin the buffer associated with the buf log item which
+- * was previously pinned with a call to xfs_buf_item_pin().
++ * This is called to unpin the buffer associated with the buf log item which was
++ * previously pinned with a call to xfs_buf_item_pin(). We enter this function
++ * with a buffer pin count, a buffer reference and a BLI reference.
++ *
++ * We must drop the BLI reference before we unpin the buffer because the AIL
++ * doesn't acquire a BLI reference whenever it accesses it. Therefore if the
++ * refcount drops to zero, the bli could still be AIL resident and the buffer
++ * submitted for I/O at any point before we return. This can result in IO
++ * completion freeing the buffer while we are still trying to access it here.
++ * This race condition can also occur in shutdown situations where we abort and
++ * unpin buffers from contexts other than journal IO completion.
++ *
++ * Hence we have to hold a buffer reference per pin count to ensure that the
++ * buffer cannot be freed until we have finished processing the unpin operation.
++ * The reference is taken in xfs_buf_item_pin(), and we must hold it until we
++ * are done processing the buffer state. In the case of an abort (remove =
++ * true) then we re-use the current pin reference as the IO reference we hand
++ * off to IO failure handling.
+ */
+ STATIC void
+ xfs_buf_item_unpin(
+@@ -493,24 +518,18 @@ xfs_buf_item_unpin(
+
+ trace_xfs_buf_item_unpin(bip);
+
+- /*
+- * Drop the bli ref associated with the pin and grab the hold required
+- * for the I/O simulation failure in the abort case. We have to do this
+- * before the pin count drops because the AIL doesn't acquire a bli
+- * reference. Therefore if the refcount drops to zero, the bli could
+- * still be AIL resident and the buffer submitted for I/O (and freed on
+- * completion) at any point before we return. This can be removed once
+- * the AIL properly holds a reference on the bli.
+- */
+ freed = atomic_dec_and_test(&bip->bli_refcount);
+- if (freed && !stale && remove)
+- xfs_buf_hold(bp);
+ if (atomic_dec_and_test(&bp->b_pin_count))
+ wake_up_all(&bp->b_waiters);
+
+- /* nothing to do but drop the pin count if the bli is active */
+- if (!freed)
++ /*
++ * Nothing to do but drop the buffer pin reference if the BLI is
++ * still active.
++ */
++ if (!freed) {
++ xfs_buf_rele(bp);
+ return;
++ }
+
+ if (stale) {
+ ASSERT(bip->bli_flags & XFS_BLI_STALE);
+@@ -523,6 +542,15 @@ xfs_buf_item_unpin(
+ trace_xfs_buf_item_unpin_stale(bip);
+
+ /*
++ * The buffer has been locked and referenced since it was marked
++ * stale so we own both lock and reference exclusively here. We
++ * do not need the pin reference any more, so drop it now so
++ * that we only have one reference to drop once item completion
++ * processing is complete.
++ */
++ xfs_buf_rele(bp);
++
++ /*
+ * If we get called here because of an IO error, we may or may
+ * not have the item on the AIL. xfs_trans_ail_delete() will
+ * take care of that situation. xfs_trans_ail_delete() drops
+@@ -538,16 +566,30 @@ xfs_buf_item_unpin(
+ ASSERT(bp->b_log_item == NULL);
+ }
+ xfs_buf_relse(bp);
+- } else if (remove) {
++ return;
++ }
++
++ if (remove) {
+ /*
+- * The buffer must be locked and held by the caller to simulate
+- * an async I/O failure. We acquired the hold for this case
+- * before the buffer was unpinned.
++ * We need to simulate an async IO failure here to ensure that
++ * the correct error completion is run on this buffer. This
++ * requires a reference to the buffer and for the buffer to be
++ * locked. We can safely pass ownership of the pin reference to
++ * the IO to ensure that nothing can free the buffer while we
++ * wait for the lock and then run the IO failure completion.
+ */
+ xfs_buf_lock(bp);
+ bp->b_flags |= XBF_ASYNC;
+ xfs_buf_ioend_fail(bp);
++ return;
+ }
++
++ /*
++ * BLI has no more active references - it will be moved to the AIL to
++ * manage the remaining BLI/buffer life cycle. There is nothing left for
++ * us to do here so drop the pin reference to the buffer.
++ */
++ xfs_buf_rele(bp);
+ }
+
+ STATIC uint
--- /dev/null
+From stable+bounces-77029-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:01 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:39 -0700
+Subject: xfs: collect errors from inodegc for unlinked inode recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-15-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit d4d12c02bf5f768f1b423c7ae2909c5afdfe0d5f ]
+
+Unlinked list recovery requires that errors removing the inode from
+the unlinked list get fed back to the main recovery loop. Now that
+we offload the unlinking to the inodegc work, we don't get errors
+being fed back when we trip over a corruption that prevents the
+inode from being removed from the unlinked list.
+
+This means we never clear the corrupt unlinked list bucket,
+resulting in runtime operations eventually tripping over it and
+shutting down.
+
+Fix this by collecting inodegc worker errors and feeding them
+back to the flush caller. This is largely best effort - the only
+context that really cares is log recovery, and it only flushes a
+single inode at a time so we don't need complex synchronised
+handling. Essentially the inodegc workers will capture the first
+error that occurs and the next flush will gather them and clear
+them. The flush itself will only report the first gathered error.
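+
+A minimal sketch of that "capture first error, gather and clear on flush"
+pattern (user-space illustration only; the names are made up):
+
+  struct gcworker { int error; };
+
+  static int flush_gather(struct gcworker *w, int nr)
+  {
+          int i, error = 0;
+
+          for (i = 0; i < nr; i++) {
+                  if (w[i].error && !error)
+                          error = w[i].error;  /* first error wins       */
+                  w[i].error = 0;              /* cleared for next flush */
+          }
+          return error;
+  }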
+
+In the cases where callers can return errors, propagate the
+collected inodegc flush error up the error handling chain.
+
+In the case of inode unlinked list recovery, there are several
+superfluous calls to flush queued unlinked inodes -
+xlog_recover_iunlink_bucket() guarantees that it has flushed the
+inodegc and collected errors before it returns. Hence nothing in the
+calling path needs to run a flush, even when an error is returned.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c | 46 +++++++++++++++++++++++++++++++++++++---------
+ fs/xfs/xfs_icache.h | 4 ++--
+ fs/xfs/xfs_inode.c | 20 ++++++--------------
+ fs/xfs/xfs_inode.h | 2 +-
+ fs/xfs/xfs_log_recover.c | 19 +++++++++----------
+ fs/xfs/xfs_mount.h | 1 +
+ fs/xfs/xfs_super.c | 1 +
+ fs/xfs/xfs_trans.c | 4 +++-
+ 8 files changed, 60 insertions(+), 37 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -454,6 +454,27 @@ xfs_inodegc_queue_all(
+ return ret;
+ }
+
++/* Wait for all queued work and collect errors */
++static int
++xfs_inodegc_wait_all(
++ struct xfs_mount *mp)
++{
++ int cpu;
++ int error = 0;
++
++ flush_workqueue(mp->m_inodegc_wq);
++ for_each_online_cpu(cpu) {
++ struct xfs_inodegc *gc;
++
++ gc = per_cpu_ptr(mp->m_inodegc, cpu);
++ if (gc->error && !error)
++ error = gc->error;
++ gc->error = 0;
++ }
++
++ return error;
++}
++
+ /*
+ * Check the validity of the inode we just found it the cache
+ */
+@@ -1490,15 +1511,14 @@ xfs_blockgc_free_space(
+ if (error)
+ return error;
+
+- xfs_inodegc_flush(mp);
+- return 0;
++ return xfs_inodegc_flush(mp);
+ }
+
+ /*
+ * Reclaim all the free space that we can by scheduling the background blockgc
+ * and inodegc workers immediately and waiting for them all to clear.
+ */
+-void
++int
+ xfs_blockgc_flush_all(
+ struct xfs_mount *mp)
+ {
+@@ -1519,7 +1539,7 @@ xfs_blockgc_flush_all(
+ for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
+ flush_delayed_work(&pag->pag_blockgc_work);
+
+- xfs_inodegc_flush(mp);
++ return xfs_inodegc_flush(mp);
+ }
+
+ /*
+@@ -1841,13 +1861,17 @@ xfs_inodegc_set_reclaimable(
+ * This is the last chance to make changes to an otherwise unreferenced file
+ * before incore reclamation happens.
+ */
+-static void
++static int
+ xfs_inodegc_inactivate(
+ struct xfs_inode *ip)
+ {
++ int error;
++
+ trace_xfs_inode_inactivating(ip);
+- xfs_inactive(ip);
++ error = xfs_inactive(ip);
+ xfs_inodegc_set_reclaimable(ip);
++ return error;
++
+ }
+
+ void
+@@ -1879,8 +1903,12 @@ xfs_inodegc_worker(
+
+ WRITE_ONCE(gc->shrinker_hits, 0);
+ llist_for_each_entry_safe(ip, n, node, i_gclist) {
++ int error;
++
+ xfs_iflags_set(ip, XFS_INACTIVATING);
+- xfs_inodegc_inactivate(ip);
++ error = xfs_inodegc_inactivate(ip);
++ if (error && !gc->error)
++ gc->error = error;
+ }
+
+ memalloc_nofs_restore(nofs_flag);
+@@ -1904,13 +1932,13 @@ xfs_inodegc_push(
+ * Force all currently queued inode inactivation work to run immediately and
+ * wait for the work to finish.
+ */
+-void
++int
+ xfs_inodegc_flush(
+ struct xfs_mount *mp)
+ {
+ xfs_inodegc_push(mp);
+ trace_xfs_inodegc_flush(mp, __return_address);
+- flush_workqueue(mp->m_inodegc_wq);
++ return xfs_inodegc_wait_all(mp);
+ }
+
+ /*
+--- a/fs/xfs/xfs_icache.h
++++ b/fs/xfs/xfs_icache.h
+@@ -59,7 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_m
+ unsigned int iwalk_flags);
+ int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags);
+ int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm);
+-void xfs_blockgc_flush_all(struct xfs_mount *mp);
++int xfs_blockgc_flush_all(struct xfs_mount *mp);
+
+ void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+ void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+@@ -77,7 +77,7 @@ void xfs_blockgc_start(struct xfs_mount
+
+ void xfs_inodegc_worker(struct work_struct *work);
+ void xfs_inodegc_push(struct xfs_mount *mp);
+-void xfs_inodegc_flush(struct xfs_mount *mp);
++int xfs_inodegc_flush(struct xfs_mount *mp);
+ void xfs_inodegc_stop(struct xfs_mount *mp);
+ void xfs_inodegc_start(struct xfs_mount *mp);
+ void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1620,16 +1620,7 @@ xfs_inactive_ifree(
+ */
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
+
+- /*
+- * Just ignore errors at this point. There is nothing we can do except
+- * to try to keep going. Make sure it's not a silent error.
+- */
+- error = xfs_trans_commit(tp);
+- if (error)
+- xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
+- __func__, error);
+-
+- return 0;
++ return xfs_trans_commit(tp);
+ }
+
+ /*
+@@ -1696,12 +1687,12 @@ xfs_inode_needs_inactive(
+ * now be truncated. Also, we clear all of the read-ahead state
+ * kept for the inode here since the file is now closed.
+ */
+-void
++int
+ xfs_inactive(
+ xfs_inode_t *ip)
+ {
+ struct xfs_mount *mp;
+- int error;
++ int error = 0;
+ int truncate = 0;
+
+ /*
+@@ -1742,7 +1733,7 @@ xfs_inactive(
+ * reference to the inode at this point anyways.
+ */
+ if (xfs_can_free_eofblocks(ip, true))
+- xfs_free_eofblocks(ip);
++ error = xfs_free_eofblocks(ip);
+
+ goto out;
+ }
+@@ -1779,7 +1770,7 @@ xfs_inactive(
+ /*
+ * Free the inode.
+ */
+- xfs_inactive_ifree(ip);
++ error = xfs_inactive_ifree(ip);
+
+ out:
+ /*
+@@ -1787,6 +1778,7 @@ out:
+ * the attached dquots.
+ */
+ xfs_qm_dqdetach(ip);
++ return error;
+ }
+
+ /*
+--- a/fs/xfs/xfs_inode.h
++++ b/fs/xfs/xfs_inode.h
+@@ -470,7 +470,7 @@ enum layout_break_reason {
+ (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID))
+
+ int xfs_release(struct xfs_inode *ip);
+-void xfs_inactive(struct xfs_inode *ip);
++int xfs_inactive(struct xfs_inode *ip);
+ int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name,
+ struct xfs_inode **ipp, struct xfs_name *ci_name);
+ int xfs_create(struct user_namespace *mnt_userns,
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket(
+ * just to flush the inodegc queue and wait for it to
+ * complete.
+ */
+- xfs_inodegc_flush(mp);
++ error = xfs_inodegc_flush(mp);
++ if (error)
++ break;
+ }
+
+ prev_agino = agino;
+@@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket(
+ }
+
+ if (prev_ip) {
++ int error2;
++
+ ip->i_prev_unlinked = prev_agino;
+ xfs_irele(prev_ip);
++
++ error2 = xfs_inodegc_flush(mp);
++ if (error2 && !error)
++ return error2;
+ }
+- xfs_inodegc_flush(mp);
+ return error;
+ }
+
+@@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag(
+ * bucket and remaining inodes on it unreferenced and
+ * unfreeable.
+ */
+- xfs_inodegc_flush(pag->pag_mount);
+ xlog_recover_clear_agi_bucket(pag, bucket);
+ }
+ }
+@@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks(
+
+ for_each_perag(log->l_mp, agno, pag)
+ xlog_recover_iunlink_ag(pag);
+-
+- /*
+- * Flush the pending unlinked inodes to ensure that the inactivations
+- * are fully completed on disk and the incore inodes can be reclaimed
+- * before we signal that recovery is complete.
+- */
+- xfs_inodegc_flush(log->l_mp);
+ }
+
+ STATIC void
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -62,6 +62,7 @@ struct xfs_error_cfg {
+ struct xfs_inodegc {
+ struct llist_head list;
+ struct delayed_work work;
++ int error;
+
+ /* approximate count of inodes in the list */
+ unsigned int items;
+--- a/fs/xfs/xfs_super.c
++++ b/fs/xfs/xfs_super.c
+@@ -1089,6 +1089,7 @@ xfs_inodegc_init_percpu(
+ #endif
+ init_llist_head(&gc->list);
+ gc->items = 0;
++ gc->error = 0;
+ INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker);
+ }
+ return 0;
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -290,7 +290,9 @@ retry:
+ * Do not perform a synchronous scan because callers can hold
+ * other locks.
+ */
+- xfs_blockgc_flush_all(mp);
++ error = xfs_blockgc_flush_all(mp);
++ if (error)
++ return error;
+ want_retry = false;
+ goto retry;
+ }
--- /dev/null
+From stable+bounces-77036-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:21 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:46 -0700
+Subject: xfs: correct calculation for agend and blockcount
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Shiyang Ruan <ruansy.fnst@fujitsu.com>, "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandanbabu@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240924183851.1901667-22-leah.rumancik@gmail.com>
+
+From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
+
+[ Upstream commit 3c90c01e49342b166e5c90ec2c85b220be15a20e ]
+
+The agend should be "start + length - 1", and blockcount should be
+"end + 1 - start". Correct these two calculation mistakes.
+
+Also, rename "agend" to "range_agend" because it's not the end of the AG
+per se; it's the end of the dead region within an AG's agblock space.
+
+Fixes: 5cf32f63b0f4 ("xfs: fix the calculation for "end" and "length"")
+Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_notify_failure.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_notify_failure.c
++++ b/fs/xfs/xfs_notify_failure.c
+@@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure(
+ struct xfs_rmap_irec ri_low = { };
+ struct xfs_rmap_irec ri_high;
+ struct xfs_agf *agf;
+- xfs_agblock_t agend;
+ struct xfs_perag *pag;
++ xfs_agblock_t range_agend;
+
+ pag = xfs_perag_get(mp, agno);
+ error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp);
+@@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure(
+ ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno);
+
+ agf = agf_bp->b_addr;
+- agend = min(be32_to_cpu(agf->agf_length),
++ range_agend = min(be32_to_cpu(agf->agf_length) - 1,
+ ri_high.rm_startblock);
+ notify.startblock = ri_low.rm_startblock;
+- notify.blockcount = agend - ri_low.rm_startblock;
++ notify.blockcount = range_agend + 1 - ri_low.rm_startblock;
+
+ error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
+ xfs_dax_failure_fn, ¬ify);
--- /dev/null
+From stable+bounces-77027-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:58 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:37 -0700
+Subject: xfs: defered work could create precommits
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, "Darrick J. Wong" <djwong@kernel.org>, Christoph Hellwig <hch@lst.de>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-13-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit cb042117488dbf0b3b38b05771639890fada9a52 ]
+
+To fix an AGI-AGF-inode cluster buffer deadlock, we need to move
+inode cluster buffer operations to the ->iop_precommit() method.
+However, this means that deferred operations can require precommits
+to be run on the final transaction that the deferred ops pass back
+to xfs_trans_commit() context. This will be exposed by attribute
+handling, in that the last changes to the inode in the attr set
+state machine "disappear" because the precommit operation is not run.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_trans.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/xfs/xfs_trans.c
++++ b/fs/xfs/xfs_trans.c
+@@ -970,6 +970,11 @@ __xfs_trans_commit(
+ error = xfs_defer_finish_noroll(&tp);
+ if (error)
+ goto out_unreserve;
++
++ /* Run precommits from final tx in defer chain. */
++ error = xfs_trans_run_precommits(tp);
++ if (error)
++ goto out_unreserve;
+ }
+
+ /*
--- /dev/null
+From stable+bounces-77019-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:25 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:29 -0700
+Subject: xfs: don't use BMBT btree split workers for IO completion
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-5-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit c85007e2e3942da1f9361e4b5a9388ea3a8dcc5b ]
+
+When we split a BMBT due to record insertion, we offload it to a
+worker thread because we can be deep in the stack when we try to
+allocate a new block for the BMBT. Allocation can use several
+kilobytes of stack (full memory reclaim, swap and/or IO path can
+end up on the stack during allocation) and we can already be several
+kilobytes deep in the stack when we need to split the BMBT.
+
+A recent workload demonstrated a deadlock in this BMBT split
+offload. It requires several things to happen at once:
+
+1. two inodes need a BMBT split at the same time, one must be
+unwritten extent conversion from IO completion, the other must be
+from extent allocation.
+
+2. there must be no xfs_alloc_wq worker threads available in the
+worker pool.
+
+3. There must be sustained severe memory shortages such that new
+kworker threads cannot be allocated to the xfs_alloc_wq pool for
+both threads that need split work to be run
+
+4. The split work from the unwritten extent conversion must run
+first.
+
+5. when the BMBT block allocation runs from the split work, it must
+loop over all AGs and not be able to either trylock an AGF
+successfully, or each AGF it is able to lock has no space available
+for a single block allocation.
+
+6. The BMBT allocation must then attempt to lock the AGF that the
+second task queued to the rescuer thread already has locked before
+it finds an AGF it can allocate from.
+
+At this point, we have an ABBA deadlock between tasks queued on the
+xfs_alloc_wq rescuer thread and a locked AGF. i.e. The queued task
+holding the AGF lock can't be run by the rescuer thread until the
+task the rescuer thread is running gets the AGF lock....
+
+This is a highly improbable series of events, but there it is.
+
+There's a couple of ways to fix this, but the easiest way is to ensure
+that we only punt tasks with a locked AGF that holds enough space
+for the BMBT block allocations to the worker thread.
+
+This works for unwritten extent conversion in IO completion (which
+doesn't have a locked AGF and space reservations) because we have
+tight control over the IO completion stack. It is typically only 6
+functions deep when xfs_btree_split() is called because we've
+already offloaded the IO completion work to a worker thread and
+hence we don't need to worry about stack overruns here.
+
+The other place we can be called for a BMBT split without a
+preceding allocation is __xfs_bunmapi() when punching out the
+center of an existing extent. We don't remove extents in the IO
+path, so these operations don't tend to be called with a lot of
+stack consumed. Hence we don't really need to ship the split off to
+a worker thread in these cases, either.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_btree.c | 18 ++++++++++++++++--
+ 1 file changed, 16 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_btree.c
++++ b/fs/xfs/libxfs/xfs_btree.c
+@@ -2913,9 +2913,22 @@ xfs_btree_split_worker(
+ }
+
+ /*
+- * BMBT split requests often come in with little stack to work on. Push
++ * BMBT split requests often come in with little stack to work on so we push
+ * them off to a worker thread so there is lots of stack to use. For the other
+ * btree types, just call directly to avoid the context switch overhead here.
++ *
++ * Care must be taken here - the work queue rescuer thread introduces potential
++ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new
++ * AGFs to allocate blocks. A task being run by the rescuer could attempt to
++ * lock an AGF that is already locked by a task queued to run by the rescuer,
++ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to
++ * release it until the current thread it is running gains the lock.
++ *
++ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF
++ * already locked to allocate from. The only place that doesn't hold an AGF
++ * locked is unwritten extent conversion at IO completion, but that has already
++ * been offloaded to a worker thread and hence has no stack consumption issues
++ * we have to worry about.
+ */
+ STATIC int /* error */
+ xfs_btree_split(
+@@ -2929,7 +2942,8 @@ xfs_btree_split(
+ struct xfs_btree_split_args args;
+ DECLARE_COMPLETION_ONSTACK(done);
+
+- if (cur->bc_btnum != XFS_BTNUM_BMAP)
++ if (cur->bc_btnum != XFS_BTNUM_BMAP ||
++ cur->bc_tp->t_firstblock == NULLFSBLOCK)
+ return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
+
+ args.cur = cur;
--- /dev/null
+From stable+bounces-77016-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:10 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:26 -0700
+Subject: xfs: dquot shrinker doesn't check for XFS_DQFLAG_FREEING
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, syzbot+912776840162c13db1a3@syzkaller.appspotmail.com, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-2-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 52f31ed228212ba572c44e15e818a3a5c74122c0 ]
+
+The dquot shrinker does not check for XFS_DQFLAG_FREEING, resulting in
+a UAF if the shrinker races with some other dquot freeing mechanism
+that sets XFS_DQFLAG_FREEING before the dquot is
+removed from the LRU. This can occur if a dquot purge races with
+drop_caches.
+
+Reported-by: syzbot+912776840162c13db1a3@syzkaller.appspotmail.com
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_qm.c | 16 ++++++++++++----
+ 1 file changed, 12 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -423,6 +423,14 @@ xfs_qm_dquot_isolate(
+ goto out_miss_busy;
+
+ /*
++ * If something else is freeing this dquot and hasn't yet removed it
++ * from the LRU, leave it for the freeing task to complete the freeing
++ * process rather than risk it being freed from under us here.
++ */
++ if (dqp->q_flags & XFS_DQFLAG_FREEING)
++ goto out_miss_unlock;
++
++ /*
+ * This dquot has acquired a reference in the meantime remove it from
+ * the freelist and try again.
+ */
+@@ -441,10 +449,8 @@ xfs_qm_dquot_isolate(
+ * skip it so there is time for the IO to complete before we try to
+ * reclaim it again on the next LRU pass.
+ */
+- if (!xfs_dqflock_nowait(dqp)) {
+- xfs_dqunlock(dqp);
+- goto out_miss_busy;
+- }
++ if (!xfs_dqflock_nowait(dqp))
++ goto out_miss_unlock;
+
+ if (XFS_DQ_IS_DIRTY(dqp)) {
+ struct xfs_buf *bp = NULL;
+@@ -478,6 +484,8 @@ xfs_qm_dquot_isolate(
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
+ return LRU_REMOVED;
+
++out_miss_unlock:
++ xfs_dqunlock(dqp);
+ out_miss_busy:
+ trace_xfs_dqreclaim_busy(dqp);
+ XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
--- /dev/null
+From stable+bounces-77030-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:03 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:40 -0700
+Subject: xfs: fix ag count overflow during growfs
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Long Li <leo.lilong@huaweicloud.com>, Long Li <leo.lilong@huawei.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-16-leah.rumancik@gmail.com>
+
+From: Long Li <leo.lilong@huaweicloud.com>
+
+[ Upstream commit c3b880acadc95d6e019eae5d669e072afda24f1b ]
+
+I found a corruption during growfs:
+
+ XFS (loop0): Internal error agbno >= mp->m_sb.sb_agblocks at line 3661 of
+ file fs/xfs/libxfs/xfs_alloc.c. Caller __xfs_free_extent+0x28e/0x3c0
+ CPU: 0 PID: 573 Comm: xfs_growfs Not tainted 6.3.0-rc7-next-20230420-00001-gda8c95746257
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x50/0x70
+ xfs_corruption_error+0x134/0x150
+ __xfs_free_extent+0x2c1/0x3c0
+ xfs_ag_extend_space+0x291/0x3e0
+ xfs_growfs_data+0xd72/0xe90
+ xfs_file_ioctl+0x5f9/0x14a0
+ __x64_sys_ioctl+0x13e/0x1c0
+ do_syscall_64+0x39/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+ XFS (loop0): Corruption detected. Unmount and run xfs_repair
+ XFS (loop0): Internal error xfs_trans_cancel at line 1097 of file
+ fs/xfs/xfs_trans.c. Caller xfs_growfs_data+0x691/0xe90
+ CPU: 0 PID: 573 Comm: xfs_growfs Not tainted 6.3.0-rc7-next-20230420-00001-gda8c95746257
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x50/0x70
+ xfs_error_report+0x93/0xc0
+ xfs_trans_cancel+0x2c0/0x350
+ xfs_growfs_data+0x691/0xe90
+ xfs_file_ioctl+0x5f9/0x14a0
+ __x64_sys_ioctl+0x13e/0x1c0
+ do_syscall_64+0x39/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+ RIP: 0033:0x7f2d86706577
+
+The bug can be reproduced with the following sequence:
+
+ # truncate -s 1073741824 xfs_test.img
+ # mkfs.xfs -f -b size=1024 -d agcount=4 xfs_test.img
+ # truncate -s 2305843009213693952 xfs_test.img
+ # mount -o loop xfs_test.img /mnt/test
+ # xfs_growfs -D 1125899907891200 /mnt/test
+
+The root cause is that during growfs, user space passed in a large value
+of newblocks to xfs_growfs_data_private(). Because the current
+sb_agblocks is too small, the new AG count exceeds UINT_MAX. Since the
+AG number type is unsigned int, it overflows, which makes nagcount much
+smaller than the actual value. During AG space extension, the delta
+blocks in xfs_resizefs_init_new_ags() will be much larger than the
+actual value due to the incorrect nagcount, and can even exceed
+UINT_MAX. This causes corruption that is detected in __xfs_free_extent.
+Fix it by growing the filesystem up to the maximum allowed number of AGs
+and not returning EINVAL when the new AG count overflows.
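+
+Rough arithmetic for the reproducer above (assuming the mkfs invocation
+leaves sb_agblocks = 262144 with 1k blocks; illustration only):
+
+  nb       = 1125899907891200;        /* blocks requested by growfs      */
+  nb_div   = nb / 262144;             /* 4294967300, which is > UINT_MAX */
+  nagcount = (unsigned int)nb_div;    /* wraps to 4, far too small, so   */
+                                      /* the computed delta is oversized */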
+
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_fs.h | 2 ++
+ fs/xfs/xfs_fsops.c | 13 +++++++++----
+ 2 files changed, 11 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_fs.h
++++ b/fs/xfs/libxfs/xfs_fs.h
+@@ -257,6 +257,8 @@ typedef struct xfs_fsop_resblks {
+ #define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE)
+ #define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE)
+
++#define XFS_MAX_AGNUMBER ((xfs_agnumber_t)(NULLAGNUMBER - 1))
++
+ /* keep the maximum size under 2^31 by a small amount */
+ #define XFS_MAX_LOG_BYTES \
+ ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES)
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -115,11 +115,16 @@ xfs_growfs_data_private(
+
+ nb_div = nb;
+ nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks);
+- nagcount = nb_div + (nb_mod != 0);
+- if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) {
+- nagcount--;
+- nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks;
++ if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS)
++ nb_div++;
++ else if (nb_mod)
++ nb = nb_div * mp->m_sb.sb_agblocks;
++
++ if (nb_div > XFS_MAX_AGNUMBER + 1) {
++ nb_div = XFS_MAX_AGNUMBER + 1;
++ nb = nb_div * mp->m_sb.sb_agblocks;
+ }
++ nagcount = nb_div;
+ delta = nb - mp->m_sb.sb_dblocks;
+ /*
+ * Reject filesystems with a single AG because they are not
--- /dev/null
+From stable+bounces-77028-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:01 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:38 -0700
+Subject: xfs: fix AGF vs inode cluster buffer deadlock
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-14-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 82842fee6e5979ca7e2bf4d839ef890c22ffb7aa ]
+
+Lock order in XFS is AGI -> AGF, hence for operations involving the
+inode unlinked list we always lock the AGI first. Inode
+unlinked list operations operate on the inode cluster buffer,
+so the lock order there is AGI -> inode cluster buffer.
+
+For O_TMPFILE operations, this now means the lock order set down in
+xfs_rename and xfs_link is AGI -> inode cluster buffer -> AGF as the
+unlinked ops are done before the directory modifications that may
+allocate space and lock the AGF.
+
+Unfortunately, we also now lock the inode cluster buffer when
+logging an inode so that we can attach the inode to the cluster
+buffer and pin it in memory. This creates a lock order of AGF ->
+inode cluster buffer in directory operations as we have to log the
+inode after we've allocated new space for it.
+
+This creates a lock inversion between the AGF and the inode cluster
+buffer. Because the inode cluster buffer is shared across multiple
+inodes, the inversion is not specific to individual inodes but can
+occur when inodes in the same cluster buffer are accessed in
+different orders.
+
+To fix this we need to move all the inode log item cluster buffer
+interactions to the end of the current transaction. Unfortunately,
+xfs_trans_log_inode() calls are littered throughout the transactions
+with no thought to ordering against other items or locking. This
+makes it difficult to do anything that involves changing the call
+sites of xfs_trans_log_inode() to change locking orders.
+
+However, we do now have a mechanism that allows us to postpone dirty
+item processing to just before we commit the transaction: the
+->iop_precommit method. This will be called after all the
+modifications are done and high level objects like AGI and AGF
+buffers have been locked and modified, thereby providing a mechanism
+that guarantees we don't lock the inode cluster buffer before those
+high level objects are locked.
+
+This change is largely moving the guts of xfs_trans_log_inode() to
+xfs_inode_item_precommit() and providing an extra flag context in
+the inode log item to track the dirty state of the inode in the
+current transaction. This also means we do a lot less repeated work
+in xfs_trans_log_inode() by only doing it once per transaction when
+all the work is done.
+
+Fixes: 298f7bec503f ("xfs: pin inode backing buffer to the inode log item")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_log_format.h | 9 ++
+ fs/xfs/libxfs/xfs_trans_inode.c | 113 ++----------------------------
+ fs/xfs/xfs_inode_item.c | 149 ++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_inode_item.h | 1
+ 4 files changed, 166 insertions(+), 106 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_log_format.h
++++ b/fs/xfs/libxfs/xfs_log_format.h
+@@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 {
+ #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */
+ #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */
+
+-
+ /*
+ * The timestamps are dirty, but not necessarily anything else in the inode
+ * core. Unlike the other fields above this one must never make it to disk
+@@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 {
+ */
+ #define XFS_ILOG_TIMESTAMP 0x4000
+
++/*
++ * The version field has been changed, but not necessarily anything else of
++ * interest. This must never make it to disk - it is used purely to ensure that
++ * the inode item ->precommit operation can update the fsync flag triggers
++ * in the inode item correctly.
++ */
++#define XFS_ILOG_IVERSION 0x8000
++
+ #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
+ XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
+ XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
+--- a/fs/xfs/libxfs/xfs_trans_inode.c
++++ b/fs/xfs/libxfs/xfs_trans_inode.c
+@@ -40,9 +40,8 @@ xfs_trans_ijoin(
+ iip->ili_lock_flags = lock_flags;
+ ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
+
+- /*
+- * Get a log_item_desc to point at the new item.
+- */
++ /* Reset the per-tx dirty context and add the item to the tx. */
++ iip->ili_dirty_flags = 0;
+ xfs_trans_add_item(tp, &iip->ili_item);
+ }
+
+@@ -76,17 +75,10 @@ xfs_trans_ichgtime(
+ /*
+ * This is called to mark the fields indicated in fieldmask as needing to be
+ * logged when the transaction is committed. The inode must already be
+- * associated with the given transaction.
+- *
+- * The values for fieldmask are defined in xfs_inode_item.h. We always log all
+- * of the core inode if any of it has changed, and we always log all of the
+- * inline data/extents/b-tree root if any of them has changed.
+- *
+- * Grab and pin the cluster buffer associated with this inode to avoid RMW
+- * cycles at inode writeback time. Avoid the need to add error handling to every
+- * xfs_trans_log_inode() call by shutting down on read error. This will cause
+- * transactions to fail and everything to error out, just like if we return a
+- * read error in a dirty transaction and cancel it.
++ * associated with the given transaction. All we do here is record where the
++ * inode was dirtied and mark the transaction and inode log item dirty;
++ * everything else is done in the ->precommit log item operation after the
++ * changes in the transaction have been completed.
+ */
+ void
+ xfs_trans_log_inode(
+@@ -96,7 +88,6 @@ xfs_trans_log_inode(
+ {
+ struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct inode *inode = VFS_I(ip);
+- uint iversion_flags = 0;
+
+ ASSERT(iip);
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+@@ -105,18 +96,6 @@ xfs_trans_log_inode(
+ tp->t_flags |= XFS_TRANS_DIRTY;
+
+ /*
+- * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
+- * don't matter - we either will need an extra transaction in 24 hours
+- * to log the timestamps, or will clear already cleared fields in the
+- * worst case.
+- */
+- if (inode->i_state & I_DIRTY_TIME) {
+- spin_lock(&inode->i_lock);
+- inode->i_state &= ~I_DIRTY_TIME;
+- spin_unlock(&inode->i_lock);
+- }
+-
+- /*
+ * First time we log the inode in a transaction, bump the inode change
+ * counter if it is configured for this to occur. While we have the
+ * inode locked exclusively for metadata modification, we can usually
+@@ -128,86 +107,10 @@ xfs_trans_log_inode(
+ if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) {
+ if (IS_I_VERSION(inode) &&
+ inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE))
+- iversion_flags = XFS_ILOG_CORE;
++ flags |= XFS_ILOG_IVERSION;
+ }
+
+- /*
+- * If we're updating the inode core or the timestamps and it's possible
+- * to upgrade this inode to bigtime format, do so now.
+- */
+- if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
+- xfs_has_bigtime(ip->i_mount) &&
+- !xfs_inode_has_bigtime(ip)) {
+- ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
+- flags |= XFS_ILOG_CORE;
+- }
+-
+- /*
+- * Inode verifiers do not check that the extent size hint is an integer
+- * multiple of the rt extent size on a directory with both rtinherit
+- * and extszinherit flags set. If we're logging a directory that is
+- * misconfigured in this way, clear the hint.
+- */
+- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
+- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
+- XFS_DIFLAG_EXTSZINHERIT);
+- ip->i_extsize = 0;
+- flags |= XFS_ILOG_CORE;
+- }
+-
+- /*
+- * Record the specific change for fdatasync optimisation. This allows
+- * fdatasync to skip log forces for inodes that are only timestamp
+- * dirty.
+- */
+- spin_lock(&iip->ili_lock);
+- iip->ili_fsync_fields |= flags;
+-
+- if (!iip->ili_item.li_buf) {
+- struct xfs_buf *bp;
+- int error;
+-
+- /*
+- * We hold the ILOCK here, so this inode is not going to be
+- * flushed while we are here. Further, because there is no
+- * buffer attached to the item, we know that there is no IO in
+- * progress, so nothing will clear the ili_fields while we read
+- * in the buffer. Hence we can safely drop the spin lock and
+- * read the buffer knowing that the state will not change from
+- * here.
+- */
+- spin_unlock(&iip->ili_lock);
+- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
+- if (error) {
+- xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR);
+- return;
+- }
+-
+- /*
+- * We need an explicit buffer reference for the log item but
+- * don't want the buffer to remain attached to the transaction.
+- * Hold the buffer but release the transaction reference once
+- * we've attached the inode log item to the buffer log item
+- * list.
+- */
+- xfs_buf_hold(bp);
+- spin_lock(&iip->ili_lock);
+- iip->ili_item.li_buf = bp;
+- bp->b_flags |= _XBF_INODES;
+- list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
+- xfs_trans_brelse(tp, bp);
+- }
+-
+- /*
+- * Always OR in the bits from the ili_last_fields field. This is to
+- * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
+- * in the eventual clearing of the ili_fields bits. See the big comment
+- * in xfs_iflush() for an explanation of this coordination mechanism.
+- */
+- iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags);
+- spin_unlock(&iip->ili_lock);
++ iip->ili_dirty_flags |= flags;
+ }
+
+ int
+--- a/fs/xfs/xfs_inode_item.c
++++ b/fs/xfs/xfs_inode_item.c
+@@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item
+ return container_of(lip, struct xfs_inode_log_item, ili_item);
+ }
+
++static uint64_t
++xfs_inode_item_sort(
++ struct xfs_log_item *lip)
++{
++ return INODE_ITEM(lip)->ili_inode->i_ino;
++}
++
++/*
++ * Prior to finally logging the inode, we have to ensure that all the
++ * per-modification inode state changes are applied. This includes VFS inode
++ * state updates, format conversions, verifier state synchronisation and
++ * ensuring the inode buffer remains in memory whilst the inode is dirty.
++ *
++ * We have to be careful when we grab the inode cluster buffer due to lock
++ * ordering constraints. The unlinked inode modifications (xfs_iunlink_item)
++ * require AGI -> inode cluster buffer lock order. The inode cluster buffer is
++ * not locked until ->precommit, so it happens after everything else has been
++ * modified.
++ *
++ * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we
++ * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we
++ * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because
++ * it can be called on a inode (e.g. via bumplink/droplink) before we take the
++ * AGF lock modifying directory blocks.
++ *
++ * Rather than force a complete rework of all the transactions to call
++ * xfs_trans_log_inode() once and once only at the end of every transaction, we
++ * move the pinning of the inode cluster buffer to a ->precommit operation. This
++ * matches how the xfs_iunlink_item locks the inode cluster buffer, and it
++ * ensures that the inode cluster buffer locking is always done last in a
++ * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode
++ * cluster buffer.
++ *
++ * If we return the inode number as the precommit sort key then we'll also
++ * guarantee that the order all inode cluster buffer locking is the same all the
++ * inodes and unlink items in the transaction.
++ */
++static int
++xfs_inode_item_precommit(
++ struct xfs_trans *tp,
++ struct xfs_log_item *lip)
++{
++ struct xfs_inode_log_item *iip = INODE_ITEM(lip);
++ struct xfs_inode *ip = iip->ili_inode;
++ struct inode *inode = VFS_I(ip);
++ unsigned int flags = iip->ili_dirty_flags;
++
++ /*
++ * Don't bother with i_lock for the I_DIRTY_TIME check here, as races
++ * don't matter - we either will need an extra transaction in 24 hours
++ * to log the timestamps, or will clear already cleared fields in the
++ * worst case.
++ */
++ if (inode->i_state & I_DIRTY_TIME) {
++ spin_lock(&inode->i_lock);
++ inode->i_state &= ~I_DIRTY_TIME;
++ spin_unlock(&inode->i_lock);
++ }
++
++ /*
++ * If we're updating the inode core or the timestamps and it's possible
++ * to upgrade this inode to bigtime format, do so now.
++ */
++ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) &&
++ xfs_has_bigtime(ip->i_mount) &&
++ !xfs_inode_has_bigtime(ip)) {
++ ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME;
++ flags |= XFS_ILOG_CORE;
++ }
++
++ /*
++ * Inode verifiers do not check that the extent size hint is an integer
++ * multiple of the rt extent size on a directory with both rtinherit
++ * and extszinherit flags set. If we're logging a directory that is
++ * misconfigured in this way, clear the hint.
++ */
++ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
++ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
++ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) {
++ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE |
++ XFS_DIFLAG_EXTSZINHERIT);
++ ip->i_extsize = 0;
++ flags |= XFS_ILOG_CORE;
++ }
++
++ /*
++ * Record the specific change for fdatasync optimisation. This allows
++ * fdatasync to skip log forces for inodes that are only timestamp
++ * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it
++ * to XFS_ILOG_CORE so that the actual on-disk dirty tracking
++ * (ili_fields) correctly tracks that the version has changed.
++ */
++ spin_lock(&iip->ili_lock);
++ iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION);
++ if (flags & XFS_ILOG_IVERSION)
++ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE);
++
++ if (!iip->ili_item.li_buf) {
++ struct xfs_buf *bp;
++ int error;
++
++ /*
++ * We hold the ILOCK here, so this inode is not going to be
++ * flushed while we are here. Further, because there is no
++ * buffer attached to the item, we know that there is no IO in
++ * progress, so nothing will clear the ili_fields while we read
++ * in the buffer. Hence we can safely drop the spin lock and
++ * read the buffer knowing that the state will not change from
++ * here.
++ */
++ spin_unlock(&iip->ili_lock);
++ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp);
++ if (error)
++ return error;
++
++ /*
++ * We need an explicit buffer reference for the log item but
++ * don't want the buffer to remain attached to the transaction.
++ * Hold the buffer but release the transaction reference once
++ * we've attached the inode log item to the buffer log item
++ * list.
++ */
++ xfs_buf_hold(bp);
++ spin_lock(&iip->ili_lock);
++ iip->ili_item.li_buf = bp;
++ bp->b_flags |= _XBF_INODES;
++ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
++ xfs_trans_brelse(tp, bp);
++ }
++
++ /*
++ * Always OR in the bits from the ili_last_fields field. This is to
++ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines
++ * in the eventual clearing of the ili_fields bits. See the big comment
++ * in xfs_iflush() for an explanation of this coordination mechanism.
++ */
++ iip->ili_fields |= (flags | iip->ili_last_fields);
++ spin_unlock(&iip->ili_lock);
++
++ /*
++ * We are done with the log item transaction dirty state, so clear it so
++ * that it doesn't pollute future transactions.
++ */
++ iip->ili_dirty_flags = 0;
++ return 0;
++}
++
+ /*
+ * The logged size of an inode fork is always the current size of the inode
+ * fork. This means that when an inode fork is relogged, the size of the logged
+@@ -662,6 +809,8 @@ xfs_inode_item_committing(
+ }
+
+ static const struct xfs_item_ops xfs_inode_item_ops = {
++ .iop_sort = xfs_inode_item_sort,
++ .iop_precommit = xfs_inode_item_precommit,
+ .iop_size = xfs_inode_item_size,
+ .iop_format = xfs_inode_item_format,
+ .iop_pin = xfs_inode_item_pin,
+--- a/fs/xfs/xfs_inode_item.h
++++ b/fs/xfs/xfs_inode_item.h
+@@ -17,6 +17,7 @@ struct xfs_inode_log_item {
+ struct xfs_log_item ili_item; /* common portion */
+ struct xfs_inode *ili_inode; /* inode ptr */
+ unsigned short ili_lock_flags; /* inode lock flags */
++ unsigned int ili_dirty_flags; /* dirty in current tx */
+ /*
+ * The ili_lock protects the interactions between the dirty state and
+ * the flush state of the inode log item. This allows us to do atomic
--- /dev/null
+From stable+bounces-77025-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:53 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:35 -0700
+Subject: xfs: fix BUG_ON in xfs_getbmap()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Ye Bin <yebin10@huawei.com>, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-11-leah.rumancik@gmail.com>
+
+From: Ye Bin <yebin10@huawei.com>
+
+[ Upstream commit 8ee81ed581ff35882b006a5205100db0b57bf070 ]
+
+There's an issue as follows:
+XFS: Assertion failed: (bmv->bmv_iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c, line: 329
+------------[ cut here ]------------
+kernel BUG at fs/xfs/xfs_message.c:102!
+invalid opcode: 0000 [#1] PREEMPT SMP KASAN
+CPU: 1 PID: 14612 Comm: xfs_io Not tainted 6.3.0-rc2-next-20230315-00006-g2729d23ddb3b-dirty #422
+RIP: 0010:assfail+0x96/0xa0
+RSP: 0018:ffffc9000fa178c0 EFLAGS: 00010246
+RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff888179a18000
+RDX: 0000000000000000 RSI: ffff888179a18000 RDI: 0000000000000002
+RBP: 0000000000000000 R08: ffffffff8321aab6 R09: 0000000000000000
+R10: 0000000000000001 R11: ffffed1105f85139 R12: ffffffff8aacc4c0
+R13: 0000000000000149 R14: ffff888269f58000 R15: 000000000000000c
+FS: 00007f42f27a4740(0000) GS:ffff88882fc00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000000b92388 CR3: 000000024f006000 CR4: 00000000000006e0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <TASK>
+ xfs_getbmap+0x1a5b/0x1e40
+ xfs_ioc_getbmap+0x1fd/0x5b0
+ xfs_file_ioctl+0x2cb/0x1d50
+ __x64_sys_ioctl+0x197/0x210
+ do_syscall_64+0x39/0xb0
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+The above issue may happen as follows:
+ ThreadA ThreadB
+do_shared_fault
+ __do_fault
+ xfs_filemap_fault
+ __xfs_filemap_fault
+ filemap_fault
+ xfs_ioc_getbmap -> Without BMV_IF_DELALLOC flag
+ xfs_getbmap
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ filemap_write_and_wait
+ do_page_mkwrite
+ xfs_filemap_page_mkwrite
+ __xfs_filemap_fault
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ iomap_page_mkwrite
+ ...
+ xfs_buffered_write_iomap_begin
+ xfs_bmapi_reserve_delalloc -> Allocate delay extent
+ xfs_ilock_data_map_shared(ip)
+ xfs_getbmap_report_one
+ ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0)
+ -> trigger BUG_ON
+
+As xfs_filemap_page_mkwrite() only holds the XFS_MMAPLOCK_SHARED lock, there is
+a small window in which mkwrite can produce a delalloc extent after the file
+write in xfs_getbmap(). To solve the above issue, just skip delalloc extents.
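+
+A minimal standalone sketch of the skip (illustrative only; the flag value,
+struct and helper are invented stand-ins, not the xfs_getbmap() code):
+
+    #include <stdbool.h>
+
+    #define IF_DELALLOC  0x1   /* caller asked for delalloc records */
+
+    struct bmap_rec { bool is_delalloc; };
+
+    /* Report the record only if the caller wants delalloc extents;
+     * racy delalloc created after the flush is silently skipped. */
+    static bool report_record(unsigned int iflags, const struct bmap_rec *rec)
+    {
+        if (rec->is_delalloc && !(iflags & IF_DELALLOC))
+            return false;
+        return true;
+    }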
+
+Signed-off-by: Ye Bin <yebin10@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_util.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -314,15 +314,13 @@ xfs_getbmap_report_one(
+ if (isnullstartblock(got->br_startblock) ||
+ got->br_startblock == DELAYSTARTBLOCK) {
+ /*
+- * Delalloc extents that start beyond EOF can occur due to
+- * speculative EOF allocation when the delalloc extent is larger
+- * than the largest freespace extent at conversion time. These
+- * extents cannot be converted by data writeback, so can exist
+- * here even if we are not supposed to be finding delalloc
+- * extents.
++ * Take the flush completion as being a point-in-time snapshot
++ * where there are no delalloc extents, and if any new ones
++ * have been created racily, just skip them as being 'after'
++ * the flush and so don't get reported.
+ */
+- if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip)))
+- ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0);
++ if (!(bmv->bmv_iflags & BMV_IF_DELALLOC))
++ return 0;
+
+ p->bmv_oflags |= BMV_OF_DELALLOC;
+ p->bmv_block = -2;
--- /dev/null
+From stable+bounces-77017-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:13 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:27 -0700
+Subject: xfs: Fix deadlock on xfs_inodegc_worker
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Wu Guanghao <wuguanghao3@huawei.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-3-leah.rumancik@gmail.com>
+
+From: Wu Guanghao <wuguanghao3@huawei.com>
+
+[ Upstream commit 4da112513c01d7d0acf1025b8764349d46e177d6 ]
+
+We were running a test that deletes a large number of files while memory
+is low, and found a deadlock problem.
+
+[ 1240.279183] -> #1 (fs_reclaim){+.+.}-{0:0}:
+[ 1240.280450] lock_acquire+0x197/0x460
+[ 1240.281548] fs_reclaim_acquire.part.0+0x20/0x30
+[ 1240.282625] kmem_cache_alloc+0x2b/0x940
+[ 1240.283816] xfs_trans_alloc+0x8a/0x8b0
+[ 1240.284757] xfs_inactive_ifree+0xe4/0x4e0
+[ 1240.285935] xfs_inactive+0x4e9/0x8a0
+[ 1240.286836] xfs_inodegc_worker+0x160/0x5e0
+[ 1240.287969] process_one_work+0xa19/0x16b0
+[ 1240.289030] worker_thread+0x9e/0x1050
+[ 1240.290131] kthread+0x34f/0x460
+[ 1240.290999] ret_from_fork+0x22/0x30
+[ 1240.291905]
+[ 1240.291905] -> #0 ((work_completion)(&gc->work)){+.+.}-{0:0}:
+[ 1240.293569] check_prev_add+0x160/0x2490
+[ 1240.294473] __lock_acquire+0x2c4d/0x5160
+[ 1240.295544] lock_acquire+0x197/0x460
+[ 1240.296403] __flush_work+0x6bc/0xa20
+[ 1240.297522] xfs_inode_mark_reclaimable+0x6f0/0xdc0
+[ 1240.298649] destroy_inode+0xc6/0x1b0
+[ 1240.299677] dispose_list+0xe1/0x1d0
+[ 1240.300567] prune_icache_sb+0xec/0x150
+[ 1240.301794] super_cache_scan+0x2c9/0x480
+[ 1240.302776] do_shrink_slab+0x3f0/0xaa0
+[ 1240.303671] shrink_slab+0x170/0x660
+[ 1240.304601] shrink_node+0x7f7/0x1df0
+[ 1240.305515] balance_pgdat+0x766/0xf50
+[ 1240.306657] kswapd+0x5bd/0xd20
+[ 1240.307551] kthread+0x34f/0x460
+[ 1240.308346] ret_from_fork+0x22/0x30
+[ 1240.309247]
+[ 1240.309247] other info that might help us debug this:
+[ 1240.309247]
+[ 1240.310944] Possible unsafe locking scenario:
+[ 1240.310944]
+[ 1240.312379] CPU0 CPU1
+[ 1240.313363] ---- ----
+[ 1240.314433] lock(fs_reclaim);
+[ 1240.315107] lock((work_completion)(&gc->work));
+[ 1240.316828] lock(fs_reclaim);
+[ 1240.318088] lock((work_completion)(&gc->work));
+[ 1240.319203]
+[ 1240.319203] *** DEADLOCK ***
+...
+[ 2438.431081] Workqueue: xfs-inodegc/sda xfs_inodegc_worker
+[ 2438.432089] Call Trace:
+[ 2438.432562] __schedule+0xa94/0x1d20
+[ 2438.435787] schedule+0xbf/0x270
+[ 2438.436397] schedule_timeout+0x6f8/0x8b0
+[ 2438.445126] wait_for_completion+0x163/0x260
+[ 2438.448610] __flush_work+0x4c4/0xa40
+[ 2438.455011] xfs_inode_mark_reclaimable+0x6ef/0xda0
+[ 2438.456695] destroy_inode+0xc6/0x1b0
+[ 2438.457375] dispose_list+0xe1/0x1d0
+[ 2438.458834] prune_icache_sb+0xe8/0x150
+[ 2438.461181] super_cache_scan+0x2b3/0x470
+[ 2438.461950] do_shrink_slab+0x3cf/0xa50
+[ 2438.462687] shrink_slab+0x17d/0x660
+[ 2438.466392] shrink_node+0x87e/0x1d40
+[ 2438.467894] do_try_to_free_pages+0x364/0x1300
+[ 2438.471188] try_to_free_pages+0x26c/0x5b0
+[ 2438.473567] __alloc_pages_slowpath.constprop.136+0x7aa/0x2100
+[ 2438.482577] __alloc_pages+0x5db/0x710
+[ 2438.485231] alloc_pages+0x100/0x200
+[ 2438.485923] allocate_slab+0x2c0/0x380
+[ 2438.486623] ___slab_alloc+0x41f/0x690
+[ 2438.490254] __slab_alloc+0x54/0x70
+[ 2438.491692] kmem_cache_alloc+0x23e/0x270
+[ 2438.492437] xfs_trans_alloc+0x88/0x880
+[ 2438.493168] xfs_inactive_ifree+0xe2/0x4e0
+[ 2438.496419] xfs_inactive+0x4eb/0x8b0
+[ 2438.497123] xfs_inodegc_worker+0x16b/0x5e0
+[ 2438.497918] process_one_work+0xbf7/0x1a20
+[ 2438.500316] worker_thread+0x8c/0x1060
+[ 2438.504938] ret_from_fork+0x22/0x30
+
+When memory is insufficient, xfs_inodegc_worker() can trigger memory
+reclamation while allocating memory, and then flush_work() may be called to
+wait for the work to complete. This causes a deadlock.
+
+So use memalloc_nofs_save() to avoid triggering memory reclamation in
+xfs_inodegc_worker.
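+
+For reference, the scoped-NOFS pattern from <linux/sched/mm.h> looks roughly
+like the sketch below (simplified; the actual change is in the diff that
+follows):
+
+    unsigned int nofs_flag;
+
+    /* Allocations made in this section cannot recurse into filesystem
+     * reclaim, so they cannot end up waiting on this same work item. */
+    nofs_flag = memalloc_nofs_save();
+    /* ... inactivate inodes; xfs_trans_alloc() may allocate memory ... */
+    memalloc_nofs_restore(nofs_flag);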
+
+Signed-off-by: Wu Guanghao <wuguanghao3@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -1858,6 +1858,7 @@ xfs_inodegc_worker(
+ struct xfs_inodegc, work);
+ struct llist_node *node = llist_del_all(&gc->list);
+ struct xfs_inode *ip, *n;
++ unsigned int nofs_flag;
+
+ ASSERT(gc->cpu == smp_processor_id());
+
+@@ -1866,6 +1867,13 @@ xfs_inodegc_worker(
+ if (!node)
+ return;
+
++ /*
++ * We can allocate memory here while doing writeback on behalf of
++ * memory reclaim. To avoid memory allocation deadlocks set the
++ * task-wide nofs context for the following operations.
++ */
++ nofs_flag = memalloc_nofs_save();
++
+ ip = llist_entry(node, struct xfs_inode, i_gclist);
+ trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+
+@@ -1874,6 +1882,8 @@ xfs_inodegc_worker(
+ xfs_iflags_set(ip, XFS_INACTIVATING);
+ xfs_inodegc_inactivate(ip);
+ }
++
++ memalloc_nofs_restore(nofs_flag);
+ }
+
+ /*
--- /dev/null
+From stable+bounces-77018-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:14 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:28 -0700
+Subject: xfs: fix extent busy updating
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Wengang Wang <wen.gang.wang@oracle.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-4-leah.rumancik@gmail.com>
+
+From: Wengang Wang <wen.gang.wang@oracle.com>
+
+[ Upstream commit 601a27ea09a317d0fe2895df7d875381fb393041 ]
+
+In xfs_extent_busy_update_extent() cases 6 and 7, whenever bno of the busy
+extent is modified, the relevant length has to be modified accordingly.
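+
+Illustrative only: a busy extent is the range [bno, bno + length), so when bno
+is pushed forward to the end of the freed range (fend) the length must shrink
+to preserve the original end point (bend). A standalone sketch with made-up
+names:
+
+    struct busy_extent { unsigned long bno, length; };
+
+    /* Trim the front of a busy extent so it starts at fend while keeping
+     * its original end point. */
+    static void busy_trim_front(struct busy_extent *b, unsigned long fend)
+    {
+        unsigned long bend = b->bno + b->length;    /* original end */
+
+        b->bno = fend;
+        b->length = bend - fend;    /* without this the length goes stale */
+    }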
+
+Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_extent_busy.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_extent_busy.c
++++ b/fs/xfs/xfs_extent_busy.c
+@@ -236,6 +236,7 @@ xfs_extent_busy_update_extent(
+ *
+ */
+ busyp->bno = fend;
++ busyp->length = bend - fend;
+ } else if (bbno < fbno) {
+ /*
+ * Case 8:
--- /dev/null
+From stable+bounces-77020-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:31 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:30 -0700
+Subject: xfs: fix low space alloc deadlock
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Allison Henderson <allison.henderson@oracle.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-6-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 1dd0510f6d4b85616a36aabb9be38389467122d9 ]
+
+I've recently encountered an ABBA deadlock with g/476. The upcoming
+changes seem to make this much easier to hit, but the underlying
+problem is a pre-existing one.
+
+Essentially, if we select an AG for allocation, then lock the AGF
+and then fail to allocate for some reason (e.g. minimum length
+requirements cannot be satisfied), then we drop out of the
+allocation with the AGF still locked.
+
+The caller then modifies the allocation constraints - usually
+loosening them up - and tries again. This can result in trying to
+access AGFs that are lower than the AGF we already have locked from
+the failed attempt. e.g. the failed attempt skipped several AGs
+before failing, so we have locks an AG higher than the start AG.
+Retrying the allocation from the start AG then causes us to violate
+AGF lock ordering and this can lead to deadlocks.
+
+The deadlock exists even if allocation succeeds - we can do followup
+allocations in the same transaction for BMBT blocks that
+aren't guaranteed to be in the same AG as the original, and can move
+into higher AGs. Hence we really need to move the tp->t_firstblock
+tracking down into xfs_alloc_vextent() where it can be set when we
+exit with a locked AG.
+
+xfs_alloc_vextent() can also check there if the requested
+allocation falls within the allowed range of AGs set by
+tp->t_firstblock. If we can't allocate within the range set, we have
+to fail the allocation. If we are allowed to do non-blocking AGF
+locking, we can ignore the AG locking order limitations as we can
+use try-locks for the first iteration over the requested AG range.
+
+This invalidates a set of post allocation asserts that check that
+the allocation is always above tp->t_firstblock if it is set.
+Because we can use try-locks to avoid the deadlock in some
+circumstances, having a pre-existing locked AGF doesn't always
+prevent allocation from lower order AGFs. Hence those ASSERTs need
+to be removed.
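+
+A rough standalone sketch of the constraint (names invented, not the
+xfs_alloc_vextent() code): blocking retries are clamped so they never scan an
+AG below one whose AGF the transaction already holds, while try-lock passes
+may still wrap to AG 0.
+
+    /* First AG a retry may scan, given the lowest AGF already locked. */
+    static unsigned int first_allowed_agno(unsigned int start_agno,
+                                           unsigned int locked_agno,
+                                           int trylock)
+    {
+        if (trylock)
+            return start_agno;    /* try-locks cannot ABBA deadlock */
+        return start_agno < locked_agno ? locked_agno : start_agno;
+    }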
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_alloc.c | 69 ++++++++++++++++++++++++++++++++++++++--------
+ fs/xfs/libxfs/xfs_bmap.c | 14 ---------
+ fs/xfs/xfs_trace.h | 1
+ 3 files changed, 58 insertions(+), 26 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_alloc.c
++++ b/fs/xfs/libxfs/xfs_alloc.c
+@@ -3164,10 +3164,13 @@ xfs_alloc_vextent(
+ xfs_alloctype_t type; /* input allocation type */
+ int bump_rotor = 0;
+ xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
++ xfs_agnumber_t minimum_agno = 0;
+
+ mp = args->mp;
+ type = args->otype = args->type;
+ args->agbno = NULLAGBLOCK;
++ if (args->tp->t_firstblock != NULLFSBLOCK)
++ minimum_agno = XFS_FSB_TO_AGNO(mp, args->tp->t_firstblock);
+ /*
+ * Just fix this up, for the case where the last a.g. is shorter
+ * (or there's only one a.g.) and the caller couldn't easily figure
+@@ -3201,6 +3204,13 @@ xfs_alloc_vextent(
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->pag = xfs_perag_get(mp, args->agno);
++
++ if (minimum_agno > args->agno) {
++ trace_xfs_alloc_vextent_skip_deadlock(args);
++ error = 0;
++ break;
++ }
++
+ error = xfs_alloc_fix_freelist(args, 0);
+ if (error) {
+ trace_xfs_alloc_vextent_nofix(args);
+@@ -3232,6 +3242,8 @@ xfs_alloc_vextent(
+ case XFS_ALLOCTYPE_FIRST_AG:
+ /*
+ * Rotate through the allocation groups looking for a winner.
++ * If we are blocking, we must obey minimum_agno contraints for
++ * avoiding ABBA deadlocks on AGF locking.
+ */
+ if (type == XFS_ALLOCTYPE_FIRST_AG) {
+ /*
+@@ -3239,7 +3251,7 @@ xfs_alloc_vextent(
+ */
+ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ args->type = XFS_ALLOCTYPE_THIS_AG;
+- sagno = 0;
++ sagno = minimum_agno;
+ flags = 0;
+ } else {
+ /*
+@@ -3248,6 +3260,7 @@ xfs_alloc_vextent(
+ args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
+ flags = XFS_ALLOC_FLAG_TRYLOCK;
+ }
++
+ /*
+ * Loop over allocation groups twice; first time with
+ * trylock set, second time without.
+@@ -3276,19 +3289,21 @@ xfs_alloc_vextent(
+ if (args->agno == sagno &&
+ type == XFS_ALLOCTYPE_START_BNO)
+ args->type = XFS_ALLOCTYPE_THIS_AG;
++
+ /*
+- * For the first allocation, we can try any AG to get
+- * space. However, if we already have allocated a
+- * block, we don't want to try AGs whose number is below
+- * sagno. Otherwise, we may end up with out-of-order
+- * locking of AGF, which might cause deadlock.
+- */
++ * If we are try-locking, we can't deadlock on AGF
++ * locks, so we can wrap all the way back to the first
++ * AG. Otherwise, wrap back to the start AG so we can't
++ * deadlock, and let the end of scan handler decide what
++ * to do next.
++ */
+ if (++(args->agno) == mp->m_sb.sb_agcount) {
+- if (args->tp->t_firstblock != NULLFSBLOCK)
+- args->agno = sagno;
+- else
++ if (flags & XFS_ALLOC_FLAG_TRYLOCK)
+ args->agno = 0;
++ else
++ args->agno = sagno;
+ }
++
+ /*
+ * Reached the starting a.g., must either be done
+ * or switch to non-trylock mode.
+@@ -3300,7 +3315,14 @@ xfs_alloc_vextent(
+ break;
+ }
+
++ /*
++ * Blocking pass next, so we must obey minimum
++ * agno constraints to avoid ABBA AGF deadlocks.
++ */
+ flags = 0;
++ if (minimum_agno > sagno)
++ sagno = minimum_agno;
++
+ if (type == XFS_ALLOCTYPE_START_BNO) {
+ args->agbno = XFS_FSB_TO_AGBNO(mp,
+ args->fsbno);
+@@ -3322,9 +3344,9 @@ xfs_alloc_vextent(
+ ASSERT(0);
+ /* NOTREACHED */
+ }
+- if (args->agbno == NULLAGBLOCK)
++ if (args->agbno == NULLAGBLOCK) {
+ args->fsbno = NULLFSBLOCK;
+- else {
++ } else {
+ args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
+ #ifdef DEBUG
+ ASSERT(args->len >= args->minlen);
+@@ -3335,6 +3357,29 @@ xfs_alloc_vextent(
+ #endif
+
+ }
++
++ /*
++ * We end up here with a locked AGF. If we failed, the caller is likely
++ * going to try to allocate again with different parameters, and that
++ * can widen the AGs that are searched for free space. If we have to do
++ * BMBT block allocation, we have to do a new allocation.
++ *
++ * Hence leaving this function with the AGF locked opens up potential
++ * ABBA AGF deadlocks because a future allocation attempt in this
++ * transaction may attempt to lock a lower number AGF.
++ *
++ * We can't release the AGF until the transaction is commited, so at
++ * this point we must update the "firstblock" tracker to point at this
++ * AG if the tracker is empty or points to a lower AG. This allows the
++ * next allocation attempt to be modified appropriately to avoid
++ * deadlocks.
++ */
++ if (args->agbp &&
++ (args->tp->t_firstblock == NULLFSBLOCK ||
++ args->pag->pag_agno > minimum_agno)) {
++ args->tp->t_firstblock = XFS_AGB_TO_FSB(mp,
++ args->pag->pag_agno, 0);
++ }
+ xfs_perag_put(args->pag);
+ return 0;
+ error0:
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -3413,21 +3413,7 @@ xfs_bmap_process_allocated_extent(
+ xfs_fileoff_t orig_offset,
+ xfs_extlen_t orig_length)
+ {
+- int nullfb;
+-
+- nullfb = ap->tp->t_firstblock == NULLFSBLOCK;
+-
+- /*
+- * check the allocation happened at the same or higher AG than
+- * the first block that was allocated.
+- */
+- ASSERT(nullfb ||
+- XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <=
+- XFS_FSB_TO_AGNO(args->mp, args->fsbno));
+-
+ ap->blkno = args->fsbno;
+- if (nullfb)
+- ap->tp->t_firstblock = args->fsbno;
+ ap->length = args->len;
+ /*
+ * If the extent size hint is active, we tried to round the
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -1877,6 +1877,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_small_noten
+ DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
+ DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
++DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock);
+ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
+ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
+ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
--- /dev/null
+From stable+bounces-77034-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:17 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:44 -0700
+Subject: xfs: fix negative array access in xfs_getbmap
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, syzbot+c103d3808a0de5faaf80@syzkaller.appspotmail.com, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-20-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 1bba82fe1afac69c85c1f5ea137c8e73de3c8032 ]
+
+In commit 8ee81ed581ff, Ye Bin complained about an ASSERT in the bmapx
+code that trips if we encounter a delalloc extent after flushing the
+pagecache to disk. The ioctl code does not hold MMAPLOCK so it's
+entirely possible that a racing write page fault can create a delalloc
+extent after the file has been flushed. The proposed solution was to
+replace the assertion with an early return that avoids filling out the
+bmap recordset with a delalloc entry if the caller didn't ask for it.
+
+At the time, I recall thinking that the forward logic sounded ok, but
+felt hesitant because I suspected that changing this code would cause
+something /else/ to burst loose due to some other subtlety.
+
+syzbot of course found that subtlety. If all the extent mappings found
+after the flush are delalloc mappings, we'll reach the end of the data
+fork without ever incrementing bmv->bmv_entries. This is new, since
+before we'd have emitted the delalloc mappings even though the caller
+didn't ask for them. Once we reach the end, we'll try to set
+BMV_OF_LAST on the -1st entry (because bmv_entries is zero) and go
+corrupt something else in memory. Yay.
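+
+Reduced to its essence, the bug is indexing one element before an empty array;
+a standalone sketch of the guard (field and flag names are placeholders):
+
+    struct bmap_out { unsigned int oflags; };
+
+    #define OF_LAST 0x1
+
+    static void mark_last_record(struct bmap_out *out, unsigned int entries)
+    {
+        /* With zero entries there is no last record; out[entries - 1]
+         * would scribble on out[-1]. */
+        if (entries > 0)
+            out[entries - 1].oflags |= OF_LAST;
+    }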
+
+I really dislike all these stupid patches that fiddle around with debug
+code and break things that otherwise worked well enough. Nobody was
+complaining that calling XFS_IOC_BMAPX without BMV_IF_DELALLOC would
+return BMV_OF_DELALLOC records, and now we've gone from "weird behavior
+that nobody cared about" to "bad behavior that must be addressed
+immediately".
+
+Maybe I'll just ignore anything from Huawei from now on for my own sake.
+
+Reported-by: syzbot+c103d3808a0de5faaf80@syzkaller.appspotmail.com
+Link: https://lore.kernel.org/linux-xfs/20230412024907.GP360889@frogsfrogsfrogs/
+Fixes: 8ee81ed581ff ("xfs: fix BUG_ON in xfs_getbmap()")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_bmap_util.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -558,7 +558,9 @@ xfs_getbmap(
+ if (!xfs_iext_next_extent(ifp, &icur, &got)) {
+ xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+- out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST;
++ if (bmv->bmv_entries > 0)
++ out[bmv->bmv_entries - 1].bmv_oflags |=
++ BMV_OF_LAST;
+
+ if (whichfork != XFS_ATTR_FORK && bno < end &&
+ !xfs_getbmap_full(bmv)) {
--- /dev/null
+From stable+bounces-77040-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:49 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:50 -0700
+Subject: xfs: fix reloading entire unlinked bucket lists
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-26-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 537c013b140d373d1ffe6290b841dc00e67effaa ]
+
+During review of the patchset that provided reloading of the incore
+iunlink list, Dave made a few suggestions, and I updated the copy in my
+dev tree. Unfortunately, I then got distracted by ... who even knows
+what ... and forgot to backport those changes from my dev tree to my
+release candidate branch. I then sent multiple pull requests with stale
+patches, and that's what was merged into -rc3.
+
+So.
+
+This patch re-adds the use of an unlocked iunlink list check to
+determine if we want to allocate the resources to recreate the incore
+list. Since lost iunlinked inodes are supposed to be rare, this change
+helps us avoid paying the transaction and AGF locking costs every time
+we open any inode.
+
+This also re-adds the shutdowns on failure, and re-applies the
+restructuring of the inner loop in xfs_inode_reload_unlinked_bucket, and
+re-adds a requested comment about the quotachecking code.
+
+Retain the original RVB tag from Dave since there's no code change from
+the last submission.
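+
+The unlocked check follows the usual cheap-peek-then-recheck-under-lock shape;
+a hedged standalone sketch with invented names:
+
+    struct ino { int unlinked_incomplete; };
+
+    static int reload_bucket(struct ino *ip)    /* stand-in for the reload */
+    {
+        ip->unlinked_incomplete = 0;
+        return 0;
+    }
+
+    static int maybe_reload(struct ino *ip)
+    {
+        if (!ip->unlinked_incomplete)    /* unlocked peek: the common case */
+            return 0;
+        /* the real code takes ILOCK and the AGI buffer lock here */
+        if (!ip->unlinked_incomplete)    /* re-check under the locks */
+            return 0;
+        return reload_bucket(ip);
+    }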
+
+Fixes: 68b957f64fca1 ("xfs: load uncached unlinked inodes into memory on demand")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_export.c | 16 ++++++++++++----
+ fs/xfs/xfs_inode.c | 48 +++++++++++++++++++++++++++++++++++-------------
+ fs/xfs/xfs_itable.c | 2 ++
+ fs/xfs/xfs_qm.c | 15 ++++++++++++---
+ 4 files changed, 61 insertions(+), 20 deletions(-)
+
+--- a/fs/xfs/xfs_export.c
++++ b/fs/xfs/xfs_export.c
+@@ -146,10 +146,18 @@ xfs_nfs_get_inode(
+ return ERR_PTR(error);
+ }
+
+- error = xfs_inode_reload_unlinked(ip);
+- if (error) {
+- xfs_irele(ip);
+- return ERR_PTR(error);
++ /*
++ * Reload the incore unlinked list to avoid failure in inodegc.
++ * Use an unlocked check here because unrecovered unlinked inodes
++ * should be somewhat rare.
++ */
++ if (xfs_inode_unlinked_incomplete(ip)) {
++ error = xfs_inode_reload_unlinked(ip);
++ if (error) {
++ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
++ xfs_irele(ip);
++ return ERR_PTR(error);
++ }
+ }
+
+ if (VFS_I(ip)->i_generation != generation) {
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1744,6 +1744,14 @@ xfs_inactive(
+ truncate = 1;
+
+ if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
++ /*
++ * If this inode is being inactivated during a quotacheck and
++ * has not yet been scanned by quotacheck, we /must/ remove
++ * the dquots from the inode before inactivation changes the
++ * block and inode counts. Most probably this is a result of
++ * reloading the incore iunlinked list to purge unrecovered
++ * unlinked inodes.
++ */
+ xfs_qm_dqdetach(ip);
+ } else {
+ error = xfs_qm_dqattach(ip);
+@@ -3657,6 +3665,16 @@ xfs_inode_reload_unlinked_bucket(
+ if (error)
+ return error;
+
++ /*
++ * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the
++ * incore unlinked list pointers for this inode. Check once more to
++ * see if we raced with anyone else to reload the unlinked list.
++ */
++ if (!xfs_inode_unlinked_incomplete(ip)) {
++ foundit = true;
++ goto out_agibp;
++ }
++
+ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
+ agi = agibp->b_addr;
+
+@@ -3671,25 +3689,27 @@ xfs_inode_reload_unlinked_bucket(
+ while (next_agino != NULLAGINO) {
+ struct xfs_inode *next_ip = NULL;
+
++ /* Found this caller's inode, set its backlink. */
+ if (next_agino == agino) {
+- /* Found this inode, set its backlink. */
+ next_ip = ip;
+ next_ip->i_prev_unlinked = prev_agino;
+ foundit = true;
++ goto next_inode;
+ }
+- if (!next_ip) {
+- /* Inode already in memory. */
+- next_ip = xfs_iunlink_lookup(pag, next_agino);
+- }
+- if (!next_ip) {
+- /* Inode not in memory, reload. */
+- error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
+- next_agino);
+- if (error)
+- break;
+
+- next_ip = xfs_iunlink_lookup(pag, next_agino);
+- }
++ /* Try in-memory lookup first. */
++ next_ip = xfs_iunlink_lookup(pag, next_agino);
++ if (next_ip)
++ goto next_inode;
++
++ /* Inode not in memory, try reloading it. */
++ error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
++ next_agino);
++ if (error)
++ break;
++
++ /* Grab the reloaded inode. */
++ next_ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!next_ip) {
+ /* No incore inode at all? We reloaded it... */
+ ASSERT(next_ip != NULL);
+@@ -3697,10 +3717,12 @@ xfs_inode_reload_unlinked_bucket(
+ break;
+ }
+
++next_inode:
+ prev_agino = next_agino;
+ next_agino = next_ip->i_next_unlinked;
+ }
+
++out_agibp:
+ xfs_trans_brelse(tp, agibp);
+ /* Should have found this inode somewhere in the iunlinked bucket. */
+ if (!error && !foundit)
+--- a/fs/xfs/xfs_itable.c
++++ b/fs/xfs/xfs_itable.c
+@@ -80,10 +80,12 @@ xfs_bulkstat_one_int(
+ if (error)
+ goto out;
+
++ /* Reload the incore unlinked list to avoid failure in inodegc. */
+ if (xfs_inode_unlinked_incomplete(ip)) {
+ error = xfs_inode_reload_unlinked_bucket(tp, ip);
+ if (error) {
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
++ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ xfs_irele(ip);
+ return error;
+ }
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1160,9 +1160,18 @@ xfs_qm_dqusage_adjust(
+ if (error)
+ return error;
+
+- error = xfs_inode_reload_unlinked(ip);
+- if (error)
+- goto error0;
++ /*
++ * Reload the incore unlinked list to avoid failure in inodegc.
++ * Use an unlocked check here because unrecovered unlinked inodes
++ * should be somewhat rare.
++ */
++ if (xfs_inode_unlinked_incomplete(ip)) {
++ error = xfs_inode_reload_unlinked(ip);
++ if (error) {
++ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
++ goto error0;
++ }
++ }
+
+ ASSERT(ip->i_delayed_blks == 0);
+
--- /dev/null
+From stable+bounces-77032-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:11 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:42 -0700
+Subject: xfs: fix the calculation for "end" and "length"
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Shiyang Ruan <ruansy.fnst@fujitsu.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-18-leah.rumancik@gmail.com>
+
+From: Shiyang Ruan <ruansy.fnst@fujitsu.com>
+
+[ Upstream commit 5cf32f63b0f4c520460c1a5dd915dc4f09085f29 ]
+
+The value of "end" should be "start + length - 1".
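+
+Illustrative arithmetic only: a range starting at byte 100 with length 50
+covers bytes 100..149, so the inclusive end is start + length - 1; using
+start + length (150) points one past the end and can push the computed end
+block or AG one too far.
+
+    /* Last byte (inclusive) covered by [start, start + len). */
+    static unsigned long long range_end(unsigned long long start,
+                                        unsigned long long len)
+    {
+        return start + len - 1;    /* start=100, len=50 -> 149, not 150 */
+    }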
+
+Signed-off-by: Shiyang Ruan <ruansy.fnst@fujitsu.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_notify_failure.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_notify_failure.c
++++ b/fs/xfs/xfs_notify_failure.c
+@@ -114,7 +114,8 @@ xfs_dax_notify_ddev_failure(
+ int error = 0;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
+- xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen);
++ xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp,
++ daddr + bblen - 1);
+ xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+
+ error = xfs_trans_alloc_empty(mp, &tp);
+@@ -210,7 +211,7 @@ xfs_dax_notify_failure(
+ ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+
+ /* Ignore the range out of filesystem area */
+- if (offset + len < ddev_start)
++ if (offset + len - 1 < ddev_start)
+ return -ENXIO;
+ if (offset > ddev_end)
+ return -ENXIO;
+@@ -222,8 +223,8 @@ xfs_dax_notify_failure(
+ len -= ddev_start - offset;
+ offset = 0;
+ }
+- if (offset + len > ddev_end)
+- len -= ddev_end - offset;
++ if (offset + len - 1 > ddev_end)
++ len = ddev_end - offset + 1;
+
+ return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len),
+ mf_flags);
--- /dev/null
+From stable+bounces-77023-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:45 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:33 -0700
+Subject: xfs: fix uninitialized variable access
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, syzbot+090ae72d552e6bd93cfe@syzkaller.appspotmail.com, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-9-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 60b730a40c43fbcc034970d3e77eb0f25b8cc1cf ]
+
+If the end position of a GETFSMAP query overlaps an allocated space and
+we're using the free space info to generate fsmap info, the akeys
+information gets fed into the fsmap formatter with bad results.
+Zero-init the space.
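+
+A minimal sketch of the pattern (struct simplified): stack arrays are not
+zeroed automatically, so any field the query path leaves untouched is garbage
+unless the caller clears the array first.
+
+    #include <string.h>
+
+    struct alloc_rec { unsigned int startblock, blockcount; };
+
+    static void run_query(void)
+    {
+        struct alloc_rec keys[2];
+
+        memset(keys, 0, sizeof(keys));    /* otherwise uninitialised stack */
+        /* ... fill in only the fields the query actually needs ... */
+    }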
+
+Reported-by: syzbot+090ae72d552e6bd93cfe@syzkaller.appspotmail.com
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsmap.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_fsmap.c
++++ b/fs/xfs/xfs_fsmap.c
+@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt(
+ {
+ struct xfs_alloc_rec_incore akeys[2];
+
++ memset(akeys, 0, sizeof(akeys));
+ info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+ return __xfs_getfsmap_datadev(tp, keys, info,
+ xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
--- /dev/null
+From stable+bounces-77035-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:20 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:45 -0700
+Subject: xfs: fix unlink vs cluster buffer instantiation race
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Luis Chamberlain <mcgrof@kernel.org>, Christoph Hellwig <hch@lst.de>, "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandanbabu@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240924183851.1901667-21-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 348a1983cf4cf5099fc398438a968443af4c9f65 ]
+
+Luis has been reporting an assert failure when freeing an inode
+cluster during inode inactivation for a while. The assert looks
+like:
+
+ XFS: Assertion failed: bp->b_flags & XBF_DONE, file: fs/xfs/xfs_trans_buf.c, line: 241
+ ------------[ cut here ]------------
+ kernel BUG at fs/xfs/xfs_message.c:102!
+ Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI
+ CPU: 4 PID: 73 Comm: kworker/4:1 Not tainted 6.10.0-rc1 #4
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
+ Workqueue: xfs-inodegc/loop5 xfs_inodegc_worker [xfs]
+ RIP: 0010:assfail (fs/xfs/xfs_message.c:102) xfs
+ RSP: 0018:ffff88810188f7f0 EFLAGS: 00010202
+ RAX: 0000000000000000 RBX: ffff88816e748250 RCX: 1ffffffff844b0e7
+ RDX: 0000000000000004 RSI: ffff88810188f558 RDI: ffffffffc2431fa0
+ RBP: 1ffff11020311f01 R08: 0000000042431f9f R09: ffffed1020311e9b
+ R10: ffff88810188f4df R11: ffffffffac725d70 R12: ffff88817a3f4000
+ R13: ffff88812182f000 R14: ffff88810188f998 R15: ffffffffc2423f80
+ FS: 0000000000000000(0000) GS:ffff8881c8400000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 000055fe9d0f109c CR3: 000000014426c002 CR4: 0000000000770ef0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
+ PKRU: 55555554
+ Call Trace:
+ <TASK>
+ xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:241 (discriminator 1)) xfs
+ xfs_imap_to_bp (fs/xfs/xfs_trans.h:210 fs/xfs/libxfs/xfs_inode_buf.c:138) xfs
+ xfs_inode_item_precommit (fs/xfs/xfs_inode_item.c:145) xfs
+ xfs_trans_run_precommits (fs/xfs/xfs_trans.c:931) xfs
+ __xfs_trans_commit (fs/xfs/xfs_trans.c:966) xfs
+ xfs_inactive_ifree (fs/xfs/xfs_inode.c:1811) xfs
+ xfs_inactive (fs/xfs/xfs_inode.c:2013) xfs
+ xfs_inodegc_worker (fs/xfs/xfs_icache.c:1841 fs/xfs/xfs_icache.c:1886) xfs
+ process_one_work (kernel/workqueue.c:3231)
+ worker_thread (kernel/workqueue.c:3306 (discriminator 2) kernel/workqueue.c:3393 (discriminator 2))
+ kthread (kernel/kthread.c:389)
+ ret_from_fork (arch/x86/kernel/process.c:147)
+ ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
+ </TASK>
+
+And it occurs when the inode precommit handler attempts to look up the
+inode cluster buffer to attach the inode for writeback.
+
+The trail of logic that I can reconstruct is as follows.
+
+ 1. the inode is clean when inodegc runs, so it is not
+ attached to a cluster buffer when precommit runs.
+
+ 2. #1 implies the inode cluster buffer may be clean and not
+ pinned by dirty inodes when inodegc runs.
+
+ 3. #2 implies that the inode cluster buffer can be reclaimed
+ by memory pressure at any time.
+
+ 4. The assert failure implies that the cluster buffer was
+ attached to the transaction, but not marked done. It had
+ been accessed earlier in the transaction, but not marked
+ done.
+
+ 5. #4 implies the cluster buffer has been invalidated (i.e.
+ marked stale).
+
+ 6. #5 implies that the inode cluster buffer was instantiated
+ uninitialised in the transaction in xfs_ifree_cluster(),
+ which only instantiates the buffers to invalidate them
+ and never marks them as done.
+
+Given factors 1-3, this issue is highly dependent on timing and
+environmental factors. Hence the issue can be very difficult to
+reproduce in some situations, but highly reliable in others. Luis
+has an environment where it can be reproduced easily by g/531 but,
+OTOH, I've reproduced it only once in ~2000 cycles of g/531.
+
+I think the fix is to have xfs_ifree_cluster() set the XBF_DONE flag
+on the cluster buffers, even though they may not be initialised. The
+reasons why I think this is safe are:
+
+ 1. A buffer cache lookup hit on a XBF_STALE buffer will
+ clear the XBF_DONE flag. Hence all future users of the
+ buffer know they have to re-initialise the contents
+ before use and mark it done themselves.
+
+ 2. xfs_trans_binval() sets the XFS_BLI_STALE flag, which
+ means the buffer remains locked until the journal commit
+ completes and the buffer is unpinned. Hence once marked
+ XBF_STALE/XFS_BLI_STALE by xfs_ifree_cluster(), the only
+ context that can access the freed buffer is the currently
+ running transaction.
+
+ 3. #2 implies that future buffer lookups in the currently
+ running transaction will hit the transaction match code
+ and not the buffer cache. Hence XBF_STALE and
+ XFS_BLI_STALE will not be cleared unless the transaction
+ initialises and logs the buffer with valid contents
+ again. At which point, the buffer will be marked marked
+ XBF_DONE again, so having XBF_DONE already set on the
+ stale buffer is a moot point.
+
+ 4. #2 also implies that any concurrent access to that
+ cluster buffer will block waiting on the buffer lock
+ until the inode cluster has been fully freed and is no
+ longer an active inode cluster buffer.
+
+ 5. #4 + #1 means that any future user of the disk range of
+ that buffer will always see the range of disk blocks
+ covered by the cluster buffer as not done, and hence must
+ initialise the contents themselves.
+
+ 6. Setting XBF_DONE in xfs_ifree_cluster() then means the
+ unlinked inode precommit code will see a XBF_DONE buffer
+ from the transaction match as it expects. It can then
+ attach the stale but newly dirtied inode to the stale
+ but newly dirtied cluster buffer without unexpected
+ failures. The stale buffer will then sail through the
+ journal and do the right thing with the attached stale
+ inode during unpin.
+
+Hence the fix is just one line of extra code. The explanation of
+why we have to set XBF_DONE in xfs_ifree_cluster, OTOH, is long and
+complex....
+
+Fixes: 82842fee6e59 ("xfs: fix AGF vs inode cluster buffer deadlock")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Tested-by: Luis Chamberlain <mcgrof@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c | 23 +++++++++++++++++++----
+ 1 file changed, 19 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2297,11 +2297,26 @@ xfs_ifree_cluster(
+ * This buffer may not have been correctly initialised as we
+ * didn't read it from disk. That's not important because we are
+ * only using to mark the buffer as stale in the log, and to
+- * attach stale cached inodes on it. That means it will never be
+- * dispatched for IO. If it is, we want to know about it, and we
+- * want it to fail. We can acheive this by adding a write
+- * verifier to the buffer.
++ * attach stale cached inodes on it.
++ *
++ * For the inode that triggered the cluster freeing, this
++ * attachment may occur in xfs_inode_item_precommit() after we
++ * have marked this buffer stale. If this buffer was not in
++ * memory before xfs_ifree_cluster() started, it will not be
++ * marked XBF_DONE and this will cause problems later in
++ * xfs_inode_item_precommit() when we trip over a (stale, !done)
++ * buffer to attached to the transaction.
++ *
++ * Hence we have to mark the buffer as XFS_DONE here. This is
++ * safe because we are also marking the buffer as XBF_STALE and
++ * XFS_BLI_STALE. That means it will never be dispatched for
++ * IO and it won't be unlocked until the cluster freeing has
++ * been committed to the journal and the buffer unpinned. If it
++ * is written, we want to know about it, and we want it to
++ * fail. We can acheive this by adding a write verifier to the
++ * buffer.
+ */
++ bp->b_flags |= XBF_DONE;
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ /*
--- /dev/null
+From stable+bounces-77744-greg=kroah.com@vger.kernel.org Thu Sep 26 02:57:23 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 25 Sep 2024 17:57:05 -0700
+Subject: xfs: journal geometry is not properly bounds checked
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240926005705.2896598-1-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit f1e1765aad7de7a8b8102044fc6a44684bc36180 ]
+
+If the journal geometry results in a sector or log stripe unit
+validation problem, it indicates that we cannot set the log up to
+safely write to the journal. In these cases, we must abort the
+mount because the corruption needs external intervention to resolve.
+Similarly, a journal that is too large cannot be written to safely, so
+we shouldn't allow those geometries to mount either.
+
+If the log is too small, we risk having transaction reservations
+overrunning the available log space and the system hanging waiting
+for space it can never provide. This is purely a runtime hang issue,
+not a corruption issue as per the first cases listed above. We abort
+mounts of the log is too small for V5 filesystems, but we must allow
+v4 filesystems to mount because, historically, there was no log size
+validity checking and so some systems may still be out there with
+undersized logs.
+
+The problem is that on V4 filesystems, when we discover a log
+geometry problem, we skip all the remaining checks and then allow
+the log to continue mounting. This means that if one of the log size
+checks fails, we skip the log stripe unit check. i.e. we allow the
+mount because a "non-fatal" geometry is violated, and then fail to
+check the hard fail geometries that should fail the mount.
+
+Move all these fatal checks to the superblock verifier, and add a
+new check for the two log sector size geometry variables having the
+same values. This will prevent any attempt to mount a log that has
+invalid or inconsistent geometries long before we attempt to mount
+the log.
+
+However, for the minimum log size checks, we can only do that once
+we've set up the log and calculated all the iclog sizes and
+roundoffs. Hence this needs to remain in the log mount code after
+the log has been initialised. It is also the only case where we
+should allow a v4 filesystem to continue running, so leave that
+handling in place, too.
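+
+A condensed standalone sketch of the fatal geometry checks that belong in the
+verifier (field names and limits are simplified placeholders, not the real
+superblock layout):
+
+    struct log_geom {
+        unsigned long long logblocks;   /* log length in fs blocks */
+        unsigned int       logsunit;    /* log stripe unit in bytes */
+        unsigned int       blocksize;   /* fs block size in bytes */
+    };
+
+    /* Return 0 if the journal geometry is usable, -1 if the mount must fail.
+     * "Too small" is deliberately left to the later log mount check. */
+    static int validate_log_geometry(const struct log_geom *g,
+                                     unsigned long long max_logblocks,
+                                     unsigned int max_record_bsize)
+    {
+        if (g->logblocks > max_logblocks)
+            return -1;                  /* log too large: always fatal */
+        if (g->logsunit > 1 &&
+            (g->logsunit % g->blocksize ||
+             g->logsunit > max_record_bsize))
+            return -1;                  /* unusable log stripe unit */
+        return 0;
+    }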
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+
+Notes:
+ A new fix for the latest 6.1.y backport series just came in. Ran
+ some tests on it as well and all looks good. Please include with
+ the original set.
+
+ Thanks,
+ Leah
+
+ fs/xfs/libxfs/xfs_sb.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++-
+ fs/xfs/xfs_log.c | 47 +++++++++++++----------------------------
+ 2 files changed, 70 insertions(+), 33 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_sb.c
++++ b/fs/xfs/libxfs/xfs_sb.c
+@@ -413,7 +413,6 @@ xfs_validate_sb_common(
+ sbp->sb_inodelog < XFS_DINODE_MIN_LOG ||
+ sbp->sb_inodelog > XFS_DINODE_MAX_LOG ||
+ sbp->sb_inodesize != (1 << sbp->sb_inodelog) ||
+- sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE ||
+ sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) ||
+ XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES ||
+ XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES ||
+@@ -431,6 +430,61 @@ xfs_validate_sb_common(
+ return -EFSCORRUPTED;
+ }
+
++ /*
++ * Logs that are too large are not supported at all. Reject them
++ * outright. Logs that are too small are tolerated on v4 filesystems,
++ * but we can only check that when mounting the log. Hence we skip
++ * those checks here.
++ */
++ if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) {
++ xfs_notice(mp,
++ "Log size 0x%x blocks too large, maximum size is 0x%llx blocks",
++ sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS);
++ return -EFSCORRUPTED;
++ }
++
++ if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) {
++ xfs_warn(mp,
++ "log size 0x%llx bytes too large, maximum size is 0x%llx bytes",
++ XFS_FSB_TO_B(mp, sbp->sb_logblocks),
++ XFS_MAX_LOG_BYTES);
++ return -EFSCORRUPTED;
++ }
++
++ /*
++ * Do not allow filesystems with corrupted log sector or stripe units to
++ * be mounted. We cannot safely size the iclogs or write to the log if
++ * the log stripe unit is not valid.
++ */
++ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) {
++ if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) {
++ xfs_notice(mp,
++ "log sector size in bytes/log2 (0x%x/0x%x) must match",
++ sbp->sb_logsectsize, 1U << sbp->sb_logsectlog);
++ return -EFSCORRUPTED;
++ }
++ } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) {
++ xfs_notice(mp,
++ "log sector size in bytes/log2 (0x%x/0x%x) are not zero",
++ sbp->sb_logsectsize, sbp->sb_logsectlog);
++ return -EFSCORRUPTED;
++ }
++
++ if (sbp->sb_logsunit > 1) {
++ if (sbp->sb_logsunit % sbp->sb_blocksize) {
++ xfs_notice(mp,
++ "log stripe unit 0x%x bytes must be a multiple of block size",
++ sbp->sb_logsunit);
++ return -EFSCORRUPTED;
++ }
++ if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) {
++ xfs_notice(mp,
++ "log stripe unit 0x%x bytes over maximum size (0x%x bytes)",
++ sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE);
++ return -EFSCORRUPTED;
++ }
++ }
++
+ /* Validate the realtime geometry; stolen from xfs_repair */
+ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE ||
+ sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) {
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -639,7 +639,6 @@ xfs_log_mount(
+ int num_bblks)
+ {
+ struct xlog *log;
+- bool fatal = xfs_has_crc(mp);
+ int error = 0;
+ int min_logfsbs;
+
+@@ -661,53 +660,37 @@ xfs_log_mount(
+ mp->m_log = log;
+
+ /*
+- * Validate the given log space and drop a critical message via syslog
+- * if the log size is too small that would lead to some unexpected
+- * situations in transaction log space reservation stage.
++ * Now that we have set up the log and it's internal geometry
++ * parameters, we can validate the given log space and drop a critical
++ * message via syslog if the log size is too small. A log that is too
++ * small can lead to unexpected situations in transaction log space
++ * reservation stage. The superblock verifier has already validated all
++ * the other log geometry constraints, so we don't have to check those
++ * here.
+ *
+- * Note: we can't just reject the mount if the validation fails. This
+- * would mean that people would have to downgrade their kernel just to
+- * remedy the situation as there is no way to grow the log (short of
+- * black magic surgery with xfs_db).
++ * Note: For v4 filesystems, we can't just reject the mount if the
++ * validation fails. This would mean that people would have to
++ * downgrade their kernel just to remedy the situation as there is no
++ * way to grow the log (short of black magic surgery with xfs_db).
+ *
+- * We can, however, reject mounts for CRC format filesystems, as the
++ * We can, however, reject mounts for V5 format filesystems, as the
+ * mkfs binary being used to make the filesystem should never create a
+ * filesystem with a log that is too small.
+ */
+ min_logfsbs = xfs_log_calc_minimum_size(mp);
+-
+ if (mp->m_sb.sb_logblocks < min_logfsbs) {
+ xfs_warn(mp,
+ "Log size %d blocks too small, minimum size is %d blocks",
+ mp->m_sb.sb_logblocks, min_logfsbs);
+- error = -EINVAL;
+- } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) {
+- xfs_warn(mp,
+- "Log size %d blocks too large, maximum size is %lld blocks",
+- mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS);
+- error = -EINVAL;
+- } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) {
+- xfs_warn(mp,
+- "log size %lld bytes too large, maximum size is %lld bytes",
+- XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks),
+- XFS_MAX_LOG_BYTES);
+- error = -EINVAL;
+- } else if (mp->m_sb.sb_logsunit > 1 &&
+- mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) {
+- xfs_warn(mp,
+- "log stripe unit %u bytes must be a multiple of block size",
+- mp->m_sb.sb_logsunit);
+- error = -EINVAL;
+- fatal = true;
+- }
+- if (error) {
++
+ /*
+ * Log check errors are always fatal on v5; or whenever bad
+ * metadata leads to a crash.
+ */
+- if (fatal) {
++ if (xfs_has_crc(mp)) {
+ xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!");
+ ASSERT(0);
++ error = -EINVAL;
+ goto out_free_log;
+ }
+ xfs_crit(mp, "Log size out of supported range.");
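+
+As a rough standalone illustration of the journal geometry rules that the
+new superblock checks enforce, the sketch below restates them as a single
+user-space predicate. The limit values and every name in it are made-up
+stand-ins, not the kernel's definitions.
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+
+/* Illustrative placeholder limits; the kernel derives the real ones. */
+#define MAX_LOG_BLOCKS	(1024U * 1024U)		/* stand-in for XFS_MAX_LOG_BLOCKS */
+#define MAX_LOG_BYTES	(2147483648ULL)		/* stand-in for XFS_MAX_LOG_BYTES */
+#define MAX_RECORD_SIZE	(256U * 1024U)		/* stand-in for XLOG_MAX_RECORD_BSIZE */
+
+struct log_geom {
+	uint32_t blocksize;	/* fs block size in bytes */
+	uint32_t logblocks;	/* log length in fs blocks */
+	uint32_t logsectsize;	/* log sector size in bytes */
+	uint32_t logsectlog;	/* log2 of log sector size */
+	uint32_t logsunit;	/* log stripe unit in bytes */
+	bool	 has_sector;	/* sector size feature bit set */
+};
+
+/* Returns true if the geometry passes the same style of checks. */
+static bool log_geom_valid(const struct log_geom *g)
+{
+	if (g->logblocks > MAX_LOG_BLOCKS)
+		return false;
+	if ((uint64_t)g->logblocks * g->blocksize > MAX_LOG_BYTES)
+		return false;
+	if (g->has_sector) {
+		if (g->logsectsize != (1U << g->logsectlog))
+			return false;
+	} else if (g->logsectsize || g->logsectlog) {
+		return false;
+	}
+	if (g->logsunit > 1) {
+		if (g->logsunit % g->blocksize)
+			return false;
+		if (g->logsunit > MAX_RECORD_SIZE)
+			return false;
+	}
+	return true;
+}
+
+int main(void)
+{
+	struct log_geom g = {
+		.blocksize = 4096, .logblocks = 8192,
+		.logsectsize = 512, .logsectlog = 9,
+		.logsunit = 0, .has_sector = true,
+	};
+
+	printf("valid: %d\n", log_geom_valid(&g));
+	return 0;
+}
+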
--- /dev/null
+From stable+bounces-77033-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:13 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:43 -0700
+Subject: xfs: load uncached unlinked inodes into memory on demand
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, shrikanth hegde <sshegde@linux.vnet.ibm.com>, Ritesh Harjani <ritesh.list@gmail.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-19-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 68b957f64fca1930164bfc6d6d379acdccd547d7 ]
+
+shrikanth hegde reports that filesystems fail shortly after mount with
+the following failure:
+
+ WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs]
+
+This of course is the WARN_ON_ONCE in xfs_iunlink_lookup:
+
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... }
+
+From diagnostic data collected by the bug reporters, it would appear
+that we cleanly mounted a filesystem that contained unlinked inodes.
+Unlinked inodes are only processed as a final step of log recovery,
+which means that clean mounts do not process the unlinked list at all.
+
+Prior to the introduction of the incore unlinked lists, this wasn't a
+problem because the unlink code would (very expensively) traverse the
+entire ondisk metadata iunlink chain to keep things up to date.
+However, the incore unlinked list code complains when it realizes that
+it is out of sync with the ondisk metadata and shuts down the fs, which
+is bad.
+
+Ritesh proposed to solve this problem by unconditionally parsing the
+unlinked lists at mount time, but this imposes a mount time cost for
+every filesystem to catch something that should be very infrequent.
+Instead, let's target the places where we can encounter a next_unlinked
+pointer that refers to an inode that is not in cache, and load it into
+cache.
+
+Note: This patch does not address the problem of iget loading an inode
+from the middle of the iunlink list and needing to set i_prev_unlinked
+correctly.
+
+Reported-by: shrikanth hegde <sshegde@linux.vnet.ibm.com>
+Triaged-by: Ritesh Harjani <ritesh.list@gmail.com>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
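+A rough user-space sketch of the idea: when a next_unlinked pointer names
+an inode that is not in the incore cache, load it on demand and set its
+back-pointer instead of treating the cache miss as corruption. Every
+structure and helper below is a hypothetical stand-in, not the kernel's.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define NULLAGINO 0xffffffffu
+#define NINODES 64
+
+struct ino {
+	unsigned int agino;
+	unsigned int next_unlinked;	/* mirrors the ondisk pointer */
+	unsigned int prev_unlinked;	/* incore back-pointer */
+};
+
+static struct ino *cache[NINODES];	/* stand-in for the incore inode cache */
+
+/* Pretend to read the inode from disk; here we just allocate it. */
+static struct ino *load_from_disk(unsigned int agino, unsigned int ondisk_next)
+{
+	struct ino *ip = calloc(1, sizeof(*ip));
+
+	ip->agino = agino;
+	ip->next_unlinked = ondisk_next;
+	cache[agino] = ip;
+	return ip;
+}
+
+/*
+ * Set the back-pointer of @next_agino to @prev_agino, loading the inode
+ * into the cache if it is not there (the analogue of the -ENOLINK case).
+ */
+static void update_backref(unsigned int prev_agino, unsigned int next_agino,
+			   unsigned int ondisk_next)
+{
+	struct ino *ip = cache[next_agino];
+
+	if (!ip)
+		ip = load_from_disk(next_agino, ondisk_next);
+	ip->prev_unlinked = prev_agino;
+}
+
+int main(void)
+{
+	/* Inode 22 is not cached; inserting 7 ahead of it loads it. */
+	update_backref(7, 22, NULLAGINO);
+	printf("prev of 22: %u\n", cache[22]->prev_unlinked);
+	return 0;
+}
+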
+ fs/xfs/xfs_inode.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++----
+ fs/xfs/xfs_trace.h | 25 ++++++++++++++++
+ 2 files changed, 100 insertions(+), 5 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1829,12 +1829,17 @@ xfs_iunlink_lookup(
+
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
++ if (!ip) {
++ /* Caller can handle inode not being in memory. */
++ rcu_read_unlock();
++ return NULL;
++ }
+
+ /*
+- * Inode not in memory or in RCU freeing limbo should not happen.
+- * Warn about this and let the caller handle the failure.
++ * Inode in RCU freeing limbo should not happen. Warn about this and
++ * let the caller handle the failure.
+ */
+- if (WARN_ON_ONCE(!ip || !ip->i_ino)) {
++ if (WARN_ON_ONCE(!ip->i_ino)) {
+ rcu_read_unlock();
+ return NULL;
+ }
+@@ -1843,7 +1848,10 @@ xfs_iunlink_lookup(
+ return ip;
+ }
+
+-/* Update the prev pointer of the next agino. */
++/*
++ * Update the prev pointer of the next agino. Returns -ENOLINK if the inode
++ * is not in cache.
++ */
+ static int
+ xfs_iunlink_update_backref(
+ struct xfs_perag *pag,
+@@ -1858,7 +1866,8 @@ xfs_iunlink_update_backref(
+
+ ip = xfs_iunlink_lookup(pag, next_agino);
+ if (!ip)
+- return -EFSCORRUPTED;
++ return -ENOLINK;
++
+ ip->i_prev_unlinked = prev_agino;
+ return 0;
+ }
+@@ -1902,6 +1911,62 @@ xfs_iunlink_update_bucket(
+ return 0;
+ }
+
++/*
++ * Load the inode @next_agino into the cache and set its prev_unlinked pointer
++ * to @prev_agino. Caller must hold the AGI to synchronize with other changes
++ * to the unlinked list.
++ */
++STATIC int
++xfs_iunlink_reload_next(
++ struct xfs_trans *tp,
++ struct xfs_buf *agibp,
++ xfs_agino_t prev_agino,
++ xfs_agino_t next_agino)
++{
++ struct xfs_perag *pag = agibp->b_pag;
++ struct xfs_mount *mp = pag->pag_mount;
++ struct xfs_inode *next_ip = NULL;
++ xfs_ino_t ino;
++ int error;
++
++ ASSERT(next_agino != NULLAGINO);
++
++#ifdef DEBUG
++ rcu_read_lock();
++ next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino);
++ ASSERT(next_ip == NULL);
++ rcu_read_unlock();
++#endif
++
++ xfs_info_ratelimited(mp,
++ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.",
++ next_agino, pag->pag_agno);
++
++ /*
++ * Use an untrusted lookup just to be cautious in case the AGI has been
++ * corrupted and now points at a free inode. That shouldn't happen,
++ * but we'd rather shut down now since we're already running in a weird
++ * situation.
++ */
++ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
++ error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
++ if (error)
++ return error;
++
++ /* If this is not an unlinked inode, something is very wrong. */
++ if (VFS_I(next_ip)->i_nlink != 0) {
++ error = -EFSCORRUPTED;
++ goto rele;
++ }
++
++ next_ip->i_prev_unlinked = prev_agino;
++ trace_xfs_iunlink_reload_next(next_ip);
++rele:
++ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
++ xfs_irele(next_ip);
++ return error;
++}
++
+ static int
+ xfs_iunlink_insert_inode(
+ struct xfs_trans *tp,
+@@ -1933,6 +1998,8 @@ xfs_iunlink_insert_inode(
+ * inode.
+ */
+ error = xfs_iunlink_update_backref(pag, agino, next_agino);
++ if (error == -ENOLINK)
++ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino);
+ if (error)
+ return error;
+
+@@ -2027,6 +2094,9 @@ xfs_iunlink_remove_inode(
+ */
+ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked,
+ ip->i_next_unlinked);
++ if (error == -ENOLINK)
++ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked,
++ ip->i_next_unlinked);
+ if (error)
+ return error;
+
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -3679,6 +3679,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode,
+ __entry->new_ptr)
+ );
+
++TRACE_EVENT(xfs_iunlink_reload_next,
++ TP_PROTO(struct xfs_inode *ip),
++ TP_ARGS(ip),
++ TP_STRUCT__entry(
++ __field(dev_t, dev)
++ __field(xfs_agnumber_t, agno)
++ __field(xfs_agino_t, agino)
++ __field(xfs_agino_t, prev_agino)
++ __field(xfs_agino_t, next_agino)
++ ),
++ TP_fast_assign(
++ __entry->dev = ip->i_mount->m_super->s_dev;
++ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
++ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
++ __entry->prev_agino = ip->i_prev_unlinked;
++ __entry->next_agino = ip->i_next_unlinked;
++ ),
++ TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x",
++ MAJOR(__entry->dev), MINOR(__entry->dev),
++ __entry->agno,
++ __entry->agino,
++ __entry->prev_agino,
++ __entry->next_agino)
++);
++
+ DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
--- /dev/null
+From stable+bounces-77039-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:38 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:49 -0700
+Subject: xfs: make inode unlinked bucket recovery work with quotacheck
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-25-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 49813a21ed57895b73ec4ed3b99d4beec931496f ]
+
+Teach quotacheck to reload the unlinked inode lists when walking the
+inode table. This requires extra state handling, since it's possible
+that a reloaded inode will get inactivated before quotacheck tries to
+scan it; in this case, we need to ensure that the reloaded inode does
+not have dquots attached when it is freed.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
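+A small user-space sketch of the extra state handling, assuming a
+hypothetical flag and helpers (none of these are the kernel's): inodes
+reloaded while the scan is running get marked, and a marked inode skips
+the dquot-attach step if it is torn down before the scan reaches it.
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#define FLAG_QUOTA_UNCHECKED	(1u << 0)	/* stand-in for XFS_IQUOTAUNCHECKED */
+
+struct inode { unsigned int flags; };
+
+static bool quotacheck_running;
+
+/* Called when an unlinked inode is reloaded mid-quotacheck. */
+static void mark_reloaded(struct inode *ip)
+{
+	if (quotacheck_running)
+		ip->flags |= FLAG_QUOTA_UNCHECKED;
+}
+
+/* Called at inactivation: never attach dquots to an unchecked inode. */
+static void inactivate(struct inode *ip)
+{
+	if (ip->flags & FLAG_QUOTA_UNCHECKED)
+		printf("detach dquots, skip attach\n");
+	else
+		printf("attach dquots as usual\n");
+}
+
+int main(void)
+{
+	struct inode ip = { 0 };
+
+	quotacheck_running = true;
+	mark_reloaded(&ip);
+	inactivate(&ip);	/* prints: detach dquots, skip attach */
+	return 0;
+}
+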
+ fs/xfs/xfs_attr_inactive.c | 1 -
+ fs/xfs/xfs_inode.c | 12 +++++++++---
+ fs/xfs/xfs_inode.h | 5 ++++-
+ fs/xfs/xfs_mount.h | 10 +++++++++-
+ fs/xfs/xfs_qm.c | 7 +++++++
+ 5 files changed, 29 insertions(+), 6 deletions(-)
+
+--- a/fs/xfs/xfs_attr_inactive.c
++++ b/fs/xfs/xfs_attr_inactive.c
+@@ -333,7 +333,6 @@ xfs_attr_inactive(
+ int error = 0;
+
+ mp = dp->i_mount;
+- ASSERT(! XFS_NOT_DQATTACHED(mp, dp));
+
+ xfs_ilock(dp, lock_mode);
+ if (!xfs_inode_has_attr_fork(dp))
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1743,9 +1743,13 @@ xfs_inactive(
+ ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0))
+ truncate = 1;
+
+- error = xfs_qm_dqattach(ip);
+- if (error)
+- goto out;
++ if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) {
++ xfs_qm_dqdetach(ip);
++ } else {
++ error = xfs_qm_dqattach(ip);
++ if (error)
++ goto out;
++ }
+
+ if (S_ISLNK(VFS_I(ip)->i_mode))
+ error = xfs_inactive_symlink(ip);
+@@ -1963,6 +1967,8 @@ xfs_iunlink_reload_next(
+ trace_xfs_iunlink_reload_next(next_ip);
+ rele:
+ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE));
++ if (xfs_is_quotacheck_running(mp) && next_ip)
++ xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED);
+ xfs_irele(next_ip);
+ return error;
+ }
+--- a/fs/xfs/xfs_inode.h
++++ b/fs/xfs/xfs_inode.h
+@@ -344,6 +344,9 @@ static inline bool xfs_inode_has_large_e
+ */
+ #define XFS_INACTIVATING (1 << 13)
+
++/* Quotacheck is running but inode has not been added to quota counts. */
++#define XFS_IQUOTAUNCHECKED (1 << 14)
++
+ /* All inode state flags related to inode reclaim. */
+ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \
+ XFS_IRECLAIM | \
+@@ -358,7 +361,7 @@ static inline bool xfs_inode_has_large_e
+ #define XFS_IRECLAIM_RESET_FLAGS \
+ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \
+ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \
+- XFS_INACTIVATING)
++ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED)
+
+ /*
+ * Flags for inode locking.
+--- a/fs/xfs/xfs_mount.h
++++ b/fs/xfs/xfs_mount.h
+@@ -401,6 +401,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID)
+ #define XFS_OPSTATE_WARNED_SHRINK 8
+ /* Kernel has logged a warning about logged xattr updates being used. */
+ #define XFS_OPSTATE_WARNED_LARP 9
++/* Mount time quotacheck is running */
++#define XFS_OPSTATE_QUOTACHECK_RUNNING 10
+
+ #define __XFS_IS_OPSTATE(name, NAME) \
+ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \
+@@ -423,6 +425,11 @@ __XFS_IS_OPSTATE(inode32, INODE32)
+ __XFS_IS_OPSTATE(readonly, READONLY)
+ __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED)
+ __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED)
++#ifdef CONFIG_XFS_QUOTA
++__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING)
++#else
++# define xfs_is_quotacheck_running(mp) (false)
++#endif
+
+ static inline bool
+ xfs_should_warn(struct xfs_mount *mp, long nr)
+@@ -440,7 +447,8 @@ xfs_should_warn(struct xfs_mount *mp, lo
+ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \
+ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \
+- { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }
++ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \
++ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }
+
+ /*
+ * Max and min values for mount-option defined I/O
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1160,6 +1160,10 @@ xfs_qm_dqusage_adjust(
+ if (error)
+ return error;
+
++ error = xfs_inode_reload_unlinked(ip);
++ if (error)
++ goto error0;
++
+ ASSERT(ip->i_delayed_blks == 0);
+
+ if (XFS_IS_REALTIME_INODE(ip)) {
+@@ -1173,6 +1177,7 @@ xfs_qm_dqusage_adjust(
+ }
+
+ nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks;
++ xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED);
+
+ /*
+ * Add the (disk blocks and inode) resources occupied by this
+@@ -1319,8 +1324,10 @@ xfs_qm_quotacheck(
+ flags |= XFS_PQUOTA_CHKD;
+ }
+
++ xfs_set_quotacheck_running(mp);
+ error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+ NULL);
++ xfs_clear_quotacheck_running(mp);
+
+ /*
+ * On error, the inode walk may have partially populated the dquot
--- /dev/null
+From stable+bounces-77021-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:37 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:31 -0700
+Subject: xfs: prefer free inodes at ENOSPC over chunk allocation
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Allison Henderson <allison.henderson@oracle.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-7-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit f08f984c63e9980614ae3a0a574b31eaaef284b2 ]
+
+When an XFS filesystem has free inodes in chunks already allocated
+on disk, it will still allocate new inode chunks if the target AG
+has no free inodes in it. Normally, this is a good idea as it
+preserves locality of all the inodes in a given directory.
+
+However, at ENOSPC this can lead to using the last few remaining
+free filesystem blocks to allocate a new chunk when there are many,
+many free inodes that could be allocated without consuming free
+space. This speeds up consumption of the last few blocks, and inode
+create operations then return ENOSPC even though free inodes are
+available, because there are not enough blocks left in the filesystem
+for directory creation reservations to proceed.
+
+Hence when we are near ENOSPC, we should be attempting to preserve
+the remaining blocks for directory block allocation rather than
+using them for unnecessary inode chunk creation.
+
+This particular behaviour is exposed by xfs/294, when it drives to
+ENOSPC on empty file creation whilst there are still thousands of
+free inodes available for allocation in other AGs in the filesystem.
+
+Hence, when we are within 1% of ENOSPC, change the inode allocation
+behaviour to prefer to use existing free inodes over allocating new
+inode chunks, even though it results in poorer locality of the data
+set. It is more important for the allocations to be space efficient
+near ENOSPC than to have optimal locality for performance, so let's
+modify the inode AG selection code to reflect that fact.
+
+This allows generic/294 to not only pass with this allocator rework
+patchset, but to increase the number of post-ENOSPC empty inode
+allocations from ~600 to ~9080 before we hit ENOSPC on the
+directory create transaction reservation.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Allison Henderson <allison.henderson@oracle.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
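+A toy user-space sketch of the two-pass selection described above (the AG
+descriptors and threshold handling here are invented for illustration and
+are not the kernel's): the first pass only consumes existing free inodes
+when space is low, and chunk allocation is re-enabled for the retry pass.
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#define NAGS 4
+
+/* Toy AG descriptor; not the kernel's struct xfs_perag. */
+struct ag {
+	unsigned long free_inodes;	/* free inodes in existing chunks */
+	unsigned long free_blocks;	/* blocks available for a new chunk */
+};
+
+/*
+ * Pick an AG to allocate an inode from.  Near ENOSPC the first pass
+ * refuses to allocate new inode chunks so existing free inodes are
+ * consumed first; chunk allocation is only re-enabled on the retry.
+ */
+static int pick_ag(struct ag ags[NAGS], bool low_space)
+{
+	bool ok_alloc = !low_space;
+
+	for (int pass = 0; pass < 2; pass++) {
+		for (int i = 0; i < NAGS; i++) {
+			if (ags[i].free_inodes)
+				return i;
+			if (ok_alloc && ags[i].free_blocks)
+				return i;
+		}
+		if (low_space)
+			ok_alloc = true;	/* second pass may allocate chunks */
+	}
+	return -1;
+}
+
+int main(void)
+{
+	struct ag ags[NAGS] = {
+		{ .free_inodes = 0, .free_blocks = 8 },
+		{ .free_inodes = 12, .free_blocks = 0 },
+	};
+
+	/* Near ENOSPC we prefer AG 1 (free inodes) over AG 0 (new chunk). */
+	printf("picked AG %d\n", pick_ag(ags, true));
+	return 0;
+}
+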
+ fs/xfs/libxfs/xfs_ialloc.c | 17 +++++++++++++++++
+ 1 file changed, 17 insertions(+)
+
+--- a/fs/xfs/libxfs/xfs_ialloc.c
++++ b/fs/xfs/libxfs/xfs_ialloc.c
+@@ -1737,6 +1737,7 @@ xfs_dialloc(
+ struct xfs_perag *pag;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ bool ok_alloc = true;
++ bool low_space = false;
+ int flags;
+ xfs_ino_t ino;
+
+@@ -1768,6 +1769,20 @@ xfs_dialloc(
+ }
+
+ /*
++ * If we are near to ENOSPC, we want to prefer allocation from AGs that
++ * have free inodes in them rather than use up free space allocating new
++ * inode chunks. Hence we turn off allocation for the first non-blocking
++ * pass through the AGs if we are near ENOSPC to consume free inodes
++ * that we can immediately allocate, but then we allow allocation on the
++ * second pass if we fail to find an AG with free inodes in it.
++ */
++ if (percpu_counter_read_positive(&mp->m_fdblocks) <
++ mp->m_low_space[XFS_LOWSP_1_PCNT]) {
++ ok_alloc = false;
++ low_space = true;
++ }
++
++ /*
+ * Loop until we find an allocation group that either has free inodes
+ * or in which we can allocate some inodes. Iterate through the
+ * allocation groups upward, wrapping at the end.
+@@ -1795,6 +1810,8 @@ xfs_dialloc(
+ break;
+ }
+ flags = 0;
++ if (low_space)
++ ok_alloc = true;
+ }
+ xfs_perag_put(pag);
+ }
--- /dev/null
+From stable+bounces-77024-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:50 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:34 -0700
+Subject: xfs: quotacheck failure can race with background inode inactivation
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, Pengfei Xu <pengfei.xu@intel.com>, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-10-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 0c7273e494dd5121e20e160cb2f047a593ee14a8 ]
+
+The background inode inactivation can attach dquots to inodes, but
+this can race with a foreground quotacheck failure that leads to
+disabling quotas and freeing the mp->m_quotainfo structure. The
+background inode inactivation then tries to allocate a dquot,
+dereferences mp->m_quotainfo, and crashes like so:
+
+XFS (loop1): Quotacheck: Unsuccessful (Error -5): Disabling quotas.
+xfs filesystem being mounted at /root/syzkaller.qCVHXV/0/file0 supports timestamps until 2038 (0x7fffffff)
+BUG: kernel NULL pointer dereference, address: 00000000000002a8
+....
+CPU: 0 PID: 161 Comm: kworker/0:4 Not tainted 6.2.0-c9c3395d5e3d #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014
+Workqueue: xfs-inodegc/loop1 xfs_inodegc_worker
+RIP: 0010:xfs_dquot_alloc+0x95/0x1e0
+....
+Call Trace:
+ <TASK>
+ xfs_qm_dqread+0x46/0x440
+ xfs_qm_dqget_inode+0x154/0x500
+ xfs_qm_dqattach_one+0x142/0x3c0
+ xfs_qm_dqattach_locked+0x14a/0x170
+ xfs_qm_dqattach+0x52/0x80
+ xfs_inactive+0x186/0x340
+ xfs_inodegc_worker+0xd3/0x430
+ process_one_work+0x3b1/0x960
+ worker_thread+0x52/0x660
+ kthread+0x161/0x1a0
+ ret_from_fork+0x29/0x50
+ </TASK>
+....
+
+Prevent this race by flushing all the queued background inode
+inactivations pending before purging all the cached dquots when
+quotacheck fails.
+
+Reported-by: Pengfei Xu <pengfei.xu@intel.com>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
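+The ordering can be sketched with a generic user-space example (a plain
+pthread worker standing in for the inodegc workqueue; nothing here is the
+kernel's API): quiesce the background work before freeing the shared
+state it might dereference.
+
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+struct quotainfo { int dummy; };
+
+static struct quotainfo *qi;		/* stand-in for mp->m_quotainfo */
+
+/* Background work that dereferences the shared quotainfo. */
+static void *background_inactivation(void *arg)
+{
+	(void)arg;
+	if (qi)
+		printf("worker: attached dquots via quotainfo %d\n", qi->dummy);
+	return NULL;
+}
+
+int main(void)
+{
+	pthread_t worker;
+
+	qi = calloc(1, sizeof(*qi));
+	pthread_create(&worker, NULL, background_inactivation, NULL);
+
+	/*
+	 * Quotacheck failed: flush (here, join) the background work before
+	 * purging and freeing the shared state, so the worker can never see
+	 * a freed quotainfo.
+	 */
+	pthread_join(worker, NULL);
+	free(qi);
+	qi = NULL;
+	return 0;
+}
+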
+ fs/xfs/xfs_qm.c | 40 ++++++++++++++++++++++++++--------------
+ 1 file changed, 26 insertions(+), 14 deletions(-)
+
+--- a/fs/xfs/xfs_qm.c
++++ b/fs/xfs/xfs_qm.c
+@@ -1321,15 +1321,14 @@ xfs_qm_quotacheck(
+
+ error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true,
+ NULL);
+- if (error) {
+- /*
+- * The inode walk may have partially populated the dquot
+- * caches. We must purge them before disabling quota and
+- * tearing down the quotainfo, or else the dquots will leak.
+- */
+- xfs_qm_dqpurge_all(mp);
+- goto error_return;
+- }
++
++ /*
++ * On error, the inode walk may have partially populated the dquot
++ * caches. We must purge them before disabling quota and tearing down
++ * the quotainfo, or else the dquots will leak.
++ */
++ if (error)
++ goto error_purge;
+
+ /*
+ * We've made all the changes that we need to make incore. Flush them
+@@ -1363,10 +1362,8 @@ xfs_qm_quotacheck(
+ * and turn quotaoff. The dquots won't be attached to any of the inodes
+ * at this point (because we intentionally didn't in dqget_noattach).
+ */
+- if (error) {
+- xfs_qm_dqpurge_all(mp);
+- goto error_return;
+- }
++ if (error)
++ goto error_purge;
+
+ /*
+ * If one type of quotas is off, then it will lose its
+@@ -1376,7 +1373,7 @@ xfs_qm_quotacheck(
+ mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD;
+ mp->m_qflags |= flags;
+
+- error_return:
++error_return:
+ xfs_buf_delwri_cancel(&buffer_list);
+
+ if (error) {
+@@ -1395,6 +1392,21 @@ xfs_qm_quotacheck(
+ } else
+ xfs_notice(mp, "Quotacheck: Done.");
+ return error;
++
++error_purge:
++ /*
++ * On error, we may have inodes queued for inactivation. This may try
++ * to attach dquots to the inode before running cleanup operations on
++ * the inode and this can race with the xfs_qm_destroy_quotainfo() call
++ * below that frees mp->m_quotainfo. To avoid this race, flush all the
++ * pending inodegc operations before we purge the dquots from memory,
++ * ensuring that background inactivation is idle whilst we turn off
++ * quotas.
++ */
++ xfs_inodegc_flush(mp);
++ xfs_qm_dqpurge_all(mp);
++ goto error_return;
++
+ }
+
+ /*
--- /dev/null
+From stable+bounces-77038-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:31 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:48 -0700
+Subject: xfs: reload entire unlinked bucket lists
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-24-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 83771c50e42b92de6740a63e152c96c052d37736 ]
+
+The previous patch to reload unrecovered unlinked inodes when adding a
+newly created inode to the unlinked list is missing a key piece of
+functionality. It doesn't handle the case that someone calls xfs_iget
+on an inode that is not the last item in the incore list. For example,
+if at mount time the ondisk iunlink bucket looks like this:
+
+AGI -> 7 -> 22 -> 3 -> NULL
+
+None of these three inodes are cached in memory. Now let's say that
+someone tries to open inode 3 by handle. We need to walk the list to
+make sure that inodes 7 and 22 get loaded cold, and that the
+i_prev_unlinked of inode 3 gets set to 22.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
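+A toy user-space model of the bucket walk in the AGI -> 7 -> 22 -> 3
+example (all structures and helpers are hypothetical stand-ins, not the
+kernel's): starting from the bucket head, every uncached inode is loaded
+and given a back-pointer, so opening inode 3 by handle leaves it pointing
+at 22.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#define NULLAGINO 0xffffffffu
+#define NINODES 64
+
+struct ino {
+	unsigned int agino;
+	unsigned int next_unlinked;
+	unsigned int prev_unlinked;
+};
+
+/* "Ondisk" next pointers for the example bucket: 7 -> 22 -> 3 -> NULL. */
+static unsigned int ondisk_next[NINODES] = {
+	[7] = 22, [22] = 3, [3] = NULLAGINO,
+};
+
+static struct ino *cache[NINODES];	/* toy incore cache, initially empty */
+
+static struct ino *iget(unsigned int agino)
+{
+	struct ino *ip = cache[agino];
+
+	if (!ip) {
+		ip = calloc(1, sizeof(*ip));
+		ip->agino = agino;
+		ip->next_unlinked = ondisk_next[agino];
+		cache[agino] = ip;
+	}
+	return ip;
+}
+
+/* Walk the bucket from its head, loading every inode and fixing prev. */
+static void reload_bucket(unsigned int head)
+{
+	unsigned int prev = NULLAGINO;
+	unsigned int next = head;
+
+	while (next != NULLAGINO) {
+		struct ino *ip = iget(next);
+
+		ip->prev_unlinked = prev;
+		prev = next;
+		next = ip->next_unlinked;
+	}
+}
+
+int main(void)
+{
+	reload_bucket(7);
+	/* Inode 3 now knows its predecessor is 22. */
+	printf("prev of 3: %u\n", cache[3]->prev_unlinked);
+	return 0;
+}
+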
+ fs/xfs/xfs_export.c | 6 +++
+ fs/xfs/xfs_inode.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_inode.h | 9 ++++
+ fs/xfs/xfs_itable.c | 9 ++++
+ fs/xfs/xfs_trace.h | 20 ++++++++++
+ 5 files changed, 144 insertions(+)
+
+--- a/fs/xfs/xfs_export.c
++++ b/fs/xfs/xfs_export.c
+@@ -146,6 +146,12 @@ xfs_nfs_get_inode(
+ return ERR_PTR(error);
+ }
+
++ error = xfs_inode_reload_unlinked(ip);
++ if (error) {
++ xfs_irele(ip);
++ return ERR_PTR(error);
++ }
++
+ if (VFS_I(ip)->i_generation != generation) {
+ xfs_irele(ip);
+ return ERR_PTR(-ESTALE);
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -3622,3 +3622,103 @@ xfs_iunlock2_io_mmap(
+ if (ip1 != ip2)
+ inode_unlock(VFS_I(ip1));
+ }
++
++/*
++ * Reload the incore inode list for this inode. Caller should ensure that
++ * the link count cannot change, either by taking ILOCK_SHARED or otherwise
++ * preventing other threads from executing.
++ */
++int
++xfs_inode_reload_unlinked_bucket(
++ struct xfs_trans *tp,
++ struct xfs_inode *ip)
++{
++ struct xfs_mount *mp = tp->t_mountp;
++ struct xfs_buf *agibp;
++ struct xfs_agi *agi;
++ struct xfs_perag *pag;
++ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
++ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
++ xfs_agino_t prev_agino, next_agino;
++ unsigned int bucket;
++ bool foundit = false;
++ int error;
++
++ /* Grab the first inode in the list */
++ pag = xfs_perag_get(mp, agno);
++ error = xfs_ialloc_read_agi(pag, tp, &agibp);
++ xfs_perag_put(pag);
++ if (error)
++ return error;
++
++ bucket = agino % XFS_AGI_UNLINKED_BUCKETS;
++ agi = agibp->b_addr;
++
++ trace_xfs_inode_reload_unlinked_bucket(ip);
++
++ xfs_info_ratelimited(mp,
++ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.",
++ agino, agno);
++
++ prev_agino = NULLAGINO;
++ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]);
++ while (next_agino != NULLAGINO) {
++ struct xfs_inode *next_ip = NULL;
++
++ if (next_agino == agino) {
++ /* Found this inode, set its backlink. */
++ next_ip = ip;
++ next_ip->i_prev_unlinked = prev_agino;
++ foundit = true;
++ }
++ if (!next_ip) {
++ /* Inode already in memory. */
++ next_ip = xfs_iunlink_lookup(pag, next_agino);
++ }
++ if (!next_ip) {
++ /* Inode not in memory, reload. */
++ error = xfs_iunlink_reload_next(tp, agibp, prev_agino,
++ next_agino);
++ if (error)
++ break;
++
++ next_ip = xfs_iunlink_lookup(pag, next_agino);
++ }
++ if (!next_ip) {
++ /* No incore inode at all? We reloaded it... */
++ ASSERT(next_ip != NULL);
++ error = -EFSCORRUPTED;
++ break;
++ }
++
++ prev_agino = next_agino;
++ next_agino = next_ip->i_next_unlinked;
++ }
++
++ xfs_trans_brelse(tp, agibp);
++ /* Should have found this inode somewhere in the iunlinked bucket. */
++ if (!error && !foundit)
++ error = -EFSCORRUPTED;
++ return error;
++}
++
++/* Decide if this inode is missing its unlinked list and reload it. */
++int
++xfs_inode_reload_unlinked(
++ struct xfs_inode *ip)
++{
++ struct xfs_trans *tp;
++ int error;
++
++ error = xfs_trans_alloc_empty(ip->i_mount, &tp);
++ if (error)
++ return error;
++
++ xfs_ilock(ip, XFS_ILOCK_SHARED);
++ if (xfs_inode_unlinked_incomplete(ip))
++ error = xfs_inode_reload_unlinked_bucket(tp, ip);
++ xfs_iunlock(ip, XFS_ILOCK_SHARED);
++ xfs_trans_cancel(tp);
++
++ return error;
++}
+--- a/fs/xfs/xfs_inode.h
++++ b/fs/xfs/xfs_inode.h
+@@ -593,4 +593,13 @@ void xfs_end_io(struct work_struct *work
+ int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+ void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2);
+
++static inline bool
++xfs_inode_unlinked_incomplete(
++ struct xfs_inode *ip)
++{
++ return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip);
++}
++int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
++int xfs_inode_reload_unlinked(struct xfs_inode *ip);
++
+ #endif /* __XFS_INODE_H__ */
+--- a/fs/xfs/xfs_itable.c
++++ b/fs/xfs/xfs_itable.c
+@@ -80,6 +80,15 @@ xfs_bulkstat_one_int(
+ if (error)
+ goto out;
+
++ if (xfs_inode_unlinked_incomplete(ip)) {
++ error = xfs_inode_reload_unlinked_bucket(tp, ip);
++ if (error) {
++ xfs_iunlock(ip, XFS_ILOCK_SHARED);
++ xfs_irele(ip);
++ return error;
++ }
++ }
++
+ ASSERT(ip != NULL);
+ ASSERT(ip->i_imap.im_blkno != 0);
+ inode = VFS_I(ip);
+--- a/fs/xfs/xfs_trace.h
++++ b/fs/xfs/xfs_trace.h
+@@ -3704,6 +3704,26 @@ TRACE_EVENT(xfs_iunlink_reload_next,
+ __entry->next_agino)
+ );
+
++TRACE_EVENT(xfs_inode_reload_unlinked_bucket,
++ TP_PROTO(struct xfs_inode *ip),
++ TP_ARGS(ip),
++ TP_STRUCT__entry(
++ __field(dev_t, dev)
++ __field(xfs_agnumber_t, agno)
++ __field(xfs_agino_t, agino)
++ ),
++ TP_fast_assign(
++ __entry->dev = ip->i_mount->m_super->s_dev;
++ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino);
++ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino);
++ ),
++ TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u",
++ MAJOR(__entry->dev), MINOR(__entry->dev),
++ __entry->agno,
++ __entry->agino,
++ __entry->agino % XFS_AGI_UNLINKED_BUCKETS)
++);
++
+ DECLARE_EVENT_CLASS(xfs_ag_inode_class,
+ TP_PROTO(struct xfs_inode *ip),
+ TP_ARGS(ip),
--- /dev/null
+From stable+bounces-77031-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:06 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:41 -0700
+Subject: xfs: remove WARN when dquot cache insertion fails
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner <dchinner@redhat.com>, syzbot+6ae213503fb12e87934f@syzkaller.appspotmail.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-17-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 4b827b3f305d1fcf837265f1e12acc22ee84327c ]
+
+It just creates unnecessary bot noise these days.
+
+Reported-by: syzbot+6ae213503fb12e87934f@syzkaller.appspotmail.com
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_dquot.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+--- a/fs/xfs/xfs_dquot.c
++++ b/fs/xfs/xfs_dquot.c
+@@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert(
+ error = radix_tree_insert(tree, id, dqp);
+ if (unlikely(error)) {
+ /* Duplicate found! Caller must try again. */
+- WARN_ON(error != -EEXIST);
+ mutex_unlock(&qi->qi_tree_lock);
+ trace_xfs_dqget_dup(dqp);
+ return error;
--- /dev/null
+From stable+bounces-77041-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:54 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:51 -0700
+Subject: xfs: set bnobt/cntbt numrecs correctly when formatting new AGs
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Dave Chinner <david@fromorbit.com>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-27-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 8e698ee72c4ecbbf18264568eb310875839fd601 ]
+
+Through generic/300, I discovered that mkfs.xfs creates corrupt
+filesystems when given these parameters:
+
+Filesystems formatted with --unsupported are not supported!!
+meta-data=/dev/sda isize=512 agcount=8, agsize=16352 blks
+ = sectsz=512 attr=2, projid32bit=1
+ = crc=1 finobt=1, sparse=1, rmapbt=1
+ = reflink=1 bigtime=1 inobtcount=1 nrext64=1
+data = bsize=4096 blocks=130816, imaxpct=25
+ = sunit=32 swidth=128 blks
+naming =version 2 bsize=4096 ascii-ci=0, ftype=1
+log =internal log bsize=4096 blocks=8192, version=2
+ = sectsz=512 sunit=32 blks, lazy-count=1
+realtime =none extsz=4096 blocks=0, rtextents=0
+ = rgcount=0 rgsize=0 blks
+Discarding blocks...Done.
+Phase 1 - find and verify superblock...
+ - reporting progress in intervals of 15 minutes
+Phase 2 - using internal log
+ - zero log...
+ - 16:30:50: zeroing log - 16320 of 16320 blocks done
+ - scan filesystem freespace and inode maps...
+agf_freeblks 25, counted 0 in ag 4
+sb_fdblocks 8823, counted 8798
+
+The root cause of this problem is the numrecs handling in
+xfs_freesp_init_recs, which is used to initialize a new AG. Prior to
+calling the function, we set up the new bnobt block with numrecs == 1
+and rely on _freesp_init_recs to format that new record. If the last
+record created has a blockcount of zero, then it sets numrecs = 0.
+
+That last bit isn't correct if the AG contains the log, the start of the
+log is not immediately after the initial blocks due to stripe alignment,
+and the end of the log is perfectly aligned with the end of the AG. For
+this case, we actually formatted a single bnobt record to handle the
+free space before the start of the (stripe aligned) log, and incremented
+arec to try to format a second record. That second record turned out to
+be unnecessary, so what we really want is to leave numrecs at 1.
+
+The numrecs handling itself is overly complicated because a different
+function sets numrecs == 1. Change the bnobt creation code to start
+with numrecs set to zero and only increment it after successfully
+formatting a free space extent into the btree block.
+
+Fixes: f327a00745ff ("xfs: account for log space when formatting new AGs")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Dave Chinner <david@fromorbit.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
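+A simplified user-space sketch of the counting rule (the record layout
+and parameters are invented for illustration; this is not the kernel's
+xfs_freesp_init_recs): numrecs starts at zero and is bumped only when a
+record is actually written, so a zero-length trailing record simply never
+gets counted.
+
+#include <stdio.h>
+
+struct rec {
+	unsigned int start;
+	unsigned int count;
+};
+
+struct block {
+	unsigned int numrecs;
+	struct rec recs[2];
+};
+
+/*
+ * Format the free-space records for a toy AG of @agsize blocks whose
+ * usable space starts at @first_free, with an optional internal log
+ * occupying [log_start, log_start + log_len).
+ */
+static void init_free_recs(struct block *blk, unsigned int first_free,
+			   unsigned int log_start, unsigned int log_len,
+			   unsigned int agsize)
+{
+	struct rec *r = &blk->recs[0];
+
+	blk->numrecs = 0;
+
+	if (log_len && log_start > first_free) {
+		/* Free space between the AG header and the aligned log. */
+		r->start = first_free;
+		r->count = log_start - first_free;
+		blk->numrecs++;
+		r++;
+		first_free = log_start + log_len;
+	} else if (log_len) {
+		first_free = log_start + log_len;
+	}
+
+	/* Free space after the log (or the whole AG if there is no log). */
+	r->start = first_free;
+	r->count = agsize - first_free;
+	if (r->count)
+		blk->numrecs++;
+}
+
+int main(void)
+{
+	struct block blk;
+
+	/* Log is stripe aligned and runs to the end of the AG: one record. */
+	init_free_recs(&blk, 64, 128, 16224, 16352);
+	printf("numrecs = %u\n", blk.numrecs);	/* prints 1 */
+	return 0;
+}
+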
+ fs/xfs/libxfs/xfs_ag.c | 19 +++++++++----------
+ 1 file changed, 9 insertions(+), 10 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_ag.c
++++ b/fs/xfs/libxfs/xfs_ag.c
+@@ -415,10 +415,12 @@ xfs_freesp_init_recs(
+ ASSERT(start >= mp->m_ag_prealloc_blocks);
+ if (start != mp->m_ag_prealloc_blocks) {
+ /*
+- * Modify first record to pad stripe align of log
++ * Modify first record to pad stripe align of log and
++ * bump the record count.
+ */
+ arec->ar_blockcount = cpu_to_be32(start -
+ mp->m_ag_prealloc_blocks);
++ be16_add_cpu(&block->bb_numrecs, 1);
+ nrec = arec + 1;
+
+ /*
+@@ -429,7 +431,6 @@ xfs_freesp_init_recs(
+ be32_to_cpu(arec->ar_startblock) +
+ be32_to_cpu(arec->ar_blockcount));
+ arec = nrec;
+- be16_add_cpu(&block->bb_numrecs, 1);
+ }
+ /*
+ * Change record start to after the internal log
+@@ -438,15 +439,13 @@ xfs_freesp_init_recs(
+ }
+
+ /*
+- * Calculate the record block count and check for the case where
+- * the log might have consumed all available space in the AG. If
+- * so, reset the record count to 0 to avoid exposure of an invalid
+- * record start block.
++ * Calculate the block count of this record; if it is nonzero,
++ * increment the record count.
+ */
+ arec->ar_blockcount = cpu_to_be32(id->agsize -
+ be32_to_cpu(arec->ar_startblock));
+- if (!arec->ar_blockcount)
+- block->bb_numrecs = 0;
++ if (arec->ar_blockcount)
++ be16_add_cpu(&block->bb_numrecs, 1);
+ }
+
+ /*
+@@ -458,7 +457,7 @@ xfs_bnoroot_init(
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+ {
+- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno);
++ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+ }
+
+@@ -468,7 +467,7 @@ xfs_cntroot_init(
+ struct xfs_buf *bp,
+ struct aghdr_init_data *id)
+ {
+- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno);
++ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno);
+ xfs_freesp_init_recs(mp, bp, id);
+ }
+
--- /dev/null
+From stable+bounces-77037-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:23 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Tue, 24 Sep 2024 11:38:47 -0700
+Subject: xfs: use i_prev_unlinked to distinguish inodes that are not on the unlinked list
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>, Chandan Babu R <chandanbabu@kernel.org>
+Message-ID: <20240924183851.1901667-23-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit f12b96683d6976a3a07fdf3323277c79dbe8f6ab ]
+
+Alter the definition of i_prev_unlinked slightly to make it more obvious
+when an inode with 0 link count is not part of the iunlink bucket lists
+rooted in the AGI. This distinction is necessary because it is not
+sufficient to check inode.i_nlink to decide if an inode is on the
+unlinked list. Updates to i_nlink can happen while holding only
+ILOCK_EXCL, but updates to an inode's position in the AGI unlinked list
+(which happen after the nlink update) require both ILOCK_EXCL and the
+AGI buffer lock.
+
+The next few patches will make it possible to reload an entire unlinked
+bucket list when we're walking the inode table or performing handle
+operations and need more than the ability to iget the last inode in the
+chain.
+
+The upcoming directory repair code also needs to be able to make this
+distinction to decide if a zero link count directory should be moved to
+the orphanage or allowed to inactivate. An upcoming enhancement to the
+online AGI fsck code will need this distinction to check and rebuild the
+AGI unlinked buckets.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
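+A tiny user-space sketch of the new encoding (hypothetical types; only
+the three-state convention is taken from the patch): 0 means the inode is
+on no unlinked list, NULLAGINO marks the first element of a bucket, and
+any other value is the previous inode's agino.
+
+#include <stdbool.h>
+#include <stdio.h>
+
+#define NULLAGINO 0xffffffffu
+
+struct ino {
+	unsigned int prev_unlinked;
+};
+
+/* 0 means "not on any unlinked list" regardless of the link count. */
+static bool on_unlinked_list(const struct ino *ip)
+{
+	return ip->prev_unlinked != 0;
+}
+
+/* NULLAGINO marks the first element of a bucket; anything else is a prev. */
+static bool is_bucket_head(const struct ino *ip)
+{
+	return ip->prev_unlinked == NULLAGINO;
+}
+
+int main(void)
+{
+	struct ino detached = { .prev_unlinked = 0 };
+	struct ino head     = { .prev_unlinked = NULLAGINO };
+	struct ino middle   = { .prev_unlinked = 22 };
+
+	printf("%d %d %d\n", on_unlinked_list(&detached),
+	       is_bucket_head(&head), on_unlinked_list(&middle));
+	return 0;
+}
+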
+ fs/xfs/xfs_icache.c | 2 +-
+ fs/xfs/xfs_inode.c | 3 ++-
+ fs/xfs/xfs_inode.h | 20 +++++++++++++++++++-
+ 3 files changed, 22 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -113,7 +113,7 @@ xfs_inode_alloc(
+ INIT_LIST_HEAD(&ip->i_ioend_list);
+ spin_lock_init(&ip->i_ioend_lock);
+ ip->i_next_unlinked = NULLAGINO;
+- ip->i_prev_unlinked = NULLAGINO;
++ ip->i_prev_unlinked = 0;
+
+ return ip;
+ }
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2015,6 +2015,7 @@ xfs_iunlink_insert_inode(
+ }
+
+ /* Point the head of the list to point to this inode. */
++ ip->i_prev_unlinked = NULLAGINO;
+ return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino);
+ }
+
+@@ -2117,7 +2118,7 @@ xfs_iunlink_remove_inode(
+ }
+
+ ip->i_next_unlinked = NULLAGINO;
+- ip->i_prev_unlinked = NULLAGINO;
++ ip->i_prev_unlinked = 0;
+ return error;
+ }
+
+--- a/fs/xfs/xfs_inode.h
++++ b/fs/xfs/xfs_inode.h
+@@ -68,8 +68,21 @@ typedef struct xfs_inode {
+ uint64_t i_diflags2; /* XFS_DIFLAG2_... */
+ struct timespec64 i_crtime; /* time created */
+
+- /* unlinked list pointers */
++ /*
++ * Unlinked list pointers. These point to the next and previous inodes
++ * in the AGI unlinked bucket list, respectively. These fields can
++ * only be updated with the AGI locked.
++ *
++ * i_next_unlinked caches di_next_unlinked.
++ */
+ xfs_agino_t i_next_unlinked;
++
++ /*
++ * If the inode is not on an unlinked list, this field is zero. If the
++ * inode is the first element in an unlinked list, this field is
++ * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode
++ * in the unlinked list.
++ */
+ xfs_agino_t i_prev_unlinked;
+
+ /* VFS inode */
+@@ -81,6 +94,11 @@ typedef struct xfs_inode {
+ struct list_head i_ioend_list;
+ } xfs_inode_t;
+
++static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip)
++{
++ return ip->i_prev_unlinked != 0;
++}
++
+ static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip)
+ {
+ return ip->i_forkoff > 0;