From: Greg Kroah-Hartman Date: Fri, 27 Sep 2024 06:58:13 +0000 (+0200) Subject: 6.1-stable patches X-Git-Tag: v6.1.112~33 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7fca5e77d59f6888daccb5e8126be870f11ae98b;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: netfilter-nft_socket-fix-a-null-vs-is_err-bug-in-nft_socket_cgroup_subtree_level.patch netfilter-nft_socket-make-cgroupsv2-matching-work-with-namespaces.patch xfs-block-reservation-too-large-for-minleft-allocation.patch xfs-buffer-pins-need-to-hold-a-buffer-reference.patch xfs-collect-errors-from-inodegc-for-unlinked-inode-recovery.patch xfs-correct-calculation-for-agend-and-blockcount.patch xfs-defered-work-could-create-precommits.patch xfs-don-t-use-bmbt-btree-split-workers-for-io-completion.patch xfs-dquot-shrinker-doesn-t-check-for-xfs_dqflag_freeing.patch xfs-fix-ag-count-overflow-during-growfs.patch xfs-fix-agf-vs-inode-cluster-buffer-deadlock.patch xfs-fix-bug_on-in-xfs_getbmap.patch xfs-fix-deadlock-on-xfs_inodegc_worker.patch xfs-fix-extent-busy-updating.patch xfs-fix-low-space-alloc-deadlock.patch xfs-fix-negative-array-access-in-xfs_getbmap.patch xfs-fix-reloading-entire-unlinked-bucket-lists.patch xfs-fix-the-calculation-for-end-and-length.patch xfs-fix-uninitialized-variable-access.patch xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch xfs-journal-geometry-is-not-properly-bounds-checked.patch xfs-load-uncached-unlinked-inodes-into-memory-on-demand.patch xfs-make-inode-unlinked-bucket-recovery-work-with-quotacheck.patch xfs-prefer-free-inodes-at-enospc-over-chunk-allocation.patch xfs-quotacheck-failure-can-race-with-background-inode-inactivation.patch xfs-reload-entire-unlinked-bucket-lists.patch xfs-remove-warn-when-dquot-cache-insertion-fails.patch xfs-set-bnobt-cntbt-numrecs-correctly-when-formatting-new-ags.patch xfs-use-i_prev_unlinked-to-distinguish-inodes-that-are-not-on-the-unlinked-list.patch --- diff --git a/queue-6.1/netfilter-nft_socket-fix-a-null-vs-is_err-bug-in-nft_socket_cgroup_subtree_level.patch b/queue-6.1/netfilter-nft_socket-fix-a-null-vs-is_err-bug-in-nft_socket_cgroup_subtree_level.patch new file mode 100644 index 00000000000..63861d997c1 --- /dev/null +++ b/queue-6.1/netfilter-nft_socket-fix-a-null-vs-is_err-bug-in-nft_socket_cgroup_subtree_level.patch @@ -0,0 +1,36 @@ +From 7052622fccb1efb850c6b55de477f65d03525a30 Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Sat, 14 Sep 2024 12:56:51 +0300 +Subject: netfilter: nft_socket: Fix a NULL vs IS_ERR() bug in nft_socket_cgroup_subtree_level() + +From: Dan Carpenter + +commit 7052622fccb1efb850c6b55de477f65d03525a30 upstream. + +The cgroup_get_from_path() function never returns NULL, it returns error +pointers. Update the error handling to match. + +Fixes: 7f3287db6543 ("netfilter: nft_socket: make cgroupsv2 matching work with namespaces") +Signed-off-by: Dan Carpenter +Acked-by: Florian Westphal +Acked-by: Pablo Neira Ayuso +Link: https://patch.msgid.link/bbc0c4e0-05cc-4f44-8797-2f4b3920a820@stanley.mountain +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/netfilter/nft_socket.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/netfilter/nft_socket.c ++++ b/net/netfilter/nft_socket.c +@@ -61,8 +61,8 @@ static noinline int nft_socket_cgroup_su + struct cgroup *cgrp = cgroup_get_from_path("/"); + int level; + +- if (!cgrp) +- return -ENOENT; ++ if (IS_ERR(cgrp)) ++ return PTR_ERR(cgrp); + + level = cgrp->level; + diff --git a/queue-6.1/netfilter-nft_socket-make-cgroupsv2-matching-work-with-namespaces.patch b/queue-6.1/netfilter-nft_socket-make-cgroupsv2-matching-work-with-namespaces.patch new file mode 100644 index 00000000000..75090b07c53 --- /dev/null +++ b/queue-6.1/netfilter-nft_socket-make-cgroupsv2-matching-work-with-namespaces.patch @@ -0,0 +1,144 @@ +From 7f3287db654395f9c5ddd246325ff7889f550286 Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Sat, 7 Sep 2024 16:07:49 +0200 +Subject: netfilter: nft_socket: make cgroupsv2 matching work with namespaces + +From: Florian Westphal + +commit 7f3287db654395f9c5ddd246325ff7889f550286 upstream. + +When running in container environmment, /sys/fs/cgroup/ might not be +the real root node of the sk-attached cgroup. + +Example: + +In container: +% stat /sys//fs/cgroup/ +Device: 0,21 Inode: 2214 .. +% stat /sys/fs/cgroup/foo +Device: 0,21 Inode: 2264 .. + +The expectation would be for: + + nft add rule .. socket cgroupv2 level 1 "foo" counter + +to match traffic from a process that got added to "foo" via +"echo $pid > /sys/fs/cgroup/foo/cgroup.procs". + +However, 'level 3' is needed to make this work. + +Seen from initial namespace, the complete hierarchy is: + +% stat /sys/fs/cgroup/system.slice/docker-.../foo + Device: 0,21 Inode: 2264 .. + +i.e. hierarchy is +0 1 2 3 +/ -> system.slice -> docker-1... -> foo + +... but the container doesn't know that its "/" is the "docker-1.." +cgroup. Current code will retrieve the 'system.slice' cgroup node +and store its kn->id in the destination register, so compare with +2264 ("foo" cgroup id) will not match. + +Fetch "/" cgroup from ->init() and add its level to the level we try to +extract. cgroup root-level is 0 for the init-namespace or the level +of the ancestor that is exposed as the cgroup root inside the container. + +In the above case, cgrp->level of "/" resolved in the container is 2 +(docker-1...scope/) and request for 'level 1' will get adjusted +to fetch the actual level (3). + +v2: use CONFIG_SOCK_CGROUP_DATA, eval function depends on it. + (kernel test robot) + +Cc: cgroups@vger.kernel.org +Fixes: e0bb96db96f8 ("netfilter: nft_socket: add support for cgroupsv2") +Reported-by: Nadia Pinaeva +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman +--- + net/netfilter/nft_socket.c | 41 ++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 38 insertions(+), 3 deletions(-) + +--- a/net/netfilter/nft_socket.c ++++ b/net/netfilter/nft_socket.c +@@ -9,7 +9,8 @@ + + struct nft_socket { + enum nft_socket_keys key:8; +- u8 level; ++ u8 level; /* cgroupv2 level to extract */ ++ u8 level_user; /* cgroupv2 level provided by userspace */ + u8 len; + union { + u8 dreg; +@@ -53,6 +54,28 @@ nft_sock_get_eval_cgroupv2(u32 *dest, st + memcpy(dest, &cgid, sizeof(u64)); + return true; + } ++ ++/* process context only, uses current->nsproxy. */ ++static noinline int nft_socket_cgroup_subtree_level(void) ++{ ++ struct cgroup *cgrp = cgroup_get_from_path("/"); ++ int level; ++ ++ if (!cgrp) ++ return -ENOENT; ++ ++ level = cgrp->level; ++ ++ cgroup_put(cgrp); ++ ++ if (WARN_ON_ONCE(level > 255)) ++ return -ERANGE; ++ ++ if (WARN_ON_ONCE(level < 0)) ++ return -EINVAL; ++ ++ return level; ++} + #endif + + static struct sock *nft_socket_do_lookup(const struct nft_pktinfo *pkt) +@@ -174,9 +197,10 @@ static int nft_socket_init(const struct + case NFT_SOCKET_MARK: + len = sizeof(u32); + break; +-#ifdef CONFIG_CGROUPS ++#ifdef CONFIG_SOCK_CGROUP_DATA + case NFT_SOCKET_CGROUPV2: { + unsigned int level; ++ int err; + + if (!tb[NFTA_SOCKET_LEVEL]) + return -EINVAL; +@@ -185,6 +209,17 @@ static int nft_socket_init(const struct + if (level > 255) + return -EOPNOTSUPP; + ++ err = nft_socket_cgroup_subtree_level(); ++ if (err < 0) ++ return err; ++ ++ priv->level_user = level; ++ ++ level += err; ++ /* Implies a giant cgroup tree */ ++ if (WARN_ON_ONCE(level > 255)) ++ return -EOPNOTSUPP; ++ + priv->level = level; + len = sizeof(u64); + break; +@@ -209,7 +244,7 @@ static int nft_socket_dump(struct sk_buf + if (nft_dump_register(skb, NFTA_SOCKET_DREG, priv->dreg)) + return -1; + if (priv->key == NFT_SOCKET_CGROUPV2 && +- nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level))) ++ nla_put_be32(skb, NFTA_SOCKET_LEVEL, htonl(priv->level_user))) + return -1; + return 0; + } diff --git a/queue-6.1/series b/queue-6.1/series index 1d30fd1648f..aa5be7ab593 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -29,3 +29,32 @@ block-fix-where-bio-io-priority-gets-set.patch spi-spidev-add-missing-spi_device_id-for-jg10309-01.patch ocfs2-add-bounds-checking-to-ocfs2_xattr_find_entry.patch ocfs2-strict-bound-check-before-memcmp-in-ocfs2_xatt.patch +xfs-dquot-shrinker-doesn-t-check-for-xfs_dqflag_freeing.patch +xfs-fix-deadlock-on-xfs_inodegc_worker.patch +xfs-fix-extent-busy-updating.patch +xfs-don-t-use-bmbt-btree-split-workers-for-io-completion.patch +xfs-fix-low-space-alloc-deadlock.patch +xfs-prefer-free-inodes-at-enospc-over-chunk-allocation.patch +xfs-block-reservation-too-large-for-minleft-allocation.patch +xfs-fix-uninitialized-variable-access.patch +xfs-quotacheck-failure-can-race-with-background-inode-inactivation.patch +xfs-fix-bug_on-in-xfs_getbmap.patch +xfs-buffer-pins-need-to-hold-a-buffer-reference.patch +xfs-defered-work-could-create-precommits.patch +xfs-fix-agf-vs-inode-cluster-buffer-deadlock.patch +xfs-collect-errors-from-inodegc-for-unlinked-inode-recovery.patch +xfs-fix-ag-count-overflow-during-growfs.patch +xfs-remove-warn-when-dquot-cache-insertion-fails.patch +xfs-fix-the-calculation-for-end-and-length.patch +xfs-load-uncached-unlinked-inodes-into-memory-on-demand.patch +xfs-fix-negative-array-access-in-xfs_getbmap.patch +xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch +xfs-correct-calculation-for-agend-and-blockcount.patch +xfs-use-i_prev_unlinked-to-distinguish-inodes-that-are-not-on-the-unlinked-list.patch +xfs-reload-entire-unlinked-bucket-lists.patch +xfs-make-inode-unlinked-bucket-recovery-work-with-quotacheck.patch +xfs-fix-reloading-entire-unlinked-bucket-lists.patch +xfs-set-bnobt-cntbt-numrecs-correctly-when-formatting-new-ags.patch +xfs-journal-geometry-is-not-properly-bounds-checked.patch +netfilter-nft_socket-make-cgroupsv2-matching-work-with-namespaces.patch +netfilter-nft_socket-fix-a-null-vs-is_err-bug-in-nft_socket_cgroup_subtree_level.patch diff --git a/queue-6.1/xfs-block-reservation-too-large-for-minleft-allocation.patch b/queue-6.1/xfs-block-reservation-too-large-for-minleft-allocation.patch new file mode 100644 index 00000000000..4acc4bd6a2d --- /dev/null +++ b/queue-6.1/xfs-block-reservation-too-large-for-minleft-allocation.patch @@ -0,0 +1,100 @@ +From stable+bounces-77022-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:43 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:32 -0700 +Subject: xfs: block reservation too large for minleft allocation +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Allison Henderson , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-8-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit d5753847b216db0e553e8065aa825cfe497ad143 ] + +When we enter xfs_bmbt_alloc_block() without having first allocated +a data extent (i.e. tp->t_firstblock == NULLFSBLOCK) because we +are doing something like unwritten extent conversion, the transaction +block reservation is used as the minleft value. + +This works for operations like unwritten extent conversion, but it +assumes that the block reservation is only for a BMBT split. THis is +not always true, and sometimes results in larger than necessary +minleft values being set. We only actually need enough space for a +btree split, something we already handle correctly in +xfs_bmapi_write() via the xfs_bmapi_minleft() calculation. + +We should use xfs_bmapi_minleft() in xfs_bmbt_alloc_block() to +calculate the number of blocks a BMBT split on this inode is going to +require, not use the transaction block reservation that contains the +maximum number of blocks this transaction may consume in it... + +Signed-off-by: Dave Chinner +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 2 +- + fs/xfs/libxfs/xfs_bmap.h | 2 ++ + fs/xfs/libxfs/xfs_bmap_btree.c | 19 +++++++++---------- + 3 files changed, 12 insertions(+), 11 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4242,7 +4242,7 @@ xfs_bmapi_convert_unwritten( + return 0; + } + +-static inline xfs_extlen_t ++xfs_extlen_t + xfs_bmapi_minleft( + struct xfs_trans *tp, + struct xfs_inode *ip, +--- a/fs/xfs/libxfs/xfs_bmap.h ++++ b/fs/xfs/libxfs/xfs_bmap.h +@@ -220,6 +220,8 @@ int xfs_bmap_add_extent_unwritten_real(s + struct xfs_inode *ip, int whichfork, + struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, int *logflagsp); ++xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, ++ int fork); + + enum xfs_bmap_intent_type { + XFS_BMAP_MAP = 1, +--- a/fs/xfs/libxfs/xfs_bmap_btree.c ++++ b/fs/xfs/libxfs/xfs_bmap_btree.c +@@ -213,18 +213,16 @@ xfs_bmbt_alloc_block( + if (args.fsbno == NULLFSBLOCK) { + args.fsbno = be64_to_cpu(start->l); + args.type = XFS_ALLOCTYPE_START_BNO; ++ + /* +- * Make sure there is sufficient room left in the AG to +- * complete a full tree split for an extent insert. If +- * we are converting the middle part of an extent then +- * we may need space for two tree splits. +- * +- * We are relying on the caller to make the correct block +- * reservation for this operation to succeed. If the +- * reservation amount is insufficient then we may fail a +- * block allocation here and corrupt the filesystem. ++ * If we are coming here from something like unwritten extent ++ * conversion, there has been no data extent allocation already ++ * done, so we have to ensure that we attempt to locate the ++ * entire set of bmbt allocations in the same AG, as ++ * xfs_bmapi_write() would have reserved. + */ +- args.minleft = args.tp->t_blk_res; ++ args.minleft = xfs_bmapi_minleft(cur->bc_tp, cur->bc_ino.ip, ++ cur->bc_ino.whichfork); + } else if (cur->bc_tp->t_flags & XFS_TRANS_LOWMODE) { + args.type = XFS_ALLOCTYPE_START_BNO; + } else { +@@ -248,6 +246,7 @@ xfs_bmbt_alloc_block( + * successful activate the lowspace algorithm. + */ + args.fsbno = 0; ++ args.minleft = 0; + args.type = XFS_ALLOCTYPE_FIRST_AG; + error = xfs_alloc_vextent(&args); + if (error) diff --git a/queue-6.1/xfs-buffer-pins-need-to-hold-a-buffer-reference.patch b/queue-6.1/xfs-buffer-pins-need-to-hold-a-buffer-reference.patch new file mode 100644 index 00000000000..d3bf3283262 --- /dev/null +++ b/queue-6.1/xfs-buffer-pins-need-to-hold-a-buffer-reference.patch @@ -0,0 +1,184 @@ +From stable+bounces-77026-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:53 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:36 -0700 +Subject: xfs: buffer pins need to hold a buffer reference +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , yangerkun , "Darrick J. Wong" , Christoph Hellwig , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-12-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 89a4bf0dc3857569a77061d3d5ea2ac85f7e13c6 ] + +When a buffer is unpinned by xfs_buf_item_unpin(), we need to access +the buffer after we've dropped the buffer log item reference count. +This opens a window where we can have two racing unpins for the +buffer item (e.g. shutdown checkpoint context callback processing +racing with journal IO iclog completion processing) and both attempt +to access the buffer after dropping the BLI reference count. If we +are unlucky, the "BLI freed" context wins the race and frees the +buffer before the "BLI still active" case checks the buffer pin +count. + +This results in a use after free that can only be triggered +in active filesystem shutdown situations. + +To fix this, we need to ensure that buffer existence extends beyond +the BLI reference count checks and until the unpin processing is +complete. This implies that a buffer pin operation must also take a +buffer reference to ensure that the buffer cannot be freed until the +buffer unpin processing is complete. + +Reported-by: yangerkun +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 88 ++++++++++++++++++++++++++++++++++++-------------- + 1 file changed, 65 insertions(+), 23 deletions(-) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -452,10 +452,18 @@ xfs_buf_item_format( + * This is called to pin the buffer associated with the buf log item in memory + * so it cannot be written out. + * +- * We also always take a reference to the buffer log item here so that the bli +- * is held while the item is pinned in memory. This means that we can +- * unconditionally drop the reference count a transaction holds when the +- * transaction is completed. ++ * We take a reference to the buffer log item here so that the BLI life cycle ++ * extends at least until the buffer is unpinned via xfs_buf_item_unpin() and ++ * inserted into the AIL. ++ * ++ * We also need to take a reference to the buffer itself as the BLI unpin ++ * processing requires accessing the buffer after the BLI has dropped the final ++ * BLI reference. See xfs_buf_item_unpin() for an explanation. ++ * If unpins race to drop the final BLI reference and only the ++ * BLI owns a reference to the buffer, then the loser of the race can have the ++ * buffer fgreed from under it (e.g. on shutdown). Taking a buffer reference per ++ * pin count ensures the life cycle of the buffer extends for as ++ * long as we hold the buffer pin reference in xfs_buf_item_unpin(). + */ + STATIC void + xfs_buf_item_pin( +@@ -470,13 +478,30 @@ xfs_buf_item_pin( + + trace_xfs_buf_item_pin(bip); + ++ xfs_buf_hold(bip->bli_buf); + atomic_inc(&bip->bli_refcount); + atomic_inc(&bip->bli_buf->b_pin_count); + } + + /* +- * This is called to unpin the buffer associated with the buf log item which +- * was previously pinned with a call to xfs_buf_item_pin(). ++ * This is called to unpin the buffer associated with the buf log item which was ++ * previously pinned with a call to xfs_buf_item_pin(). We enter this function ++ * with a buffer pin count, a buffer reference and a BLI reference. ++ * ++ * We must drop the BLI reference before we unpin the buffer because the AIL ++ * doesn't acquire a BLI reference whenever it accesses it. Therefore if the ++ * refcount drops to zero, the bli could still be AIL resident and the buffer ++ * submitted for I/O at any point before we return. This can result in IO ++ * completion freeing the buffer while we are still trying to access it here. ++ * This race condition can also occur in shutdown situations where we abort and ++ * unpin buffers from contexts other that journal IO completion. ++ * ++ * Hence we have to hold a buffer reference per pin count to ensure that the ++ * buffer cannot be freed until we have finished processing the unpin operation. ++ * The reference is taken in xfs_buf_item_pin(), and we must hold it until we ++ * are done processing the buffer state. In the case of an abort (remove = ++ * true) then we re-use the current pin reference as the IO reference we hand ++ * off to IO failure handling. + */ + STATIC void + xfs_buf_item_unpin( +@@ -493,24 +518,18 @@ xfs_buf_item_unpin( + + trace_xfs_buf_item_unpin(bip); + +- /* +- * Drop the bli ref associated with the pin and grab the hold required +- * for the I/O simulation failure in the abort case. We have to do this +- * before the pin count drops because the AIL doesn't acquire a bli +- * reference. Therefore if the refcount drops to zero, the bli could +- * still be AIL resident and the buffer submitted for I/O (and freed on +- * completion) at any point before we return. This can be removed once +- * the AIL properly holds a reference on the bli. +- */ + freed = atomic_dec_and_test(&bip->bli_refcount); +- if (freed && !stale && remove) +- xfs_buf_hold(bp); + if (atomic_dec_and_test(&bp->b_pin_count)) + wake_up_all(&bp->b_waiters); + +- /* nothing to do but drop the pin count if the bli is active */ +- if (!freed) ++ /* ++ * Nothing to do but drop the buffer pin reference if the BLI is ++ * still active. ++ */ ++ if (!freed) { ++ xfs_buf_rele(bp); + return; ++ } + + if (stale) { + ASSERT(bip->bli_flags & XFS_BLI_STALE); +@@ -523,6 +542,15 @@ xfs_buf_item_unpin( + trace_xfs_buf_item_unpin_stale(bip); + + /* ++ * The buffer has been locked and referenced since it was marked ++ * stale so we own both lock and reference exclusively here. We ++ * do not need the pin reference any more, so drop it now so ++ * that we only have one reference to drop once item completion ++ * processing is complete. ++ */ ++ xfs_buf_rele(bp); ++ ++ /* + * If we get called here because of an IO error, we may or may + * not have the item on the AIL. xfs_trans_ail_delete() will + * take care of that situation. xfs_trans_ail_delete() drops +@@ -538,16 +566,30 @@ xfs_buf_item_unpin( + ASSERT(bp->b_log_item == NULL); + } + xfs_buf_relse(bp); +- } else if (remove) { ++ return; ++ } ++ ++ if (remove) { + /* +- * The buffer must be locked and held by the caller to simulate +- * an async I/O failure. We acquired the hold for this case +- * before the buffer was unpinned. ++ * We need to simulate an async IO failures here to ensure that ++ * the correct error completion is run on this buffer. This ++ * requires a reference to the buffer and for the buffer to be ++ * locked. We can safely pass ownership of the pin reference to ++ * the IO to ensure that nothing can free the buffer while we ++ * wait for the lock and then run the IO failure completion. + */ + xfs_buf_lock(bp); + bp->b_flags |= XBF_ASYNC; + xfs_buf_ioend_fail(bp); ++ return; + } ++ ++ /* ++ * BLI has no more active references - it will be moved to the AIL to ++ * manage the remaining BLI/buffer life cycle. There is nothing left for ++ * us to do here so drop the pin reference to the buffer. ++ */ ++ xfs_buf_rele(bp); + } + + STATIC uint diff --git a/queue-6.1/xfs-collect-errors-from-inodegc-for-unlinked-inode-recovery.patch b/queue-6.1/xfs-collect-errors-from-inodegc-for-unlinked-inode-recovery.patch new file mode 100644 index 00000000000..a4e4bbad614 --- /dev/null +++ b/queue-6.1/xfs-collect-errors-from-inodegc-for-unlinked-inode-recovery.patch @@ -0,0 +1,340 @@ +From stable+bounces-77029-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:01 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:39 -0700 +Subject: xfs: collect errors from inodegc for unlinked inode recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , "Darrick J. Wong" , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-15-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit d4d12c02bf5f768f1b423c7ae2909c5afdfe0d5f ] + +Unlinked list recovery requires errors removing the inode the from +the unlinked list get fed back to the main recovery loop. Now that +we offload the unlinking to the inodegc work, we don't get errors +being fed back when we trip over a corruption that prevents the +inode from being removed from the unlinked list. + +This means we never clear the corrupt unlinked list bucket, +resulting in runtime operations eventually tripping over it and +shutting down. + +Fix this by collecting inodegc worker errors and feed them +back to the flush caller. This is largely best effort - the only +context that really cares is log recovery, and it only flushes a +single inode at a time so we don't need complex synchronised +handling. Essentially the inodegc workers will capture the first +error that occurs and the next flush will gather them and clear +them. The flush itself will only report the first gathered error. + +In the cases where callers can return errors, propagate the +collected inodegc flush error up the error handling chain. + +In the case of inode unlinked list recovery, there are several +superfluous calls to flush queued unlinked inodes - +xlog_recover_iunlink_bucket() guarantees that it has flushed the +inodegc and collected errors before it returns. Hence nothing in the +calling path needs to run a flush, even when an error is returned. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 46 +++++++++++++++++++++++++++++++++++++--------- + fs/xfs/xfs_icache.h | 4 ++-- + fs/xfs/xfs_inode.c | 20 ++++++-------------- + fs/xfs/xfs_inode.h | 2 +- + fs/xfs/xfs_log_recover.c | 19 +++++++++---------- + fs/xfs/xfs_mount.h | 1 + + fs/xfs/xfs_super.c | 1 + + fs/xfs/xfs_trans.c | 4 +++- + 8 files changed, 60 insertions(+), 37 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -454,6 +454,27 @@ xfs_inodegc_queue_all( + return ret; + } + ++/* Wait for all queued work and collect errors */ ++static int ++xfs_inodegc_wait_all( ++ struct xfs_mount *mp) ++{ ++ int cpu; ++ int error = 0; ++ ++ flush_workqueue(mp->m_inodegc_wq); ++ for_each_online_cpu(cpu) { ++ struct xfs_inodegc *gc; ++ ++ gc = per_cpu_ptr(mp->m_inodegc, cpu); ++ if (gc->error && !error) ++ error = gc->error; ++ gc->error = 0; ++ } ++ ++ return error; ++} ++ + /* + * Check the validity of the inode we just found it the cache + */ +@@ -1490,15 +1511,14 @@ xfs_blockgc_free_space( + if (error) + return error; + +- xfs_inodegc_flush(mp); +- return 0; ++ return xfs_inodegc_flush(mp); + } + + /* + * Reclaim all the free space that we can by scheduling the background blockgc + * and inodegc workers immediately and waiting for them all to clear. + */ +-void ++int + xfs_blockgc_flush_all( + struct xfs_mount *mp) + { +@@ -1519,7 +1539,7 @@ xfs_blockgc_flush_all( + for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG) + flush_delayed_work(&pag->pag_blockgc_work); + +- xfs_inodegc_flush(mp); ++ return xfs_inodegc_flush(mp); + } + + /* +@@ -1841,13 +1861,17 @@ xfs_inodegc_set_reclaimable( + * This is the last chance to make changes to an otherwise unreferenced file + * before incore reclamation happens. + */ +-static void ++static int + xfs_inodegc_inactivate( + struct xfs_inode *ip) + { ++ int error; ++ + trace_xfs_inode_inactivating(ip); +- xfs_inactive(ip); ++ error = xfs_inactive(ip); + xfs_inodegc_set_reclaimable(ip); ++ return error; ++ + } + + void +@@ -1879,8 +1903,12 @@ xfs_inodegc_worker( + + WRITE_ONCE(gc->shrinker_hits, 0); + llist_for_each_entry_safe(ip, n, node, i_gclist) { ++ int error; ++ + xfs_iflags_set(ip, XFS_INACTIVATING); +- xfs_inodegc_inactivate(ip); ++ error = xfs_inodegc_inactivate(ip); ++ if (error && !gc->error) ++ gc->error = error; + } + + memalloc_nofs_restore(nofs_flag); +@@ -1904,13 +1932,13 @@ xfs_inodegc_push( + * Force all currently queued inode inactivation work to run immediately and + * wait for the work to finish. + */ +-void ++int + xfs_inodegc_flush( + struct xfs_mount *mp) + { + xfs_inodegc_push(mp); + trace_xfs_inodegc_flush(mp, __return_address); +- flush_workqueue(mp->m_inodegc_wq); ++ return xfs_inodegc_wait_all(mp); + } + + /* +--- a/fs/xfs/xfs_icache.h ++++ b/fs/xfs/xfs_icache.h +@@ -59,7 +59,7 @@ int xfs_blockgc_free_dquots(struct xfs_m + unsigned int iwalk_flags); + int xfs_blockgc_free_quota(struct xfs_inode *ip, unsigned int iwalk_flags); + int xfs_blockgc_free_space(struct xfs_mount *mp, struct xfs_icwalk *icm); +-void xfs_blockgc_flush_all(struct xfs_mount *mp); ++int xfs_blockgc_flush_all(struct xfs_mount *mp); + + void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); + void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); +@@ -77,7 +77,7 @@ void xfs_blockgc_start(struct xfs_mount + + void xfs_inodegc_worker(struct work_struct *work); + void xfs_inodegc_push(struct xfs_mount *mp); +-void xfs_inodegc_flush(struct xfs_mount *mp); ++int xfs_inodegc_flush(struct xfs_mount *mp); + void xfs_inodegc_stop(struct xfs_mount *mp); + void xfs_inodegc_start(struct xfs_mount *mp); + void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1620,16 +1620,7 @@ xfs_inactive_ifree( + */ + xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1); + +- /* +- * Just ignore errors at this point. There is nothing we can do except +- * to try to keep going. Make sure it's not a silent error. +- */ +- error = xfs_trans_commit(tp); +- if (error) +- xfs_notice(mp, "%s: xfs_trans_commit returned error %d", +- __func__, error); +- +- return 0; ++ return xfs_trans_commit(tp); + } + + /* +@@ -1696,12 +1687,12 @@ xfs_inode_needs_inactive( + * now be truncated. Also, we clear all of the read-ahead state + * kept for the inode here since the file is now closed. + */ +-void ++int + xfs_inactive( + xfs_inode_t *ip) + { + struct xfs_mount *mp; +- int error; ++ int error = 0; + int truncate = 0; + + /* +@@ -1742,7 +1733,7 @@ xfs_inactive( + * reference to the inode at this point anyways. + */ + if (xfs_can_free_eofblocks(ip, true)) +- xfs_free_eofblocks(ip); ++ error = xfs_free_eofblocks(ip); + + goto out; + } +@@ -1779,7 +1770,7 @@ xfs_inactive( + /* + * Free the inode. + */ +- xfs_inactive_ifree(ip); ++ error = xfs_inactive_ifree(ip); + + out: + /* +@@ -1787,6 +1778,7 @@ out: + * the attached dquots. + */ + xfs_qm_dqdetach(ip); ++ return error; + } + + /* +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -470,7 +470,7 @@ enum layout_break_reason { + (xfs_has_grpid((pip)->i_mount) || (VFS_I(pip)->i_mode & S_ISGID)) + + int xfs_release(struct xfs_inode *ip); +-void xfs_inactive(struct xfs_inode *ip); ++int xfs_inactive(struct xfs_inode *ip); + int xfs_lookup(struct xfs_inode *dp, const struct xfs_name *name, + struct xfs_inode **ipp, struct xfs_name *ci_name); + int xfs_create(struct user_namespace *mnt_userns, +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -2711,7 +2711,9 @@ xlog_recover_iunlink_bucket( + * just to flush the inodegc queue and wait for it to + * complete. + */ +- xfs_inodegc_flush(mp); ++ error = xfs_inodegc_flush(mp); ++ if (error) ++ break; + } + + prev_agino = agino; +@@ -2719,10 +2721,15 @@ xlog_recover_iunlink_bucket( + } + + if (prev_ip) { ++ int error2; ++ + ip->i_prev_unlinked = prev_agino; + xfs_irele(prev_ip); ++ ++ error2 = xfs_inodegc_flush(mp); ++ if (error2 && !error) ++ return error2; + } +- xfs_inodegc_flush(mp); + return error; + } + +@@ -2789,7 +2796,6 @@ xlog_recover_iunlink_ag( + * bucket and remaining inodes on it unreferenced and + * unfreeable. + */ +- xfs_inodegc_flush(pag->pag_mount); + xlog_recover_clear_agi_bucket(pag, bucket); + } + } +@@ -2806,13 +2812,6 @@ xlog_recover_process_iunlinks( + + for_each_perag(log->l_mp, agno, pag) + xlog_recover_iunlink_ag(pag); +- +- /* +- * Flush the pending unlinked inodes to ensure that the inactivations +- * are fully completed on disk and the incore inodes can be reclaimed +- * before we signal that recovery is complete. +- */ +- xfs_inodegc_flush(log->l_mp); + } + + STATIC void +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -62,6 +62,7 @@ struct xfs_error_cfg { + struct xfs_inodegc { + struct llist_head list; + struct delayed_work work; ++ int error; + + /* approximate count of inodes in the list */ + unsigned int items; +--- a/fs/xfs/xfs_super.c ++++ b/fs/xfs/xfs_super.c +@@ -1089,6 +1089,7 @@ xfs_inodegc_init_percpu( + #endif + init_llist_head(&gc->list); + gc->items = 0; ++ gc->error = 0; + INIT_DELAYED_WORK(&gc->work, xfs_inodegc_worker); + } + return 0; +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -290,7 +290,9 @@ retry: + * Do not perform a synchronous scan because callers can hold + * other locks. + */ +- xfs_blockgc_flush_all(mp); ++ error = xfs_blockgc_flush_all(mp); ++ if (error) ++ return error; + want_retry = false; + goto retry; + } diff --git a/queue-6.1/xfs-correct-calculation-for-agend-and-blockcount.patch b/queue-6.1/xfs-correct-calculation-for-agend-and-blockcount.patch new file mode 100644 index 00000000000..66c37f35ccc --- /dev/null +++ b/queue-6.1/xfs-correct-calculation-for-agend-and-blockcount.patch @@ -0,0 +1,54 @@ +From stable+bounces-77036-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:21 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:46 -0700 +Subject: xfs: correct calculation for agend and blockcount +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Shiyang Ruan , "Darrick J. Wong" , Chandan Babu R , Leah Rumancik +Message-ID: <20240924183851.1901667-22-leah.rumancik@gmail.com> + +From: Shiyang Ruan + +[ Upstream commit 3c90c01e49342b166e5c90ec2c85b220be15a20e ] + +The agend should be "start + length - 1", then, blockcount should be +"end + 1 - start". Correct 2 calculation mistakes. + +Also, rename "agend" to "range_agend" because it's not the end of the AG +per se; it's the end of the dead region within an AG's agblock space. + +Fixes: 5cf32f63b0f4 ("xfs: fix the calculation for "end" and "length"") +Signed-off-by: Shiyang Ruan +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_notify_failure.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_notify_failure.c ++++ b/fs/xfs/xfs_notify_failure.c +@@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure( + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_agf *agf; +- xfs_agblock_t agend; + struct xfs_perag *pag; ++ xfs_agblock_t range_agend; + + pag = xfs_perag_get(mp, agno); + error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); +@@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure( + ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); + + agf = agf_bp->b_addr; +- agend = min(be32_to_cpu(agf->agf_length), ++ range_agend = min(be32_to_cpu(agf->agf_length) - 1, + ri_high.rm_startblock); + notify.startblock = ri_low.rm_startblock; +- notify.blockcount = agend - ri_low.rm_startblock; ++ notify.blockcount = range_agend + 1 - ri_low.rm_startblock; + + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_dax_failure_fn, ¬ify); diff --git a/queue-6.1/xfs-defered-work-could-create-precommits.patch b/queue-6.1/xfs-defered-work-could-create-precommits.patch new file mode 100644 index 00000000000..67bdef2511f --- /dev/null +++ b/queue-6.1/xfs-defered-work-could-create-precommits.patch @@ -0,0 +1,45 @@ +From stable+bounces-77027-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:58 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:37 -0700 +Subject: xfs: defered work could create precommits +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , "Darrick J. Wong" , Christoph Hellwig , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-13-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit cb042117488dbf0b3b38b05771639890fada9a52 ] + +To fix a AGI-AGF-inode cluster buffer deadlock, we need to move +inode cluster buffer operations to the ->iop_precommit() method. +However, this means that deferred operations can require precommits +to be run on the final transaction that the deferred ops pass back +to xfs_trans_commit() context. This will be exposed by attribute +handling, in that the last changes to the inode in the attr set +state machine "disappear" because the precommit operation is not run. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_trans.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/xfs/xfs_trans.c ++++ b/fs/xfs/xfs_trans.c +@@ -970,6 +970,11 @@ __xfs_trans_commit( + error = xfs_defer_finish_noroll(&tp); + if (error) + goto out_unreserve; ++ ++ /* Run precommits from final tx in defer chain. */ ++ error = xfs_trans_run_precommits(tp); ++ if (error) ++ goto out_unreserve; + } + + /* diff --git a/queue-6.1/xfs-don-t-use-bmbt-btree-split-workers-for-io-completion.patch b/queue-6.1/xfs-don-t-use-bmbt-btree-split-workers-for-io-completion.patch new file mode 100644 index 00000000000..2491749712e --- /dev/null +++ b/queue-6.1/xfs-don-t-use-bmbt-btree-split-workers-for-io-completion.patch @@ -0,0 +1,116 @@ +From stable+bounces-77019-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:25 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:29 -0700 +Subject: xfs: don't use BMBT btree split workers for IO completion +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-5-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit c85007e2e3942da1f9361e4b5a9388ea3a8dcc5b ] + +When we split a BMBT due to record insertion, we offload it to a +worker thread because we can be deep in the stack when we try to +allocate a new block for the BMBT. Allocation can use several +kilobytes of stack (full memory reclaim, swap and/or IO path can +end up on the stack during allocation) and we can already be several +kilobytes deep in the stack when we need to split the BMBT. + +A recent workload demonstrated a deadlock in this BMBT split +offload. It requires several things to happen at once: + +1. two inodes need a BMBT split at the same time, one must be +unwritten extent conversion from IO completion, the other must be +from extent allocation. + +2. there must be a no available xfs_alloc_wq worker threads +available in the worker pool. + +3. There must be sustained severe memory shortages such that new +kworker threads cannot be allocated to the xfs_alloc_wq pool for +both threads that need split work to be run + +4. The split work from the unwritten extent conversion must run +first. + +5. when the BMBT block allocation runs from the split work, it must +loop over all AGs and not be able to either trylock an AGF +successfully, or each AGF is is able to lock has no space available +for a single block allocation. + +6. The BMBT allocation must then attempt to lock the AGF that the +second task queued to the rescuer thread already has locked before +it finds an AGF it can allocate from. + +At this point, we have an ABBA deadlock between tasks queued on the +xfs_alloc_wq rescuer thread and a locked AGF. i.e. The queued task +holding the AGF lock can't be run by the rescuer thread until the +task the rescuer thread is runing gets the AGF lock.... + +This is a highly improbably series of events, but there it is. + +There's a couple of ways to fix this, but the easiest way to ensure +that we only punt tasks with a locked AGF that holds enough space +for the BMBT block allocations to the worker thread. + +This works for unwritten extent conversion in IO completion (which +doesn't have a locked AGF and space reservations) because we have +tight control over the IO completion stack. It is typically only 6 +functions deep when xfs_btree_split() is called because we've +already offloaded the IO completion work to a worker thread and +hence we don't need to worry about stack overruns here. + +The other place we can be called for a BMBT split without a +preceeding allocation is __xfs_bunmapi() when punching out the +center of an existing extent. We don't remove extents in the IO +path, so these operations don't tend to be called with a lot of +stack consumed. Hence we don't really need to ship the split off to +a worker thread in these cases, either. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_btree.c | 18 ++++++++++++++++-- + 1 file changed, 16 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_btree.c ++++ b/fs/xfs/libxfs/xfs_btree.c +@@ -2913,9 +2913,22 @@ xfs_btree_split_worker( + } + + /* +- * BMBT split requests often come in with little stack to work on. Push ++ * BMBT split requests often come in with little stack to work on so we push + * them off to a worker thread so there is lots of stack to use. For the other + * btree types, just call directly to avoid the context switch overhead here. ++ * ++ * Care must be taken here - the work queue rescuer thread introduces potential ++ * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new ++ * AGFs to allocate blocks. A task being run by the rescuer could attempt to ++ * lock an AGF that is already locked by a task queued to run by the rescuer, ++ * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to ++ * release it until the current thread it is running gains the lock. ++ * ++ * To avoid this issue, we only ever queue BMBT splits that don't have an AGF ++ * already locked to allocate from. The only place that doesn't hold an AGF ++ * locked is unwritten extent conversion at IO completion, but that has already ++ * been offloaded to a worker thread and hence has no stack consumption issues ++ * we have to worry about. + */ + STATIC int /* error */ + xfs_btree_split( +@@ -2929,7 +2942,8 @@ xfs_btree_split( + struct xfs_btree_split_args args; + DECLARE_COMPLETION_ONSTACK(done); + +- if (cur->bc_btnum != XFS_BTNUM_BMAP) ++ if (cur->bc_btnum != XFS_BTNUM_BMAP || ++ cur->bc_tp->t_firstblock == NULLFSBLOCK) + return __xfs_btree_split(cur, level, ptrp, key, curp, stat); + + args.cur = cur; diff --git a/queue-6.1/xfs-dquot-shrinker-doesn-t-check-for-xfs_dqflag_freeing.patch b/queue-6.1/xfs-dquot-shrinker-doesn-t-check-for-xfs_dqflag_freeing.patch new file mode 100644 index 00000000000..1a12688cd38 --- /dev/null +++ b/queue-6.1/xfs-dquot-shrinker-doesn-t-check-for-xfs_dqflag_freeing.patch @@ -0,0 +1,67 @@ +From stable+bounces-77016-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:10 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:26 -0700 +Subject: xfs: dquot shrinker doesn't check for XFS_DQFLAG_FREEING +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , syzbot+912776840162c13db1a3@syzkaller.appspotmail.com, "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-2-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 52f31ed228212ba572c44e15e818a3a5c74122c0 ] + +Resulting in a UAF if the shrinker races with some other dquot +freeing mechanism that sets XFS_DQFLAG_FREEING before the dquot is +removed from the LRU. This can occur if a dquot purge races with +drop_caches. + +Reported-by: syzbot+912776840162c13db1a3@syzkaller.appspotmail.com +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_qm.c | 16 ++++++++++++---- + 1 file changed, 12 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -423,6 +423,14 @@ xfs_qm_dquot_isolate( + goto out_miss_busy; + + /* ++ * If something else is freeing this dquot and hasn't yet removed it ++ * from the LRU, leave it for the freeing task to complete the freeing ++ * process rather than risk it being free from under us here. ++ */ ++ if (dqp->q_flags & XFS_DQFLAG_FREEING) ++ goto out_miss_unlock; ++ ++ /* + * This dquot has acquired a reference in the meantime remove it from + * the freelist and try again. + */ +@@ -441,10 +449,8 @@ xfs_qm_dquot_isolate( + * skip it so there is time for the IO to complete before we try to + * reclaim it again on the next LRU pass. + */ +- if (!xfs_dqflock_nowait(dqp)) { +- xfs_dqunlock(dqp); +- goto out_miss_busy; +- } ++ if (!xfs_dqflock_nowait(dqp)) ++ goto out_miss_unlock; + + if (XFS_DQ_IS_DIRTY(dqp)) { + struct xfs_buf *bp = NULL; +@@ -478,6 +484,8 @@ xfs_qm_dquot_isolate( + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims); + return LRU_REMOVED; + ++out_miss_unlock: ++ xfs_dqunlock(dqp); + out_miss_busy: + trace_xfs_dqreclaim_busy(dqp); + XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); diff --git a/queue-6.1/xfs-fix-ag-count-overflow-during-growfs.patch b/queue-6.1/xfs-fix-ag-count-overflow-during-growfs.patch new file mode 100644 index 00000000000..38647481cf7 --- /dev/null +++ b/queue-6.1/xfs-fix-ag-count-overflow-during-growfs.patch @@ -0,0 +1,108 @@ +From stable+bounces-77030-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:03 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:40 -0700 +Subject: xfs: fix ag count overflow during growfs +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Long Li , Long Li , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-16-leah.rumancik@gmail.com> + +From: Long Li + +[ Upstream commit c3b880acadc95d6e019eae5d669e072afda24f1b ] + +I found a corruption during growfs: + + XFS (loop0): Internal error agbno >= mp->m_sb.sb_agblocks at line 3661 of + file fs/xfs/libxfs/xfs_alloc.c. Caller __xfs_free_extent+0x28e/0x3c0 + CPU: 0 PID: 573 Comm: xfs_growfs Not tainted 6.3.0-rc7-next-20230420-00001-gda8c95746257 + Call Trace: + + dump_stack_lvl+0x50/0x70 + xfs_corruption_error+0x134/0x150 + __xfs_free_extent+0x2c1/0x3c0 + xfs_ag_extend_space+0x291/0x3e0 + xfs_growfs_data+0xd72/0xe90 + xfs_file_ioctl+0x5f9/0x14a0 + __x64_sys_ioctl+0x13e/0x1c0 + do_syscall_64+0x39/0x80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + XFS (loop0): Corruption detected. Unmount and run xfs_repair + XFS (loop0): Internal error xfs_trans_cancel at line 1097 of file + fs/xfs/xfs_trans.c. Caller xfs_growfs_data+0x691/0xe90 + CPU: 0 PID: 573 Comm: xfs_growfs Not tainted 6.3.0-rc7-next-20230420-00001-gda8c95746257 + Call Trace: + + dump_stack_lvl+0x50/0x70 + xfs_error_report+0x93/0xc0 + xfs_trans_cancel+0x2c0/0x350 + xfs_growfs_data+0x691/0xe90 + xfs_file_ioctl+0x5f9/0x14a0 + __x64_sys_ioctl+0x13e/0x1c0 + do_syscall_64+0x39/0x80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + RIP: 0033:0x7f2d86706577 + +The bug can be reproduced with the following sequence: + + # truncate -s 1073741824 xfs_test.img + # mkfs.xfs -f -b size=1024 -d agcount=4 xfs_test.img + # truncate -s 2305843009213693952 xfs_test.img + # mount -o loop xfs_test.img /mnt/test + # xfs_growfs -D 1125899907891200 /mnt/test + +The root cause is that during growfs, user space passed in a large value +of newblcoks to xfs_growfs_data_private(), due to current sb_agblocks is +too small, new AG count will exceed UINT_MAX. Because of AG number type +is unsigned int and it would overflow, that caused nagcount much smaller +than the actual value. During AG extent space, delta blocks in +xfs_resizefs_init_new_ags() will much larger than the actual value due to +incorrect nagcount, even exceed UINT_MAX. This will cause corruption and +be detected in __xfs_free_extent. Fix it by growing the filesystem to up +to the maximally allowed AGs and not return EINVAL when new AG count +overflow. + +Signed-off-by: Long Li +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_fs.h | 2 ++ + fs/xfs/xfs_fsops.c | 13 +++++++++---- + 2 files changed, 11 insertions(+), 4 deletions(-) + +--- a/fs/xfs/libxfs/xfs_fs.h ++++ b/fs/xfs/libxfs/xfs_fs.h +@@ -257,6 +257,8 @@ typedef struct xfs_fsop_resblks { + #define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE) + #define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE) + ++#define XFS_MAX_AGNUMBER ((xfs_agnumber_t)(NULLAGNUMBER - 1)) ++ + /* keep the maximum size under 2^31 by a small amount */ + #define XFS_MAX_LOG_BYTES \ + ((2 * 1024 * 1024 * 1024ULL) - XFS_MIN_LOG_BYTES) +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -115,11 +115,16 @@ xfs_growfs_data_private( + + nb_div = nb; + nb_mod = do_div(nb_div, mp->m_sb.sb_agblocks); +- nagcount = nb_div + (nb_mod != 0); +- if (nb_mod && nb_mod < XFS_MIN_AG_BLOCKS) { +- nagcount--; +- nb = (xfs_rfsblock_t)nagcount * mp->m_sb.sb_agblocks; ++ if (nb_mod && nb_mod >= XFS_MIN_AG_BLOCKS) ++ nb_div++; ++ else if (nb_mod) ++ nb = nb_div * mp->m_sb.sb_agblocks; ++ ++ if (nb_div > XFS_MAX_AGNUMBER + 1) { ++ nb_div = XFS_MAX_AGNUMBER + 1; ++ nb = nb_div * mp->m_sb.sb_agblocks; + } ++ nagcount = nb_div; + delta = nb - mp->m_sb.sb_dblocks; + /* + * Reject filesystems with a single AG because they are not diff --git a/queue-6.1/xfs-fix-agf-vs-inode-cluster-buffer-deadlock.patch b/queue-6.1/xfs-fix-agf-vs-inode-cluster-buffer-deadlock.patch new file mode 100644 index 00000000000..e8bac44cc9c --- /dev/null +++ b/queue-6.1/xfs-fix-agf-vs-inode-cluster-buffer-deadlock.patch @@ -0,0 +1,423 @@ +From stable+bounces-77028-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:01 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:38 -0700 +Subject: xfs: fix AGF vs inode cluster buffer deadlock +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Christoph Hellwig , "Darrick J. Wong" , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-14-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 82842fee6e5979ca7e2bf4d839ef890c22ffb7aa ] + +Lock order in XFS is AGI -> AGF, hence for operations involving +inode unlinked list operations we always lock the AGI first. Inode +unlinked list operations operate on the inode cluster buffer, +so the lock order there is AGI -> inode cluster buffer. + +For O_TMPFILE operations, this now means the lock order set down in +xfs_rename and xfs_link is AGI -> inode cluster buffer -> AGF as the +unlinked ops are done before the directory modifications that may +allocate space and lock the AGF. + +Unfortunately, we also now lock the inode cluster buffer when +logging an inode so that we can attach the inode to the cluster +buffer and pin it in memory. This creates a lock order of AGF -> +inode cluster buffer in directory operations as we have to log the +inode after we've allocated new space for it. + +This creates a lock inversion between the AGF and the inode cluster +buffer. Because the inode cluster buffer is shared across multiple +inodes, the inversion is not specific to individual inodes but can +occur when inodes in the same cluster buffer are accessed in +different orders. + +To fix this we need move all the inode log item cluster buffer +interactions to the end of the current transaction. Unfortunately, +xfs_trans_log_inode() calls are littered throughout the transactions +with no thought to ordering against other items or locking. This +makes it difficult to do anything that involves changing the call +sites of xfs_trans_log_inode() to change locking orders. + +However, we do now have a mechanism that allows is to postpone dirty +item processing to just before we commit the transaction: the +->iop_precommit method. This will be called after all the +modifications are done and high level objects like AGI and AGF +buffers have been locked and modified, thereby providing a mechanism +that guarantees we don't lock the inode cluster buffer before those +high level objects are locked. + +This change is largely moving the guts of xfs_trans_log_inode() to +xfs_inode_item_precommit() and providing an extra flag context in +the inode log item to track the dirty state of the inode in the +current transaction. This also means we do a lot less repeated work +in xfs_trans_log_inode() by only doing it once per transaction when +all the work is done. + +Fixes: 298f7bec503f ("xfs: pin inode backing buffer to the inode log item") +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_log_format.h | 9 ++ + fs/xfs/libxfs/xfs_trans_inode.c | 113 ++---------------------------- + fs/xfs/xfs_inode_item.c | 149 ++++++++++++++++++++++++++++++++++++++++ + fs/xfs/xfs_inode_item.h | 1 + 4 files changed, 166 insertions(+), 106 deletions(-) + +--- a/fs/xfs/libxfs/xfs_log_format.h ++++ b/fs/xfs/libxfs/xfs_log_format.h +@@ -324,7 +324,6 @@ struct xfs_inode_log_format_32 { + #define XFS_ILOG_DOWNER 0x200 /* change the data fork owner on replay */ + #define XFS_ILOG_AOWNER 0x400 /* change the attr fork owner on replay */ + +- + /* + * The timestamps are dirty, but not necessarily anything else in the inode + * core. Unlike the other fields above this one must never make it to disk +@@ -333,6 +332,14 @@ struct xfs_inode_log_format_32 { + */ + #define XFS_ILOG_TIMESTAMP 0x4000 + ++/* ++ * The version field has been changed, but not necessarily anything else of ++ * interest. This must never make it to disk - it is used purely to ensure that ++ * the inode item ->precommit operation can update the fsync flag triggers ++ * in the inode item correctly. ++ */ ++#define XFS_ILOG_IVERSION 0x8000 ++ + #define XFS_ILOG_NONCORE (XFS_ILOG_DDATA | XFS_ILOG_DEXT | \ + XFS_ILOG_DBROOT | XFS_ILOG_DEV | \ + XFS_ILOG_ADATA | XFS_ILOG_AEXT | \ +--- a/fs/xfs/libxfs/xfs_trans_inode.c ++++ b/fs/xfs/libxfs/xfs_trans_inode.c +@@ -40,9 +40,8 @@ xfs_trans_ijoin( + iip->ili_lock_flags = lock_flags; + ASSERT(!xfs_iflags_test(ip, XFS_ISTALE)); + +- /* +- * Get a log_item_desc to point at the new item. +- */ ++ /* Reset the per-tx dirty context and add the item to the tx. */ ++ iip->ili_dirty_flags = 0; + xfs_trans_add_item(tp, &iip->ili_item); + } + +@@ -76,17 +75,10 @@ xfs_trans_ichgtime( + /* + * This is called to mark the fields indicated in fieldmask as needing to be + * logged when the transaction is committed. The inode must already be +- * associated with the given transaction. +- * +- * The values for fieldmask are defined in xfs_inode_item.h. We always log all +- * of the core inode if any of it has changed, and we always log all of the +- * inline data/extents/b-tree root if any of them has changed. +- * +- * Grab and pin the cluster buffer associated with this inode to avoid RMW +- * cycles at inode writeback time. Avoid the need to add error handling to every +- * xfs_trans_log_inode() call by shutting down on read error. This will cause +- * transactions to fail and everything to error out, just like if we return a +- * read error in a dirty transaction and cancel it. ++ * associated with the given transaction. All we do here is record where the ++ * inode was dirtied and mark the transaction and inode log item dirty; ++ * everything else is done in the ->precommit log item operation after the ++ * changes in the transaction have been completed. + */ + void + xfs_trans_log_inode( +@@ -96,7 +88,6 @@ xfs_trans_log_inode( + { + struct xfs_inode_log_item *iip = ip->i_itemp; + struct inode *inode = VFS_I(ip); +- uint iversion_flags = 0; + + ASSERT(iip); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); +@@ -105,18 +96,6 @@ xfs_trans_log_inode( + tp->t_flags |= XFS_TRANS_DIRTY; + + /* +- * Don't bother with i_lock for the I_DIRTY_TIME check here, as races +- * don't matter - we either will need an extra transaction in 24 hours +- * to log the timestamps, or will clear already cleared fields in the +- * worst case. +- */ +- if (inode->i_state & I_DIRTY_TIME) { +- spin_lock(&inode->i_lock); +- inode->i_state &= ~I_DIRTY_TIME; +- spin_unlock(&inode->i_lock); +- } +- +- /* + * First time we log the inode in a transaction, bump the inode change + * counter if it is configured for this to occur. While we have the + * inode locked exclusively for metadata modification, we can usually +@@ -128,86 +107,10 @@ xfs_trans_log_inode( + if (!test_and_set_bit(XFS_LI_DIRTY, &iip->ili_item.li_flags)) { + if (IS_I_VERSION(inode) && + inode_maybe_inc_iversion(inode, flags & XFS_ILOG_CORE)) +- iversion_flags = XFS_ILOG_CORE; ++ flags |= XFS_ILOG_IVERSION; + } + +- /* +- * If we're updating the inode core or the timestamps and it's possible +- * to upgrade this inode to bigtime format, do so now. +- */ +- if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && +- xfs_has_bigtime(ip->i_mount) && +- !xfs_inode_has_bigtime(ip)) { +- ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; +- flags |= XFS_ILOG_CORE; +- } +- +- /* +- * Inode verifiers do not check that the extent size hint is an integer +- * multiple of the rt extent size on a directory with both rtinherit +- * and extszinherit flags set. If we're logging a directory that is +- * misconfigured in this way, clear the hint. +- */ +- if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && +- (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && +- (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { +- ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | +- XFS_DIFLAG_EXTSZINHERIT); +- ip->i_extsize = 0; +- flags |= XFS_ILOG_CORE; +- } +- +- /* +- * Record the specific change for fdatasync optimisation. This allows +- * fdatasync to skip log forces for inodes that are only timestamp +- * dirty. +- */ +- spin_lock(&iip->ili_lock); +- iip->ili_fsync_fields |= flags; +- +- if (!iip->ili_item.li_buf) { +- struct xfs_buf *bp; +- int error; +- +- /* +- * We hold the ILOCK here, so this inode is not going to be +- * flushed while we are here. Further, because there is no +- * buffer attached to the item, we know that there is no IO in +- * progress, so nothing will clear the ili_fields while we read +- * in the buffer. Hence we can safely drop the spin lock and +- * read the buffer knowing that the state will not change from +- * here. +- */ +- spin_unlock(&iip->ili_lock); +- error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); +- if (error) { +- xfs_force_shutdown(ip->i_mount, SHUTDOWN_META_IO_ERROR); +- return; +- } +- +- /* +- * We need an explicit buffer reference for the log item but +- * don't want the buffer to remain attached to the transaction. +- * Hold the buffer but release the transaction reference once +- * we've attached the inode log item to the buffer log item +- * list. +- */ +- xfs_buf_hold(bp); +- spin_lock(&iip->ili_lock); +- iip->ili_item.li_buf = bp; +- bp->b_flags |= _XBF_INODES; +- list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); +- xfs_trans_brelse(tp, bp); +- } +- +- /* +- * Always OR in the bits from the ili_last_fields field. This is to +- * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines +- * in the eventual clearing of the ili_fields bits. See the big comment +- * in xfs_iflush() for an explanation of this coordination mechanism. +- */ +- iip->ili_fields |= (flags | iip->ili_last_fields | iversion_flags); +- spin_unlock(&iip->ili_lock); ++ iip->ili_dirty_flags |= flags; + } + + int +--- a/fs/xfs/xfs_inode_item.c ++++ b/fs/xfs/xfs_inode_item.c +@@ -29,6 +29,153 @@ static inline struct xfs_inode_log_item + return container_of(lip, struct xfs_inode_log_item, ili_item); + } + ++static uint64_t ++xfs_inode_item_sort( ++ struct xfs_log_item *lip) ++{ ++ return INODE_ITEM(lip)->ili_inode->i_ino; ++} ++ ++/* ++ * Prior to finally logging the inode, we have to ensure that all the ++ * per-modification inode state changes are applied. This includes VFS inode ++ * state updates, format conversions, verifier state synchronisation and ++ * ensuring the inode buffer remains in memory whilst the inode is dirty. ++ * ++ * We have to be careful when we grab the inode cluster buffer due to lock ++ * ordering constraints. The unlinked inode modifications (xfs_iunlink_item) ++ * require AGI -> inode cluster buffer lock order. The inode cluster buffer is ++ * not locked until ->precommit, so it happens after everything else has been ++ * modified. ++ * ++ * Further, we have AGI -> AGF lock ordering, and with O_TMPFILE handling we ++ * have AGI -> AGF -> iunlink item -> inode cluster buffer lock order. Hence we ++ * cannot safely lock the inode cluster buffer in xfs_trans_log_inode() because ++ * it can be called on a inode (e.g. via bumplink/droplink) before we take the ++ * AGF lock modifying directory blocks. ++ * ++ * Rather than force a complete rework of all the transactions to call ++ * xfs_trans_log_inode() once and once only at the end of every transaction, we ++ * move the pinning of the inode cluster buffer to a ->precommit operation. This ++ * matches how the xfs_iunlink_item locks the inode cluster buffer, and it ++ * ensures that the inode cluster buffer locking is always done last in a ++ * transaction. i.e. we ensure the lock order is always AGI -> AGF -> inode ++ * cluster buffer. ++ * ++ * If we return the inode number as the precommit sort key then we'll also ++ * guarantee that the order all inode cluster buffer locking is the same all the ++ * inodes and unlink items in the transaction. ++ */ ++static int ++xfs_inode_item_precommit( ++ struct xfs_trans *tp, ++ struct xfs_log_item *lip) ++{ ++ struct xfs_inode_log_item *iip = INODE_ITEM(lip); ++ struct xfs_inode *ip = iip->ili_inode; ++ struct inode *inode = VFS_I(ip); ++ unsigned int flags = iip->ili_dirty_flags; ++ ++ /* ++ * Don't bother with i_lock for the I_DIRTY_TIME check here, as races ++ * don't matter - we either will need an extra transaction in 24 hours ++ * to log the timestamps, or will clear already cleared fields in the ++ * worst case. ++ */ ++ if (inode->i_state & I_DIRTY_TIME) { ++ spin_lock(&inode->i_lock); ++ inode->i_state &= ~I_DIRTY_TIME; ++ spin_unlock(&inode->i_lock); ++ } ++ ++ /* ++ * If we're updating the inode core or the timestamps and it's possible ++ * to upgrade this inode to bigtime format, do so now. ++ */ ++ if ((flags & (XFS_ILOG_CORE | XFS_ILOG_TIMESTAMP)) && ++ xfs_has_bigtime(ip->i_mount) && ++ !xfs_inode_has_bigtime(ip)) { ++ ip->i_diflags2 |= XFS_DIFLAG2_BIGTIME; ++ flags |= XFS_ILOG_CORE; ++ } ++ ++ /* ++ * Inode verifiers do not check that the extent size hint is an integer ++ * multiple of the rt extent size on a directory with both rtinherit ++ * and extszinherit flags set. If we're logging a directory that is ++ * misconfigured in this way, clear the hint. ++ */ ++ if ((ip->i_diflags & XFS_DIFLAG_RTINHERIT) && ++ (ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) && ++ (ip->i_extsize % ip->i_mount->m_sb.sb_rextsize) > 0) { ++ ip->i_diflags &= ~(XFS_DIFLAG_EXTSIZE | ++ XFS_DIFLAG_EXTSZINHERIT); ++ ip->i_extsize = 0; ++ flags |= XFS_ILOG_CORE; ++ } ++ ++ /* ++ * Record the specific change for fdatasync optimisation. This allows ++ * fdatasync to skip log forces for inodes that are only timestamp ++ * dirty. Once we've processed the XFS_ILOG_IVERSION flag, convert it ++ * to XFS_ILOG_CORE so that the actual on-disk dirty tracking ++ * (ili_fields) correctly tracks that the version has changed. ++ */ ++ spin_lock(&iip->ili_lock); ++ iip->ili_fsync_fields |= (flags & ~XFS_ILOG_IVERSION); ++ if (flags & XFS_ILOG_IVERSION) ++ flags = ((flags & ~XFS_ILOG_IVERSION) | XFS_ILOG_CORE); ++ ++ if (!iip->ili_item.li_buf) { ++ struct xfs_buf *bp; ++ int error; ++ ++ /* ++ * We hold the ILOCK here, so this inode is not going to be ++ * flushed while we are here. Further, because there is no ++ * buffer attached to the item, we know that there is no IO in ++ * progress, so nothing will clear the ili_fields while we read ++ * in the buffer. Hence we can safely drop the spin lock and ++ * read the buffer knowing that the state will not change from ++ * here. ++ */ ++ spin_unlock(&iip->ili_lock); ++ error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &bp); ++ if (error) ++ return error; ++ ++ /* ++ * We need an explicit buffer reference for the log item but ++ * don't want the buffer to remain attached to the transaction. ++ * Hold the buffer but release the transaction reference once ++ * we've attached the inode log item to the buffer log item ++ * list. ++ */ ++ xfs_buf_hold(bp); ++ spin_lock(&iip->ili_lock); ++ iip->ili_item.li_buf = bp; ++ bp->b_flags |= _XBF_INODES; ++ list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list); ++ xfs_trans_brelse(tp, bp); ++ } ++ ++ /* ++ * Always OR in the bits from the ili_last_fields field. This is to ++ * coordinate with the xfs_iflush() and xfs_buf_inode_iodone() routines ++ * in the eventual clearing of the ili_fields bits. See the big comment ++ * in xfs_iflush() for an explanation of this coordination mechanism. ++ */ ++ iip->ili_fields |= (flags | iip->ili_last_fields); ++ spin_unlock(&iip->ili_lock); ++ ++ /* ++ * We are done with the log item transaction dirty state, so clear it so ++ * that it doesn't pollute future transactions. ++ */ ++ iip->ili_dirty_flags = 0; ++ return 0; ++} ++ + /* + * The logged size of an inode fork is always the current size of the inode + * fork. This means that when an inode fork is relogged, the size of the logged +@@ -662,6 +809,8 @@ xfs_inode_item_committing( + } + + static const struct xfs_item_ops xfs_inode_item_ops = { ++ .iop_sort = xfs_inode_item_sort, ++ .iop_precommit = xfs_inode_item_precommit, + .iop_size = xfs_inode_item_size, + .iop_format = xfs_inode_item_format, + .iop_pin = xfs_inode_item_pin, +--- a/fs/xfs/xfs_inode_item.h ++++ b/fs/xfs/xfs_inode_item.h +@@ -17,6 +17,7 @@ struct xfs_inode_log_item { + struct xfs_log_item ili_item; /* common portion */ + struct xfs_inode *ili_inode; /* inode ptr */ + unsigned short ili_lock_flags; /* inode lock flags */ ++ unsigned int ili_dirty_flags; /* dirty in current tx */ + /* + * The ili_lock protects the interactions between the dirty state and + * the flush state of the inode log item. This allows us to do atomic diff --git a/queue-6.1/xfs-fix-bug_on-in-xfs_getbmap.patch b/queue-6.1/xfs-fix-bug_on-in-xfs_getbmap.patch new file mode 100644 index 00000000000..c624ae398a0 --- /dev/null +++ b/queue-6.1/xfs-fix-bug_on-in-xfs_getbmap.patch @@ -0,0 +1,102 @@ +From stable+bounces-77025-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:53 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:35 -0700 +Subject: xfs: fix BUG_ON in xfs_getbmap() +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Ye Bin , "Darrick J. Wong" , Dave Chinner , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-11-leah.rumancik@gmail.com> + +From: Ye Bin + +[ Upstream commit 8ee81ed581ff35882b006a5205100db0b57bf070 ] + +There's issue as follows: +XFS: Assertion failed: (bmv->bmv_iflags & BMV_IF_DELALLOC) != 0, file: fs/xfs/xfs_bmap_util.c, line: 329 +------------[ cut here ]------------ +kernel BUG at fs/xfs/xfs_message.c:102! +invalid opcode: 0000 [#1] PREEMPT SMP KASAN +CPU: 1 PID: 14612 Comm: xfs_io Not tainted 6.3.0-rc2-next-20230315-00006-g2729d23ddb3b-dirty #422 +RIP: 0010:assfail+0x96/0xa0 +RSP: 0018:ffffc9000fa178c0 EFLAGS: 00010246 +RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff888179a18000 +RDX: 0000000000000000 RSI: ffff888179a18000 RDI: 0000000000000002 +RBP: 0000000000000000 R08: ffffffff8321aab6 R09: 0000000000000000 +R10: 0000000000000001 R11: ffffed1105f85139 R12: ffffffff8aacc4c0 +R13: 0000000000000149 R14: ffff888269f58000 R15: 000000000000000c +FS: 00007f42f27a4740(0000) GS:ffff88882fc00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000b92388 CR3: 000000024f006000 CR4: 00000000000006e0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + + xfs_getbmap+0x1a5b/0x1e40 + xfs_ioc_getbmap+0x1fd/0x5b0 + xfs_file_ioctl+0x2cb/0x1d50 + __x64_sys_ioctl+0x197/0x210 + do_syscall_64+0x39/0xb0 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +Above issue may happen as follows: + ThreadA ThreadB +do_shared_fault + __do_fault + xfs_filemap_fault + __xfs_filemap_fault + filemap_fault + xfs_ioc_getbmap -> Without BMV_IF_DELALLOC flag + xfs_getbmap + xfs_ilock(ip, XFS_IOLOCK_SHARED); + filemap_write_and_wait + do_page_mkwrite + xfs_filemap_page_mkwrite + __xfs_filemap_fault + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + iomap_page_mkwrite + ... + xfs_buffered_write_iomap_begin + xfs_bmapi_reserve_delalloc -> Allocate delay extent + xfs_ilock_data_map_shared(ip) + xfs_getbmap_report_one + ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0) + -> trigger BUG_ON + +As xfs_filemap_page_mkwrite() only hold XFS_MMAPLOCK_SHARED lock, there's +small window mkwrite can produce delay extent after file write in xfs_getbmap(). +To solve above issue, just skip delalloc extents. + +Signed-off-by: Ye Bin +Reviewed-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_util.c | 14 ++++++-------- + 1 file changed, 6 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -314,15 +314,13 @@ xfs_getbmap_report_one( + if (isnullstartblock(got->br_startblock) || + got->br_startblock == DELAYSTARTBLOCK) { + /* +- * Delalloc extents that start beyond EOF can occur due to +- * speculative EOF allocation when the delalloc extent is larger +- * than the largest freespace extent at conversion time. These +- * extents cannot be converted by data writeback, so can exist +- * here even if we are not supposed to be finding delalloc +- * extents. ++ * Take the flush completion as being a point-in-time snapshot ++ * where there are no delalloc extents, and if any new ones ++ * have been created racily, just skip them as being 'after' ++ * the flush and so don't get reported. + */ +- if (got->br_startoff < XFS_B_TO_FSB(ip->i_mount, XFS_ISIZE(ip))) +- ASSERT((bmv->bmv_iflags & BMV_IF_DELALLOC) != 0); ++ if (!(bmv->bmv_iflags & BMV_IF_DELALLOC)) ++ return 0; + + p->bmv_oflags |= BMV_OF_DELALLOC; + p->bmv_block = -2; diff --git a/queue-6.1/xfs-fix-deadlock-on-xfs_inodegc_worker.patch b/queue-6.1/xfs-fix-deadlock-on-xfs_inodegc_worker.patch new file mode 100644 index 00000000000..97ba1e2190b --- /dev/null +++ b/queue-6.1/xfs-fix-deadlock-on-xfs_inodegc_worker.patch @@ -0,0 +1,141 @@ +From stable+bounces-77017-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:13 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:27 -0700 +Subject: xfs: Fix deadlock on xfs_inodegc_worker +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Wu Guanghao , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-3-leah.rumancik@gmail.com> + +From: Wu Guanghao + +[ Upstream commit 4da112513c01d7d0acf1025b8764349d46e177d6 ] + +We are doing a test about deleting a large number of files +when memory is low. A deadlock problem was found. + +[ 1240.279183] -> #1 (fs_reclaim){+.+.}-{0:0}: +[ 1240.280450] lock_acquire+0x197/0x460 +[ 1240.281548] fs_reclaim_acquire.part.0+0x20/0x30 +[ 1240.282625] kmem_cache_alloc+0x2b/0x940 +[ 1240.283816] xfs_trans_alloc+0x8a/0x8b0 +[ 1240.284757] xfs_inactive_ifree+0xe4/0x4e0 +[ 1240.285935] xfs_inactive+0x4e9/0x8a0 +[ 1240.286836] xfs_inodegc_worker+0x160/0x5e0 +[ 1240.287969] process_one_work+0xa19/0x16b0 +[ 1240.289030] worker_thread+0x9e/0x1050 +[ 1240.290131] kthread+0x34f/0x460 +[ 1240.290999] ret_from_fork+0x22/0x30 +[ 1240.291905] +[ 1240.291905] -> #0 ((work_completion)(&gc->work)){+.+.}-{0:0}: +[ 1240.293569] check_prev_add+0x160/0x2490 +[ 1240.294473] __lock_acquire+0x2c4d/0x5160 +[ 1240.295544] lock_acquire+0x197/0x460 +[ 1240.296403] __flush_work+0x6bc/0xa20 +[ 1240.297522] xfs_inode_mark_reclaimable+0x6f0/0xdc0 +[ 1240.298649] destroy_inode+0xc6/0x1b0 +[ 1240.299677] dispose_list+0xe1/0x1d0 +[ 1240.300567] prune_icache_sb+0xec/0x150 +[ 1240.301794] super_cache_scan+0x2c9/0x480 +[ 1240.302776] do_shrink_slab+0x3f0/0xaa0 +[ 1240.303671] shrink_slab+0x170/0x660 +[ 1240.304601] shrink_node+0x7f7/0x1df0 +[ 1240.305515] balance_pgdat+0x766/0xf50 +[ 1240.306657] kswapd+0x5bd/0xd20 +[ 1240.307551] kthread+0x34f/0x460 +[ 1240.308346] ret_from_fork+0x22/0x30 +[ 1240.309247] +[ 1240.309247] other info that might help us debug this: +[ 1240.309247] +[ 1240.310944] Possible unsafe locking scenario: +[ 1240.310944] +[ 1240.312379] CPU0 CPU1 +[ 1240.313363] ---- ---- +[ 1240.314433] lock(fs_reclaim); +[ 1240.315107] lock((work_completion)(&gc->work)); +[ 1240.316828] lock(fs_reclaim); +[ 1240.318088] lock((work_completion)(&gc->work)); +[ 1240.319203] +[ 1240.319203] *** DEADLOCK *** +... +[ 2438.431081] Workqueue: xfs-inodegc/sda xfs_inodegc_worker +[ 2438.432089] Call Trace: +[ 2438.432562] __schedule+0xa94/0x1d20 +[ 2438.435787] schedule+0xbf/0x270 +[ 2438.436397] schedule_timeout+0x6f8/0x8b0 +[ 2438.445126] wait_for_completion+0x163/0x260 +[ 2438.448610] __flush_work+0x4c4/0xa40 +[ 2438.455011] xfs_inode_mark_reclaimable+0x6ef/0xda0 +[ 2438.456695] destroy_inode+0xc6/0x1b0 +[ 2438.457375] dispose_list+0xe1/0x1d0 +[ 2438.458834] prune_icache_sb+0xe8/0x150 +[ 2438.461181] super_cache_scan+0x2b3/0x470 +[ 2438.461950] do_shrink_slab+0x3cf/0xa50 +[ 2438.462687] shrink_slab+0x17d/0x660 +[ 2438.466392] shrink_node+0x87e/0x1d40 +[ 2438.467894] do_try_to_free_pages+0x364/0x1300 +[ 2438.471188] try_to_free_pages+0x26c/0x5b0 +[ 2438.473567] __alloc_pages_slowpath.constprop.136+0x7aa/0x2100 +[ 2438.482577] __alloc_pages+0x5db/0x710 +[ 2438.485231] alloc_pages+0x100/0x200 +[ 2438.485923] allocate_slab+0x2c0/0x380 +[ 2438.486623] ___slab_alloc+0x41f/0x690 +[ 2438.490254] __slab_alloc+0x54/0x70 +[ 2438.491692] kmem_cache_alloc+0x23e/0x270 +[ 2438.492437] xfs_trans_alloc+0x88/0x880 +[ 2438.493168] xfs_inactive_ifree+0xe2/0x4e0 +[ 2438.496419] xfs_inactive+0x4eb/0x8b0 +[ 2438.497123] xfs_inodegc_worker+0x16b/0x5e0 +[ 2438.497918] process_one_work+0xbf7/0x1a20 +[ 2438.500316] worker_thread+0x8c/0x1060 +[ 2438.504938] ret_from_fork+0x22/0x30 + +When the memory is insufficient, xfs_inonodegc_worker will trigger memory +reclamation when memory is allocated, then flush_work() may be called to +wait for the work to complete. This causes a deadlock. + +So use memalloc_nofs_save() to avoid triggering memory reclamation in +xfs_inodegc_worker. + +Signed-off-by: Wu Guanghao +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1858,6 +1858,7 @@ xfs_inodegc_worker( + struct xfs_inodegc, work); + struct llist_node *node = llist_del_all(&gc->list); + struct xfs_inode *ip, *n; ++ unsigned int nofs_flag; + + ASSERT(gc->cpu == smp_processor_id()); + +@@ -1866,6 +1867,13 @@ xfs_inodegc_worker( + if (!node) + return; + ++ /* ++ * We can allocate memory here while doing writeback on behalf of ++ * memory reclaim. To avoid memory allocation deadlocks set the ++ * task-wide nofs context for the following operations. ++ */ ++ nofs_flag = memalloc_nofs_save(); ++ + ip = llist_entry(node, struct xfs_inode, i_gclist); + trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + +@@ -1874,6 +1882,8 @@ xfs_inodegc_worker( + xfs_iflags_set(ip, XFS_INACTIVATING); + xfs_inodegc_inactivate(ip); + } ++ ++ memalloc_nofs_restore(nofs_flag); + } + + /* diff --git a/queue-6.1/xfs-fix-extent-busy-updating.patch b/queue-6.1/xfs-fix-extent-busy-updating.patch new file mode 100644 index 00000000000..9a5283aefa0 --- /dev/null +++ b/queue-6.1/xfs-fix-extent-busy-updating.patch @@ -0,0 +1,35 @@ +From stable+bounces-77018-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:14 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:28 -0700 +Subject: xfs: fix extent busy updating +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Wengang Wang , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-4-leah.rumancik@gmail.com> + +From: Wengang Wang + +[ Upstream commit 601a27ea09a317d0fe2895df7d875381fb393041 ] + +In xfs_extent_busy_update_extent() case 6 and 7, whenever bno is modified on +extent busy, the relavent length has to be modified accordingly. + +Signed-off-by: Wengang Wang +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_extent_busy.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_extent_busy.c ++++ b/fs/xfs/xfs_extent_busy.c +@@ -236,6 +236,7 @@ xfs_extent_busy_update_extent( + * + */ + busyp->bno = fend; ++ busyp->length = bend - fend; + } else if (bbno < fbno) { + /* + * Case 8: diff --git a/queue-6.1/xfs-fix-low-space-alloc-deadlock.patch b/queue-6.1/xfs-fix-low-space-alloc-deadlock.patch new file mode 100644 index 00000000000..ab1a209a59f --- /dev/null +++ b/queue-6.1/xfs-fix-low-space-alloc-deadlock.patch @@ -0,0 +1,240 @@ +From stable+bounces-77020-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:31 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:30 -0700 +Subject: xfs: fix low space alloc deadlock +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Allison Henderson , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-6-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 1dd0510f6d4b85616a36aabb9be38389467122d9 ] + +I've recently encountered an ABBA deadlock with g/476. The upcoming +changes seem to make this much easier to hit, but the underlying +problem is a pre-existing one. + +Essentially, if we select an AG for allocation, then lock the AGF +and then fail to allocate for some reason (e.g. minimum length +requirements cannot be satisfied), then we drop out of the +allocation with the AGF still locked. + +The caller then modifies the allocation constraints - usually +loosening them up - and tries again. This can result in trying to +access AGFs that are lower than the AGF we already have locked from +the failed attempt. e.g. the failed attempt skipped several AGs +before failing, so we have locks an AG higher than the start AG. +Retrying the allocation from the start AG then causes us to violate +AGF lock ordering and this can lead to deadlocks. + +The deadlock exists even if allocation succeeds - we can do a +followup allocations in the same transaction for BMBT blocks that +aren't guaranteed to be in the same AG as the original, and can move +into higher AGs. Hence we really need to move the tp->t_firstblock +tracking down into xfs_alloc_vextent() where it can be set when we +exit with a locked AG. + +xfs_alloc_vextent() can also check there if the requested +allocation falls within the allow range of AGs set by +tp->t_firstblock. If we can't allocate within the range set, we have +to fail the allocation. If we are allowed to to non-blocking AGF +locking, we can ignore the AG locking order limitations as we can +use try-locks for the first iteration over requested AG range. + +This invalidates a set of post allocation asserts that check that +the allocation is always above tp->t_firstblock if it is set. +Because we can use try-locks to avoid the deadlock in some +circumstances, having a pre-existing locked AGF doesn't always +prevent allocation from lower order AGFs. Hence those ASSERTs need +to be removed. + +Signed-off-by: Dave Chinner +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_alloc.c | 69 ++++++++++++++++++++++++++++++++++++++-------- + fs/xfs/libxfs/xfs_bmap.c | 14 --------- + fs/xfs/xfs_trace.h | 1 + 3 files changed, 58 insertions(+), 26 deletions(-) + +--- a/fs/xfs/libxfs/xfs_alloc.c ++++ b/fs/xfs/libxfs/xfs_alloc.c +@@ -3164,10 +3164,13 @@ xfs_alloc_vextent( + xfs_alloctype_t type; /* input allocation type */ + int bump_rotor = 0; + xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ ++ xfs_agnumber_t minimum_agno = 0; + + mp = args->mp; + type = args->otype = args->type; + args->agbno = NULLAGBLOCK; ++ if (args->tp->t_firstblock != NULLFSBLOCK) ++ minimum_agno = XFS_FSB_TO_AGNO(mp, args->tp->t_firstblock); + /* + * Just fix this up, for the case where the last a.g. is shorter + * (or there's only one a.g.) and the caller couldn't easily figure +@@ -3201,6 +3204,13 @@ xfs_alloc_vextent( + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->pag = xfs_perag_get(mp, args->agno); ++ ++ if (minimum_agno > args->agno) { ++ trace_xfs_alloc_vextent_skip_deadlock(args); ++ error = 0; ++ break; ++ } ++ + error = xfs_alloc_fix_freelist(args, 0); + if (error) { + trace_xfs_alloc_vextent_nofix(args); +@@ -3232,6 +3242,8 @@ xfs_alloc_vextent( + case XFS_ALLOCTYPE_FIRST_AG: + /* + * Rotate through the allocation groups looking for a winner. ++ * If we are blocking, we must obey minimum_agno contraints for ++ * avoiding ABBA deadlocks on AGF locking. + */ + if (type == XFS_ALLOCTYPE_FIRST_AG) { + /* +@@ -3239,7 +3251,7 @@ xfs_alloc_vextent( + */ + args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); + args->type = XFS_ALLOCTYPE_THIS_AG; +- sagno = 0; ++ sagno = minimum_agno; + flags = 0; + } else { + /* +@@ -3248,6 +3260,7 @@ xfs_alloc_vextent( + args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); + flags = XFS_ALLOC_FLAG_TRYLOCK; + } ++ + /* + * Loop over allocation groups twice; first time with + * trylock set, second time without. +@@ -3276,19 +3289,21 @@ xfs_alloc_vextent( + if (args->agno == sagno && + type == XFS_ALLOCTYPE_START_BNO) + args->type = XFS_ALLOCTYPE_THIS_AG; ++ + /* +- * For the first allocation, we can try any AG to get +- * space. However, if we already have allocated a +- * block, we don't want to try AGs whose number is below +- * sagno. Otherwise, we may end up with out-of-order +- * locking of AGF, which might cause deadlock. +- */ ++ * If we are try-locking, we can't deadlock on AGF ++ * locks, so we can wrap all the way back to the first ++ * AG. Otherwise, wrap back to the start AG so we can't ++ * deadlock, and let the end of scan handler decide what ++ * to do next. ++ */ + if (++(args->agno) == mp->m_sb.sb_agcount) { +- if (args->tp->t_firstblock != NULLFSBLOCK) +- args->agno = sagno; +- else ++ if (flags & XFS_ALLOC_FLAG_TRYLOCK) + args->agno = 0; ++ else ++ args->agno = sagno; + } ++ + /* + * Reached the starting a.g., must either be done + * or switch to non-trylock mode. +@@ -3300,7 +3315,14 @@ xfs_alloc_vextent( + break; + } + ++ /* ++ * Blocking pass next, so we must obey minimum ++ * agno constraints to avoid ABBA AGF deadlocks. ++ */ + flags = 0; ++ if (minimum_agno > sagno) ++ sagno = minimum_agno; ++ + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); +@@ -3322,9 +3344,9 @@ xfs_alloc_vextent( + ASSERT(0); + /* NOTREACHED */ + } +- if (args->agbno == NULLAGBLOCK) ++ if (args->agbno == NULLAGBLOCK) { + args->fsbno = NULLFSBLOCK; +- else { ++ } else { + args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); + #ifdef DEBUG + ASSERT(args->len >= args->minlen); +@@ -3335,6 +3357,29 @@ xfs_alloc_vextent( + #endif + + } ++ ++ /* ++ * We end up here with a locked AGF. If we failed, the caller is likely ++ * going to try to allocate again with different parameters, and that ++ * can widen the AGs that are searched for free space. If we have to do ++ * BMBT block allocation, we have to do a new allocation. ++ * ++ * Hence leaving this function with the AGF locked opens up potential ++ * ABBA AGF deadlocks because a future allocation attempt in this ++ * transaction may attempt to lock a lower number AGF. ++ * ++ * We can't release the AGF until the transaction is commited, so at ++ * this point we must update the "firstblock" tracker to point at this ++ * AG if the tracker is empty or points to a lower AG. This allows the ++ * next allocation attempt to be modified appropriately to avoid ++ * deadlocks. ++ */ ++ if (args->agbp && ++ (args->tp->t_firstblock == NULLFSBLOCK || ++ args->pag->pag_agno > minimum_agno)) { ++ args->tp->t_firstblock = XFS_AGB_TO_FSB(mp, ++ args->pag->pag_agno, 0); ++ } + xfs_perag_put(args->pag); + return 0; + error0: +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -3413,21 +3413,7 @@ xfs_bmap_process_allocated_extent( + xfs_fileoff_t orig_offset, + xfs_extlen_t orig_length) + { +- int nullfb; +- +- nullfb = ap->tp->t_firstblock == NULLFSBLOCK; +- +- /* +- * check the allocation happened at the same or higher AG than +- * the first block that was allocated. +- */ +- ASSERT(nullfb || +- XFS_FSB_TO_AGNO(args->mp, ap->tp->t_firstblock) <= +- XFS_FSB_TO_AGNO(args->mp, args->fsbno)); +- + ap->blkno = args->fsbno; +- if (nullfb) +- ap->tp->t_firstblock = args->fsbno; + ap->length = args->len; + /* + * If the extent size hint is active, we tried to round the +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -1877,6 +1877,7 @@ DEFINE_ALLOC_EVENT(xfs_alloc_small_noten + DEFINE_ALLOC_EVENT(xfs_alloc_small_done); + DEFINE_ALLOC_EVENT(xfs_alloc_small_error); + DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs); ++DEFINE_ALLOC_EVENT(xfs_alloc_vextent_skip_deadlock); + DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix); + DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); + DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); diff --git a/queue-6.1/xfs-fix-negative-array-access-in-xfs_getbmap.patch b/queue-6.1/xfs-fix-negative-array-access-in-xfs_getbmap.patch new file mode 100644 index 00000000000..21a7ba6f185 --- /dev/null +++ b/queue-6.1/xfs-fix-negative-array-access-in-xfs_getbmap.patch @@ -0,0 +1,67 @@ +From stable+bounces-77034-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:17 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:44 -0700 +Subject: xfs: fix negative array access in xfs_getbmap +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , syzbot+c103d3808a0de5faaf80@syzkaller.appspotmail.com, Dave Chinner , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-20-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 1bba82fe1afac69c85c1f5ea137c8e73de3c8032 ] + +In commit 8ee81ed581ff, Ye Bin complained about an ASSERT in the bmapx +code that trips if we encounter a delalloc extent after flushing the +pagecache to disk. The ioctl code does not hold MMAPLOCK so it's +entirely possible that a racing write page fault can create a delalloc +extent after the file has been flushed. The proposed solution was to +replace the assertion with an early return that avoids filling out the +bmap recordset with a delalloc entry if the caller didn't ask for it. + +At the time, I recall thinking that the forward logic sounded ok, but +felt hesitant because I suspected that changing this code would cause +something /else/ to burst loose due to some other subtlety. + +syzbot of course found that subtlety. If all the extent mappings found +after the flush are delalloc mappings, we'll reach the end of the data +fork without ever incrementing bmv->bmv_entries. This is new, since +before we'd have emitted the delalloc mappings even though the caller +didn't ask for them. Once we reach the end, we'll try to set +BMV_OF_LAST on the -1st entry (because bmv_entries is zero) and go +corrupt something else in memory. Yay. + +I really dislike all these stupid patches that fiddle around with debug +code and break things that otherwise worked well enough. Nobody was +complaining that calling XFS_IOC_BMAPX without BMV_IF_DELALLOC would +return BMV_OF_DELALLOC records, and now we've gone from "weird behavior +that nobody cared about" to "bad behavior that must be addressed +immediately". + +Maybe I'll just ignore anything from Huawei from now on for my own sake. + +Reported-by: syzbot+c103d3808a0de5faaf80@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-xfs/20230412024907.GP360889@frogsfrogsfrogs/ +Fixes: 8ee81ed581ff ("xfs: fix BUG_ON in xfs_getbmap()") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_util.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -558,7 +558,9 @@ xfs_getbmap( + if (!xfs_iext_next_extent(ifp, &icur, &got)) { + xfs_fileoff_t end = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); + +- out[bmv->bmv_entries - 1].bmv_oflags |= BMV_OF_LAST; ++ if (bmv->bmv_entries > 0) ++ out[bmv->bmv_entries - 1].bmv_oflags |= ++ BMV_OF_LAST; + + if (whichfork != XFS_ATTR_FORK && bno < end && + !xfs_getbmap_full(bmv)) { diff --git a/queue-6.1/xfs-fix-reloading-entire-unlinked-bucket-lists.patch b/queue-6.1/xfs-fix-reloading-entire-unlinked-bucket-lists.patch new file mode 100644 index 00000000000..b0e6f94e751 --- /dev/null +++ b/queue-6.1/xfs-fix-reloading-entire-unlinked-bucket-lists.patch @@ -0,0 +1,199 @@ +From stable+bounces-77040-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:49 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:50 -0700 +Subject: xfs: fix reloading entire unlinked bucket lists +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-26-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 537c013b140d373d1ffe6290b841dc00e67effaa ] + +During review of the patcheset that provided reloading of the incore +iunlink list, Dave made a few suggestions, and I updated the copy in my +dev tree. Unfortunately, I then got distracted by ... who even knows +what ... and forgot to backport those changes from my dev tree to my +release candidate branch. I then sent multiple pull requests with stale +patches, and that's what was merged into -rc3. + +So. + +This patch re-adds the use of an unlocked iunlink list check to +determine if we want to allocate the resources to recreate the incore +list. Since lost iunlinked inodes are supposed to be rare, this change +helps us avoid paying the transaction and AGF locking costs every time +we open any inode. + +This also re-adds the shutdowns on failure, and re-applies the +restructuring of the inner loop in xfs_inode_reload_unlinked_bucket, and +re-adds a requested comment about the quotachecking code. + +Retain the original RVB tag from Dave since there's no code change from +the last submission. + +Fixes: 68b957f64fca1 ("xfs: load uncached unlinked inodes into memory on demand") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_export.c | 16 ++++++++++++---- + fs/xfs/xfs_inode.c | 48 +++++++++++++++++++++++++++++++++++------------- + fs/xfs/xfs_itable.c | 2 ++ + fs/xfs/xfs_qm.c | 15 ++++++++++++--- + 4 files changed, 61 insertions(+), 20 deletions(-) + +--- a/fs/xfs/xfs_export.c ++++ b/fs/xfs/xfs_export.c +@@ -146,10 +146,18 @@ xfs_nfs_get_inode( + return ERR_PTR(error); + } + +- error = xfs_inode_reload_unlinked(ip); +- if (error) { +- xfs_irele(ip); +- return ERR_PTR(error); ++ /* ++ * Reload the incore unlinked list to avoid failure in inodegc. ++ * Use an unlocked check here because unrecovered unlinked inodes ++ * should be somewhat rare. ++ */ ++ if (xfs_inode_unlinked_incomplete(ip)) { ++ error = xfs_inode_reload_unlinked(ip); ++ if (error) { ++ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ++ xfs_irele(ip); ++ return ERR_PTR(error); ++ } + } + + if (VFS_I(ip)->i_generation != generation) { +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1744,6 +1744,14 @@ xfs_inactive( + truncate = 1; + + if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { ++ /* ++ * If this inode is being inactivated during a quotacheck and ++ * has not yet been scanned by quotacheck, we /must/ remove ++ * the dquots from the inode before inactivation changes the ++ * block and inode counts. Most probably this is a result of ++ * reloading the incore iunlinked list to purge unrecovered ++ * unlinked inodes. ++ */ + xfs_qm_dqdetach(ip); + } else { + error = xfs_qm_dqattach(ip); +@@ -3657,6 +3665,16 @@ xfs_inode_reload_unlinked_bucket( + if (error) + return error; + ++ /* ++ * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the ++ * incore unlinked list pointers for this inode. Check once more to ++ * see if we raced with anyone else to reload the unlinked list. ++ */ ++ if (!xfs_inode_unlinked_incomplete(ip)) { ++ foundit = true; ++ goto out_agibp; ++ } ++ + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + agi = agibp->b_addr; + +@@ -3671,25 +3689,27 @@ xfs_inode_reload_unlinked_bucket( + while (next_agino != NULLAGINO) { + struct xfs_inode *next_ip = NULL; + ++ /* Found this caller's inode, set its backlink. */ + if (next_agino == agino) { +- /* Found this inode, set its backlink. */ + next_ip = ip; + next_ip->i_prev_unlinked = prev_agino; + foundit = true; ++ goto next_inode; + } +- if (!next_ip) { +- /* Inode already in memory. */ +- next_ip = xfs_iunlink_lookup(pag, next_agino); +- } +- if (!next_ip) { +- /* Inode not in memory, reload. */ +- error = xfs_iunlink_reload_next(tp, agibp, prev_agino, +- next_agino); +- if (error) +- break; + +- next_ip = xfs_iunlink_lookup(pag, next_agino); +- } ++ /* Try in-memory lookup first. */ ++ next_ip = xfs_iunlink_lookup(pag, next_agino); ++ if (next_ip) ++ goto next_inode; ++ ++ /* Inode not in memory, try reloading it. */ ++ error = xfs_iunlink_reload_next(tp, agibp, prev_agino, ++ next_agino); ++ if (error) ++ break; ++ ++ /* Grab the reloaded inode. */ ++ next_ip = xfs_iunlink_lookup(pag, next_agino); + if (!next_ip) { + /* No incore inode at all? We reloaded it... */ + ASSERT(next_ip != NULL); +@@ -3697,10 +3717,12 @@ xfs_inode_reload_unlinked_bucket( + break; + } + ++next_inode: + prev_agino = next_agino; + next_agino = next_ip->i_next_unlinked; + } + ++out_agibp: + xfs_trans_brelse(tp, agibp); + /* Should have found this inode somewhere in the iunlinked bucket. */ + if (!error && !foundit) +--- a/fs/xfs/xfs_itable.c ++++ b/fs/xfs/xfs_itable.c +@@ -80,10 +80,12 @@ xfs_bulkstat_one_int( + if (error) + goto out; + ++ /* Reload the incore unlinked list to avoid failure in inodegc. */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked_bucket(tp, ip); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); ++ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return error; + } +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1160,9 +1160,18 @@ xfs_qm_dqusage_adjust( + if (error) + return error; + +- error = xfs_inode_reload_unlinked(ip); +- if (error) +- goto error0; ++ /* ++ * Reload the incore unlinked list to avoid failure in inodegc. ++ * Use an unlocked check here because unrecovered unlinked inodes ++ * should be somewhat rare. ++ */ ++ if (xfs_inode_unlinked_incomplete(ip)) { ++ error = xfs_inode_reload_unlinked(ip); ++ if (error) { ++ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); ++ goto error0; ++ } ++ } + + ASSERT(ip->i_delayed_blks == 0); + diff --git a/queue-6.1/xfs-fix-the-calculation-for-end-and-length.patch b/queue-6.1/xfs-fix-the-calculation-for-end-and-length.patch new file mode 100644 index 00000000000..dc203b1582c --- /dev/null +++ b/queue-6.1/xfs-fix-the-calculation-for-end-and-length.patch @@ -0,0 +1,56 @@ +From stable+bounces-77032-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:11 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:42 -0700 +Subject: xfs: fix the calculation for "end" and "length" +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Shiyang Ruan , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-18-leah.rumancik@gmail.com> + +From: Shiyang Ruan + +[ Upstream commit 5cf32f63b0f4c520460c1a5dd915dc4f09085f29 ] + +The value of "end" should be "start + length - 1". + +Signed-off-by: Shiyang Ruan +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_notify_failure.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_notify_failure.c ++++ b/fs/xfs/xfs_notify_failure.c +@@ -114,7 +114,8 @@ xfs_dax_notify_ddev_failure( + int error = 0; + xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr); + xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno); +- xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, daddr + bblen); ++ xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp, ++ daddr + bblen - 1); + xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno); + + error = xfs_trans_alloc_empty(mp, &tp); +@@ -210,7 +211,7 @@ xfs_dax_notify_failure( + ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1; + + /* Ignore the range out of filesystem area */ +- if (offset + len < ddev_start) ++ if (offset + len - 1 < ddev_start) + return -ENXIO; + if (offset > ddev_end) + return -ENXIO; +@@ -222,8 +223,8 @@ xfs_dax_notify_failure( + len -= ddev_start - offset; + offset = 0; + } +- if (offset + len > ddev_end) +- len -= ddev_end - offset; ++ if (offset + len - 1 > ddev_end) ++ len = ddev_end - offset + 1; + + return xfs_dax_notify_ddev_failure(mp, BTOBB(offset), BTOBB(len), + mf_flags); diff --git a/queue-6.1/xfs-fix-uninitialized-variable-access.patch b/queue-6.1/xfs-fix-uninitialized-variable-access.patch new file mode 100644 index 00000000000..96ee1ddc10e --- /dev/null +++ b/queue-6.1/xfs-fix-uninitialized-variable-access.patch @@ -0,0 +1,36 @@ +From stable+bounces-77023-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:45 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:33 -0700 +Subject: xfs: fix uninitialized variable access +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , syzbot+090ae72d552e6bd93cfe@syzkaller.appspotmail.com, Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-9-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 60b730a40c43fbcc034970d3e77eb0f25b8cc1cf ] + +If the end position of a GETFSMAP query overlaps an allocated space and +we're using the free space info to generate fsmap info, the akeys +information gets fed into the fsmap formatter with bad results. +Zero-init the space. + +Reported-by: syzbot+090ae72d552e6bd93cfe@syzkaller.appspotmail.com +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_fsmap.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_fsmap.c ++++ b/fs/xfs/xfs_fsmap.c +@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( + { + struct xfs_alloc_rec_incore akeys[2]; + ++ memset(akeys, 0, sizeof(akeys)); + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_bnobt_query, &akeys[0]); diff --git a/queue-6.1/xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch b/queue-6.1/xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch new file mode 100644 index 00000000000..b647feeb848 --- /dev/null +++ b/queue-6.1/xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch @@ -0,0 +1,180 @@ +From stable+bounces-77035-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:20 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:45 -0700 +Subject: xfs: fix unlink vs cluster buffer instantiation race +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Luis Chamberlain , Christoph Hellwig , "Darrick J. Wong" , Chandan Babu R , Leah Rumancik +Message-ID: <20240924183851.1901667-21-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 348a1983cf4cf5099fc398438a968443af4c9f65 ] + +Luis has been reporting an assert failure when freeing an inode +cluster during inode inactivation for a while. The assert looks +like: + + XFS: Assertion failed: bp->b_flags & XBF_DONE, file: fs/xfs/xfs_trans_buf.c, line: 241 + ------------[ cut here ]------------ + kernel BUG at fs/xfs/xfs_message.c:102! + Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI + CPU: 4 PID: 73 Comm: kworker/4:1 Not tainted 6.10.0-rc1 #4 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 + Workqueue: xfs-inodegc/loop5 xfs_inodegc_worker [xfs] + RIP: 0010:assfail (fs/xfs/xfs_message.c:102) xfs + RSP: 0018:ffff88810188f7f0 EFLAGS: 00010202 + RAX: 0000000000000000 RBX: ffff88816e748250 RCX: 1ffffffff844b0e7 + RDX: 0000000000000004 RSI: ffff88810188f558 RDI: ffffffffc2431fa0 + RBP: 1ffff11020311f01 R08: 0000000042431f9f R09: ffffed1020311e9b + R10: ffff88810188f4df R11: ffffffffac725d70 R12: ffff88817a3f4000 + R13: ffff88812182f000 R14: ffff88810188f998 R15: ffffffffc2423f80 + FS: 0000000000000000(0000) GS:ffff8881c8400000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000055fe9d0f109c CR3: 000000014426c002 CR4: 0000000000770ef0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 + PKRU: 55555554 + Call Trace: + + xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:241 (discriminator 1)) xfs + xfs_imap_to_bp (fs/xfs/xfs_trans.h:210 fs/xfs/libxfs/xfs_inode_buf.c:138) xfs + xfs_inode_item_precommit (fs/xfs/xfs_inode_item.c:145) xfs + xfs_trans_run_precommits (fs/xfs/xfs_trans.c:931) xfs + __xfs_trans_commit (fs/xfs/xfs_trans.c:966) xfs + xfs_inactive_ifree (fs/xfs/xfs_inode.c:1811) xfs + xfs_inactive (fs/xfs/xfs_inode.c:2013) xfs + xfs_inodegc_worker (fs/xfs/xfs_icache.c:1841 fs/xfs/xfs_icache.c:1886) xfs + process_one_work (kernel/workqueue.c:3231) + worker_thread (kernel/workqueue.c:3306 (discriminator 2) kernel/workqueue.c:3393 (discriminator 2)) + kthread (kernel/kthread.c:389) + ret_from_fork (arch/x86/kernel/process.c:147) + ret_from_fork_asm (arch/x86/entry/entry_64.S:257) + + +And occurs when the the inode precommit handlers is attempt to look +up the inode cluster buffer to attach the inode for writeback. + +The trail of logic that I can reconstruct is as follows. + + 1. the inode is clean when inodegc runs, so it is not + attached to a cluster buffer when precommit runs. + + 2. #1 implies the inode cluster buffer may be clean and not + pinned by dirty inodes when inodegc runs. + + 3. #2 implies that the inode cluster buffer can be reclaimed + by memory pressure at any time. + + 4. The assert failure implies that the cluster buffer was + attached to the transaction, but not marked done. It had + been accessed earlier in the transaction, but not marked + done. + + 5. #4 implies the cluster buffer has been invalidated (i.e. + marked stale). + + 6. #5 implies that the inode cluster buffer was instantiated + uninitialised in the transaction in xfs_ifree_cluster(), + which only instantiates the buffers to invalidate them + and never marks them as done. + +Given factors 1-3, this issue is highly dependent on timing and +environmental factors. Hence the issue can be very difficult to +reproduce in some situations, but highly reliable in others. Luis +has an environment where it can be reproduced easily by g/531 but, +OTOH, I've reproduced it only once in ~2000 cycles of g/531. + +I think the fix is to have xfs_ifree_cluster() set the XBF_DONE flag +on the cluster buffers, even though they may not be initialised. The +reasons why I think this is safe are: + + 1. A buffer cache lookup hit on a XBF_STALE buffer will + clear the XBF_DONE flag. Hence all future users of the + buffer know they have to re-initialise the contents + before use and mark it done themselves. + + 2. xfs_trans_binval() sets the XFS_BLI_STALE flag, which + means the buffer remains locked until the journal commit + completes and the buffer is unpinned. Hence once marked + XBF_STALE/XFS_BLI_STALE by xfs_ifree_cluster(), the only + context that can access the freed buffer is the currently + running transaction. + + 3. #2 implies that future buffer lookups in the currently + running transaction will hit the transaction match code + and not the buffer cache. Hence XBF_STALE and + XFS_BLI_STALE will not be cleared unless the transaction + initialises and logs the buffer with valid contents + again. At which point, the buffer will be marked marked + XBF_DONE again, so having XBF_DONE already set on the + stale buffer is a moot point. + + 4. #2 also implies that any concurrent access to that + cluster buffer will block waiting on the buffer lock + until the inode cluster has been fully freed and is no + longer an active inode cluster buffer. + + 5. #4 + #1 means that any future user of the disk range of + that buffer will always see the range of disk blocks + covered by the cluster buffer as not done, and hence must + initialise the contents themselves. + + 6. Setting XBF_DONE in xfs_ifree_cluster() then means the + unlinked inode precommit code will see a XBF_DONE buffer + from the transaction match as it expects. It can then + attach the stale but newly dirtied inode to the stale + but newly dirtied cluster buffer without unexpected + failures. The stale buffer will then sail through the + journal and do the right thing with the attached stale + inode during unpin. + +Hence the fix is just one line of extra code. The explanation of +why we have to set XBF_DONE in xfs_ifree_cluster, OTOH, is long and +complex.... + +Fixes: 82842fee6e59 ("xfs: fix AGF vs inode cluster buffer deadlock") +Signed-off-by: Dave Chinner +Tested-by: Luis Chamberlain +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 23 +++++++++++++++++++---- + 1 file changed, 19 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2297,11 +2297,26 @@ xfs_ifree_cluster( + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to +- * attach stale cached inodes on it. That means it will never be +- * dispatched for IO. If it is, we want to know about it, and we +- * want it to fail. We can acheive this by adding a write +- * verifier to the buffer. ++ * attach stale cached inodes on it. ++ * ++ * For the inode that triggered the cluster freeing, this ++ * attachment may occur in xfs_inode_item_precommit() after we ++ * have marked this buffer stale. If this buffer was not in ++ * memory before xfs_ifree_cluster() started, it will not be ++ * marked XBF_DONE and this will cause problems later in ++ * xfs_inode_item_precommit() when we trip over a (stale, !done) ++ * buffer to attached to the transaction. ++ * ++ * Hence we have to mark the buffer as XFS_DONE here. This is ++ * safe because we are also marking the buffer as XBF_STALE and ++ * XFS_BLI_STALE. That means it will never be dispatched for ++ * IO and it won't be unlocked until the cluster freeing has ++ * been committed to the journal and the buffer unpinned. If it ++ * is written, we want to know about it, and we want it to ++ * fail. We can acheive this by adding a write verifier to the ++ * buffer. + */ ++ bp->b_flags |= XBF_DONE; + bp->b_ops = &xfs_inode_buf_ops; + + /* diff --git a/queue-6.1/xfs-journal-geometry-is-not-properly-bounds-checked.patch b/queue-6.1/xfs-journal-geometry-is-not-properly-bounds-checked.patch new file mode 100644 index 00000000000..a61a9f853ce --- /dev/null +++ b/queue-6.1/xfs-journal-geometry-is-not-properly-bounds-checked.patch @@ -0,0 +1,219 @@ +From stable+bounces-77744-greg=kroah.com@vger.kernel.org Thu Sep 26 02:57:23 2024 +From: Leah Rumancik +Date: Wed, 25 Sep 2024 17:57:05 -0700 +Subject: xfs: journal geometry is not properly bounds checked +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Christoph Hellwig , "Darrick J. Wong" , Leah Rumancik +Message-ID: <20240926005705.2896598-1-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit f1e1765aad7de7a8b8102044fc6a44684bc36180 ] + +If the journal geometry results in a sector or log stripe unit +validation problem, it indicates that we cannot set the log up to +safely write to the the journal. In these cases, we must abort the +mount because the corruption needs external intervention to resolve. +Similarly, a journal that is too large cannot be written to safely, +either, so we shouldn't allow those geometries to mount, either. + +If the log is too small, we risk having transaction reservations +overruning the available log space and the system hanging waiting +for space it can never provide. This is purely a runtime hang issue, +not a corruption issue as per the first cases listed above. We abort +mounts of the log is too small for V5 filesystems, but we must allow +v4 filesystems to mount because, historically, there was no log size +validity checking and so some systems may still be out there with +undersized logs. + +The problem is that on V4 filesystems, when we discover a log +geometry problem, we skip all the remaining checks and then allow +the log to continue mounting. This mean that if one of the log size +checks fails, we skip the log stripe unit check. i.e. we allow the +mount because a "non-fatal" geometry is violated, and then fail to +check the hard fail geometries that should fail the mount. + +Move all these fatal checks to the superblock verifier, and add a +new check for the two log sector size geometry variables having the +same values. This will prevent any attempt to mount a log that has +invalid or inconsistent geometries long before we attempt to mount +the log. + +However, for the minimum log size checks, we can only do that once +we've setup up the log and calculated all the iclog sizes and +roundoffs. Hence this needs to remain in the log mount code after +the log has been initialised. It is also the only case where we +should allow a v4 filesystem to continue running, so leave that +handling in place, too. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Signed-off-by: Greg Kroah-Hartman +--- + +Notes: + A new fix for the latest 6.1.y backport series just came in. Ran + some tests on it as well and all looks good. Please include with + the original set. + + Thanks, + Leah + + fs/xfs/libxfs/xfs_sb.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++- + fs/xfs/xfs_log.c | 47 +++++++++++++---------------------------- + 2 files changed, 70 insertions(+), 33 deletions(-) + +--- a/fs/xfs/libxfs/xfs_sb.c ++++ b/fs/xfs/libxfs/xfs_sb.c +@@ -413,7 +413,6 @@ xfs_validate_sb_common( + sbp->sb_inodelog < XFS_DINODE_MIN_LOG || + sbp->sb_inodelog > XFS_DINODE_MAX_LOG || + sbp->sb_inodesize != (1 << sbp->sb_inodelog) || +- sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || + sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || + XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || + XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || +@@ -431,6 +430,61 @@ xfs_validate_sb_common( + return -EFSCORRUPTED; + } + ++ /* ++ * Logs that are too large are not supported at all. Reject them ++ * outright. Logs that are too small are tolerated on v4 filesystems, ++ * but we can only check that when mounting the log. Hence we skip ++ * those checks here. ++ */ ++ if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { ++ xfs_notice(mp, ++ "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", ++ sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); ++ return -EFSCORRUPTED; ++ } ++ ++ if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { ++ xfs_warn(mp, ++ "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", ++ XFS_FSB_TO_B(mp, sbp->sb_logblocks), ++ XFS_MAX_LOG_BYTES); ++ return -EFSCORRUPTED; ++ } ++ ++ /* ++ * Do not allow filesystems with corrupted log sector or stripe units to ++ * be mounted. We cannot safely size the iclogs or write to the log if ++ * the log stripe unit is not valid. ++ */ ++ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { ++ if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { ++ xfs_notice(mp, ++ "log sector size in bytes/log2 (0x%x/0x%x) must match", ++ sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); ++ return -EFSCORRUPTED; ++ } ++ } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { ++ xfs_notice(mp, ++ "log sector size in bytes/log2 (0x%x/0x%x) are not zero", ++ sbp->sb_logsectsize, sbp->sb_logsectlog); ++ return -EFSCORRUPTED; ++ } ++ ++ if (sbp->sb_logsunit > 1) { ++ if (sbp->sb_logsunit % sbp->sb_blocksize) { ++ xfs_notice(mp, ++ "log stripe unit 0x%x bytes must be a multiple of block size", ++ sbp->sb_logsunit); ++ return -EFSCORRUPTED; ++ } ++ if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { ++ xfs_notice(mp, ++ "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", ++ sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); ++ return -EFSCORRUPTED; ++ } ++ } ++ + /* Validate the realtime geometry; stolen from xfs_repair */ + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -639,7 +639,6 @@ xfs_log_mount( + int num_bblks) + { + struct xlog *log; +- bool fatal = xfs_has_crc(mp); + int error = 0; + int min_logfsbs; + +@@ -661,53 +660,37 @@ xfs_log_mount( + mp->m_log = log; + + /* +- * Validate the given log space and drop a critical message via syslog +- * if the log size is too small that would lead to some unexpected +- * situations in transaction log space reservation stage. ++ * Now that we have set up the log and it's internal geometry ++ * parameters, we can validate the given log space and drop a critical ++ * message via syslog if the log size is too small. A log that is too ++ * small can lead to unexpected situations in transaction log space ++ * reservation stage. The superblock verifier has already validated all ++ * the other log geometry constraints, so we don't have to check those ++ * here. + * +- * Note: we can't just reject the mount if the validation fails. This +- * would mean that people would have to downgrade their kernel just to +- * remedy the situation as there is no way to grow the log (short of +- * black magic surgery with xfs_db). ++ * Note: For v4 filesystems, we can't just reject the mount if the ++ * validation fails. This would mean that people would have to ++ * downgrade their kernel just to remedy the situation as there is no ++ * way to grow the log (short of black magic surgery with xfs_db). + * +- * We can, however, reject mounts for CRC format filesystems, as the ++ * We can, however, reject mounts for V5 format filesystems, as the + * mkfs binary being used to make the filesystem should never create a + * filesystem with a log that is too small. + */ + min_logfsbs = xfs_log_calc_minimum_size(mp); +- + if (mp->m_sb.sb_logblocks < min_logfsbs) { + xfs_warn(mp, + "Log size %d blocks too small, minimum size is %d blocks", + mp->m_sb.sb_logblocks, min_logfsbs); +- error = -EINVAL; +- } else if (mp->m_sb.sb_logblocks > XFS_MAX_LOG_BLOCKS) { +- xfs_warn(mp, +- "Log size %d blocks too large, maximum size is %lld blocks", +- mp->m_sb.sb_logblocks, XFS_MAX_LOG_BLOCKS); +- error = -EINVAL; +- } else if (XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks) > XFS_MAX_LOG_BYTES) { +- xfs_warn(mp, +- "log size %lld bytes too large, maximum size is %lld bytes", +- XFS_FSB_TO_B(mp, mp->m_sb.sb_logblocks), +- XFS_MAX_LOG_BYTES); +- error = -EINVAL; +- } else if (mp->m_sb.sb_logsunit > 1 && +- mp->m_sb.sb_logsunit % mp->m_sb.sb_blocksize) { +- xfs_warn(mp, +- "log stripe unit %u bytes must be a multiple of block size", +- mp->m_sb.sb_logsunit); +- error = -EINVAL; +- fatal = true; +- } +- if (error) { ++ + /* + * Log check errors are always fatal on v5; or whenever bad + * metadata leads to a crash. + */ +- if (fatal) { ++ if (xfs_has_crc(mp)) { + xfs_crit(mp, "AAIEEE! Log failed size checks. Abort!"); + ASSERT(0); ++ error = -EINVAL; + goto out_free_log; + } + xfs_crit(mp, "Log size out of supported range."); diff --git a/queue-6.1/xfs-load-uncached-unlinked-inodes-into-memory-on-demand.patch b/queue-6.1/xfs-load-uncached-unlinked-inodes-into-memory-on-demand.patch new file mode 100644 index 00000000000..04fe8c82797 --- /dev/null +++ b/queue-6.1/xfs-load-uncached-unlinked-inodes-into-memory-on-demand.patch @@ -0,0 +1,218 @@ +From stable+bounces-77033-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:13 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:43 -0700 +Subject: xfs: load uncached unlinked inodes into memory on demand +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , shrikanth hegde , Ritesh Harjani , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-19-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 68b957f64fca1930164bfc6d6d379acdccd547d7 ] + +shrikanth hegde reports that filesystems fail shortly after mount with +the following failure: + + WARNING: CPU: 56 PID: 12450 at fs/xfs/xfs_inode.c:1839 xfs_iunlink_lookup+0x58/0x80 [xfs] + +This of course is the WARN_ON_ONCE in xfs_iunlink_lookup: + + ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ... } + +>From diagnostic data collected by the bug reporters, it would appear +that we cleanly mounted a filesystem that contained unlinked inodes. +Unlinked inodes are only processed as a final step of log recovery, +which means that clean mounts do not process the unlinked list at all. + +Prior to the introduction of the incore unlinked lists, this wasn't a +problem because the unlink code would (very expensively) traverse the +entire ondisk metadata iunlink chain to keep things up to date. +However, the incore unlinked list code complains when it realizes that +it is out of sync with the ondisk metadata and shuts down the fs, which +is bad. + +Ritesh proposed to solve this problem by unconditionally parsing the +unlinked lists at mount time, but this imposes a mount time cost for +every filesystem to catch something that should be very infrequent. +Instead, let's target the places where we can encounter a next_unlinked +pointer that refers to an inode that is not in cache, and load it into +cache. + +Note: This patch does not address the problem of iget loading an inode +from the middle of the iunlink list and needing to set i_prev_unlinked +correctly. + +Reported-by: shrikanth hegde +Triaged-by: Ritesh Harjani +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++---- + fs/xfs/xfs_trace.h | 25 ++++++++++++++++ + 2 files changed, 100 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1829,12 +1829,17 @@ xfs_iunlink_lookup( + + rcu_read_lock(); + ip = radix_tree_lookup(&pag->pag_ici_root, agino); ++ if (!ip) { ++ /* Caller can handle inode not being in memory. */ ++ rcu_read_unlock(); ++ return NULL; ++ } + + /* +- * Inode not in memory or in RCU freeing limbo should not happen. +- * Warn about this and let the caller handle the failure. ++ * Inode in RCU freeing limbo should not happen. Warn about this and ++ * let the caller handle the failure. + */ +- if (WARN_ON_ONCE(!ip || !ip->i_ino)) { ++ if (WARN_ON_ONCE(!ip->i_ino)) { + rcu_read_unlock(); + return NULL; + } +@@ -1843,7 +1848,10 @@ xfs_iunlink_lookup( + return ip; + } + +-/* Update the prev pointer of the next agino. */ ++/* ++ * Update the prev pointer of the next agino. Returns -ENOLINK if the inode ++ * is not in cache. ++ */ + static int + xfs_iunlink_update_backref( + struct xfs_perag *pag, +@@ -1858,7 +1866,8 @@ xfs_iunlink_update_backref( + + ip = xfs_iunlink_lookup(pag, next_agino); + if (!ip) +- return -EFSCORRUPTED; ++ return -ENOLINK; ++ + ip->i_prev_unlinked = prev_agino; + return 0; + } +@@ -1902,6 +1911,62 @@ xfs_iunlink_update_bucket( + return 0; + } + ++/* ++ * Load the inode @next_agino into the cache and set its prev_unlinked pointer ++ * to @prev_agino. Caller must hold the AGI to synchronize with other changes ++ * to the unlinked list. ++ */ ++STATIC int ++xfs_iunlink_reload_next( ++ struct xfs_trans *tp, ++ struct xfs_buf *agibp, ++ xfs_agino_t prev_agino, ++ xfs_agino_t next_agino) ++{ ++ struct xfs_perag *pag = agibp->b_pag; ++ struct xfs_mount *mp = pag->pag_mount; ++ struct xfs_inode *next_ip = NULL; ++ xfs_ino_t ino; ++ int error; ++ ++ ASSERT(next_agino != NULLAGINO); ++ ++#ifdef DEBUG ++ rcu_read_lock(); ++ next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); ++ ASSERT(next_ip == NULL); ++ rcu_read_unlock(); ++#endif ++ ++ xfs_info_ratelimited(mp, ++ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", ++ next_agino, pag->pag_agno); ++ ++ /* ++ * Use an untrusted lookup just to be cautious in case the AGI has been ++ * corrupted and now points at a free inode. That shouldn't happen, ++ * but we'd rather shut down now since we're already running in a weird ++ * situation. ++ */ ++ ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); ++ error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); ++ if (error) ++ return error; ++ ++ /* If this is not an unlinked inode, something is very wrong. */ ++ if (VFS_I(next_ip)->i_nlink != 0) { ++ error = -EFSCORRUPTED; ++ goto rele; ++ } ++ ++ next_ip->i_prev_unlinked = prev_agino; ++ trace_xfs_iunlink_reload_next(next_ip); ++rele: ++ ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); ++ xfs_irele(next_ip); ++ return error; ++} ++ + static int + xfs_iunlink_insert_inode( + struct xfs_trans *tp, +@@ -1933,6 +1998,8 @@ xfs_iunlink_insert_inode( + * inode. + */ + error = xfs_iunlink_update_backref(pag, agino, next_agino); ++ if (error == -ENOLINK) ++ error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); + if (error) + return error; + +@@ -2027,6 +2094,9 @@ xfs_iunlink_remove_inode( + */ + error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, + ip->i_next_unlinked); ++ if (error == -ENOLINK) ++ error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, ++ ip->i_next_unlinked); + if (error) + return error; + +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3679,6 +3679,31 @@ TRACE_EVENT(xfs_iunlink_update_dinode, + __entry->new_ptr) + ); + ++TRACE_EVENT(xfs_iunlink_reload_next, ++ TP_PROTO(struct xfs_inode *ip), ++ TP_ARGS(ip), ++ TP_STRUCT__entry( ++ __field(dev_t, dev) ++ __field(xfs_agnumber_t, agno) ++ __field(xfs_agino_t, agino) ++ __field(xfs_agino_t, prev_agino) ++ __field(xfs_agino_t, next_agino) ++ ), ++ TP_fast_assign( ++ __entry->dev = ip->i_mount->m_super->s_dev; ++ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); ++ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); ++ __entry->prev_agino = ip->i_prev_unlinked; ++ __entry->next_agino = ip->i_next_unlinked; ++ ), ++ TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->agno, ++ __entry->agino, ++ __entry->prev_agino, ++ __entry->next_agino) ++); ++ + DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), diff --git a/queue-6.1/xfs-make-inode-unlinked-bucket-recovery-work-with-quotacheck.patch b/queue-6.1/xfs-make-inode-unlinked-bucket-recovery-work-with-quotacheck.patch new file mode 100644 index 00000000000..18702d4e3e0 --- /dev/null +++ b/queue-6.1/xfs-make-inode-unlinked-bucket-recovery-work-with-quotacheck.patch @@ -0,0 +1,154 @@ +From stable+bounces-77039-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:38 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:49 -0700 +Subject: xfs: make inode unlinked bucket recovery work with quotacheck +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-25-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 49813a21ed57895b73ec4ed3b99d4beec931496f ] + +Teach quotacheck to reload the unlinked inode lists when walking the +inode table. This requires extra state handling, since it's possible +that a reloaded inode will get inactivated before quotacheck tries to +scan it; in this case, we need to ensure that the reloaded inode does +not have dquots attached when it is freed. + +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_attr_inactive.c | 1 - + fs/xfs/xfs_inode.c | 12 +++++++++--- + fs/xfs/xfs_inode.h | 5 ++++- + fs/xfs/xfs_mount.h | 10 +++++++++- + fs/xfs/xfs_qm.c | 7 +++++++ + 5 files changed, 29 insertions(+), 6 deletions(-) + +--- a/fs/xfs/xfs_attr_inactive.c ++++ b/fs/xfs/xfs_attr_inactive.c +@@ -333,7 +333,6 @@ xfs_attr_inactive( + int error = 0; + + mp = dp->i_mount; +- ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); + + xfs_ilock(dp, lock_mode); + if (!xfs_inode_has_attr_fork(dp)) +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1743,9 +1743,13 @@ xfs_inactive( + ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) + truncate = 1; + +- error = xfs_qm_dqattach(ip); +- if (error) +- goto out; ++ if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { ++ xfs_qm_dqdetach(ip); ++ } else { ++ error = xfs_qm_dqattach(ip); ++ if (error) ++ goto out; ++ } + + if (S_ISLNK(VFS_I(ip)->i_mode)) + error = xfs_inactive_symlink(ip); +@@ -1963,6 +1967,8 @@ xfs_iunlink_reload_next( + trace_xfs_iunlink_reload_next(next_ip); + rele: + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); ++ if (xfs_is_quotacheck_running(mp) && next_ip) ++ xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); + xfs_irele(next_ip); + return error; + } +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -344,6 +344,9 @@ static inline bool xfs_inode_has_large_e + */ + #define XFS_INACTIVATING (1 << 13) + ++/* Quotacheck is running but inode has not been added to quota counts. */ ++#define XFS_IQUOTAUNCHECKED (1 << 14) ++ + /* All inode state flags related to inode reclaim. */ + #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ + XFS_IRECLAIM | \ +@@ -358,7 +361,7 @@ static inline bool xfs_inode_has_large_e + #define XFS_IRECLAIM_RESET_FLAGS \ + (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ + XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ +- XFS_INACTIVATING) ++ XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) + + /* + * Flags for inode locking. +--- a/fs/xfs/xfs_mount.h ++++ b/fs/xfs/xfs_mount.h +@@ -401,6 +401,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) + #define XFS_OPSTATE_WARNED_SHRINK 8 + /* Kernel has logged a warning about logged xattr updates being used. */ + #define XFS_OPSTATE_WARNED_LARP 9 ++/* Mount time quotacheck is running */ ++#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 + + #define __XFS_IS_OPSTATE(name, NAME) \ + static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ +@@ -423,6 +425,11 @@ __XFS_IS_OPSTATE(inode32, INODE32) + __XFS_IS_OPSTATE(readonly, READONLY) + __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) + __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) ++#ifdef CONFIG_XFS_QUOTA ++__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) ++#else ++# define xfs_is_quotacheck_running(mp) (false) ++#endif + + static inline bool + xfs_should_warn(struct xfs_mount *mp, long nr) +@@ -440,7 +447,8 @@ xfs_should_warn(struct xfs_mount *mp, lo + { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ + { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ + { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ +- { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } ++ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ ++ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } + + /* + * Max and min values for mount-option defined I/O +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1160,6 +1160,10 @@ xfs_qm_dqusage_adjust( + if (error) + return error; + ++ error = xfs_inode_reload_unlinked(ip); ++ if (error) ++ goto error0; ++ + ASSERT(ip->i_delayed_blks == 0); + + if (XFS_IS_REALTIME_INODE(ip)) { +@@ -1173,6 +1177,7 @@ xfs_qm_dqusage_adjust( + } + + nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; ++ xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); + + /* + * Add the (disk blocks and inode) resources occupied by this +@@ -1319,8 +1324,10 @@ xfs_qm_quotacheck( + flags |= XFS_PQUOTA_CHKD; + } + ++ xfs_set_quotacheck_running(mp); + error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, + NULL); ++ xfs_clear_quotacheck_running(mp); + + /* + * On error, the inode walk may have partially populated the dquot diff --git a/queue-6.1/xfs-prefer-free-inodes-at-enospc-over-chunk-allocation.patch b/queue-6.1/xfs-prefer-free-inodes-at-enospc-over-chunk-allocation.patch new file mode 100644 index 00000000000..f0d1a7e4c8d --- /dev/null +++ b/queue-6.1/xfs-prefer-free-inodes-at-enospc-over-chunk-allocation.patch @@ -0,0 +1,95 @@ +From stable+bounces-77021-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:37 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:31 -0700 +Subject: xfs: prefer free inodes at ENOSPC over chunk allocation +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Allison Henderson , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-7-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit f08f984c63e9980614ae3a0a574b31eaaef284b2 ] + +When an XFS filesystem has free inodes in chunks already allocated +on disk, it will still allocate new inode chunks if the target AG +has no free inodes in it. Normally, this is a good idea as it +preserves locality of all the inodes in a given directory. + +However, at ENOSPC this can lead to using the last few remaining +free filesystem blocks to allocate a new chunk when there are many, +many free inodes that could be allocated without consuming free +space. This results in speeding up the consumption of the last few +blocks and inode create operations then returning ENOSPC when there +free inodes available because we don't have enough block left in the +filesystem for directory creation reservations to proceed. + +Hence when we are near ENOSPC, we should be attempting to preserve +the remaining blocks for directory block allocation rather than +using them for unnecessary inode chunk creation. + +This particular behaviour is exposed by xfs/294, when it drives to +ENOSPC on empty file creation whilst there are still thousands of +free inodes available for allocation in other AGs in the filesystem. + +Hence, when we are within 1% of ENOSPC, change the inode allocation +behaviour to prefer to use existing free inodes over allocating new +inode chunks, even though it results is poorer locality of the data +set. It is more important for the allocations to be space efficient +near ENOSPC than to have optimal locality for performance, so lets +modify the inode AG selection code to reflect that fact. + +This allows generic/294 to not only pass with this allocator rework +patchset, but to increase the number of post-ENOSPC empty inode +allocations to from ~600 to ~9080 before we hit ENOSPC on the +directory create transaction reservation. + +Signed-off-by: Dave Chinner +Reviewed-by: Allison Henderson +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ialloc.c | 17 +++++++++++++++++ + 1 file changed, 17 insertions(+) + +--- a/fs/xfs/libxfs/xfs_ialloc.c ++++ b/fs/xfs/libxfs/xfs_ialloc.c +@@ -1737,6 +1737,7 @@ xfs_dialloc( + struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(mp); + bool ok_alloc = true; ++ bool low_space = false; + int flags; + xfs_ino_t ino; + +@@ -1768,6 +1769,20 @@ xfs_dialloc( + } + + /* ++ * If we are near to ENOSPC, we want to prefer allocation from AGs that ++ * have free inodes in them rather than use up free space allocating new ++ * inode chunks. Hence we turn off allocation for the first non-blocking ++ * pass through the AGs if we are near ENOSPC to consume free inodes ++ * that we can immediately allocate, but then we allow allocation on the ++ * second pass if we fail to find an AG with free inodes in it. ++ */ ++ if (percpu_counter_read_positive(&mp->m_fdblocks) < ++ mp->m_low_space[XFS_LOWSP_1_PCNT]) { ++ ok_alloc = false; ++ low_space = true; ++ } ++ ++ /* + * Loop until we find an allocation group that either has free inodes + * or in which we can allocate some inodes. Iterate through the + * allocation groups upward, wrapping at the end. +@@ -1795,6 +1810,8 @@ xfs_dialloc( + break; + } + flags = 0; ++ if (low_space) ++ ok_alloc = true; + } + xfs_perag_put(pag); + } diff --git a/queue-6.1/xfs-quotacheck-failure-can-race-with-background-inode-inactivation.patch b/queue-6.1/xfs-quotacheck-failure-can-race-with-background-inode-inactivation.patch new file mode 100644 index 00000000000..8b5dc78f0b4 --- /dev/null +++ b/queue-6.1/xfs-quotacheck-failure-can-race-with-background-inode-inactivation.patch @@ -0,0 +1,128 @@ +From stable+bounces-77024-greg=kroah.com@vger.kernel.org Tue Sep 24 20:39:50 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:34 -0700 +Subject: xfs: quotacheck failure can race with background inode inactivation +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , Pengfei Xu , "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-10-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 0c7273e494dd5121e20e160cb2f047a593ee14a8 ] + +The background inode inactivation can attached dquots to inodes, but +this can race with a foreground quotacheck failure that leads to +disabling quotas and freeing the mp->m_quotainfo structure. The +background inode inactivation then tries to allocate a quota, tries +to dereference mp->m_quotainfo, and crashes like so: + +XFS (loop1): Quotacheck: Unsuccessful (Error -5): Disabling quotas. +xfs filesystem being mounted at /root/syzkaller.qCVHXV/0/file0 supports timestamps until 2038 (0x7fffffff) +BUG: kernel NULL pointer dereference, address: 00000000000002a8 +.... +CPU: 0 PID: 161 Comm: kworker/0:4 Not tainted 6.2.0-c9c3395d5e3d #1 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 +Workqueue: xfs-inodegc/loop1 xfs_inodegc_worker +RIP: 0010:xfs_dquot_alloc+0x95/0x1e0 +.... +Call Trace: + + xfs_qm_dqread+0x46/0x440 + xfs_qm_dqget_inode+0x154/0x500 + xfs_qm_dqattach_one+0x142/0x3c0 + xfs_qm_dqattach_locked+0x14a/0x170 + xfs_qm_dqattach+0x52/0x80 + xfs_inactive+0x186/0x340 + xfs_inodegc_worker+0xd3/0x430 + process_one_work+0x3b1/0x960 + worker_thread+0x52/0x660 + kthread+0x161/0x1a0 + ret_from_fork+0x29/0x50 + +.... + +Prevent this race by flushing all the queued background inode +inactivations pending before purging all the cached dquots when +quotacheck fails. + +Reported-by: Pengfei Xu +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_qm.c | 40 ++++++++++++++++++++++++++-------------- + 1 file changed, 26 insertions(+), 14 deletions(-) + +--- a/fs/xfs/xfs_qm.c ++++ b/fs/xfs/xfs_qm.c +@@ -1321,15 +1321,14 @@ xfs_qm_quotacheck( + + error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, + NULL); +- if (error) { +- /* +- * The inode walk may have partially populated the dquot +- * caches. We must purge them before disabling quota and +- * tearing down the quotainfo, or else the dquots will leak. +- */ +- xfs_qm_dqpurge_all(mp); +- goto error_return; +- } ++ ++ /* ++ * On error, the inode walk may have partially populated the dquot ++ * caches. We must purge them before disabling quota and tearing down ++ * the quotainfo, or else the dquots will leak. ++ */ ++ if (error) ++ goto error_purge; + + /* + * We've made all the changes that we need to make incore. Flush them +@@ -1363,10 +1362,8 @@ xfs_qm_quotacheck( + * and turn quotaoff. The dquots won't be attached to any of the inodes + * at this point (because we intentionally didn't in dqget_noattach). + */ +- if (error) { +- xfs_qm_dqpurge_all(mp); +- goto error_return; +- } ++ if (error) ++ goto error_purge; + + /* + * If one type of quotas is off, then it will lose its +@@ -1376,7 +1373,7 @@ xfs_qm_quotacheck( + mp->m_qflags &= ~XFS_ALL_QUOTA_CHKD; + mp->m_qflags |= flags; + +- error_return: ++error_return: + xfs_buf_delwri_cancel(&buffer_list); + + if (error) { +@@ -1395,6 +1392,21 @@ xfs_qm_quotacheck( + } else + xfs_notice(mp, "Quotacheck: Done."); + return error; ++ ++error_purge: ++ /* ++ * On error, we may have inodes queued for inactivation. This may try ++ * to attach dquots to the inode before running cleanup operations on ++ * the inode and this can race with the xfs_qm_destroy_quotainfo() call ++ * below that frees mp->m_quotainfo. To avoid this race, flush all the ++ * pending inodegc operations before we purge the dquots from memory, ++ * ensuring that background inactivation is idle whilst we turn off ++ * quotas. ++ */ ++ xfs_inodegc_flush(mp); ++ xfs_qm_dqpurge_all(mp); ++ goto error_return; ++ + } + + /* diff --git a/queue-6.1/xfs-reload-entire-unlinked-bucket-lists.patch b/queue-6.1/xfs-reload-entire-unlinked-bucket-lists.patch new file mode 100644 index 00000000000..b9de0092ca9 --- /dev/null +++ b/queue-6.1/xfs-reload-entire-unlinked-bucket-lists.patch @@ -0,0 +1,221 @@ +From stable+bounces-77038-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:31 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:48 -0700 +Subject: xfs: reload entire unlinked bucket lists +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-24-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 83771c50e42b92de6740a63e152c96c052d37736 ] + +The previous patch to reload unrecovered unlinked inodes when adding a +newly created inode to the unlinked list is missing a key piece of +functionality. It doesn't handle the case that someone calls xfs_iget +on an inode that is not the last item in the incore list. For example, +if at mount time the ondisk iunlink bucket looks like this: + +AGI -> 7 -> 22 -> 3 -> NULL + +None of these three inodes are cached in memory. Now let's say that +someone tries to open inode 3 by handle. We need to walk the list to +make sure that inodes 7 and 22 get loaded cold, and that the +i_prev_unlinked of inode 3 gets set to 22. + +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_export.c | 6 +++ + fs/xfs/xfs_inode.c | 100 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + fs/xfs/xfs_inode.h | 9 ++++ + fs/xfs/xfs_itable.c | 9 ++++ + fs/xfs/xfs_trace.h | 20 ++++++++++ + 5 files changed, 144 insertions(+) + +--- a/fs/xfs/xfs_export.c ++++ b/fs/xfs/xfs_export.c +@@ -146,6 +146,12 @@ xfs_nfs_get_inode( + return ERR_PTR(error); + } + ++ error = xfs_inode_reload_unlinked(ip); ++ if (error) { ++ xfs_irele(ip); ++ return ERR_PTR(error); ++ } ++ + if (VFS_I(ip)->i_generation != generation) { + xfs_irele(ip); + return ERR_PTR(-ESTALE); +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -3622,3 +3622,103 @@ xfs_iunlock2_io_mmap( + if (ip1 != ip2) + inode_unlock(VFS_I(ip1)); + } ++ ++/* ++ * Reload the incore inode list for this inode. Caller should ensure that ++ * the link count cannot change, either by taking ILOCK_SHARED or otherwise ++ * preventing other threads from executing. ++ */ ++int ++xfs_inode_reload_unlinked_bucket( ++ struct xfs_trans *tp, ++ struct xfs_inode *ip) ++{ ++ struct xfs_mount *mp = tp->t_mountp; ++ struct xfs_buf *agibp; ++ struct xfs_agi *agi; ++ struct xfs_perag *pag; ++ xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); ++ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); ++ xfs_agino_t prev_agino, next_agino; ++ unsigned int bucket; ++ bool foundit = false; ++ int error; ++ ++ /* Grab the first inode in the list */ ++ pag = xfs_perag_get(mp, agno); ++ error = xfs_ialloc_read_agi(pag, tp, &agibp); ++ xfs_perag_put(pag); ++ if (error) ++ return error; ++ ++ bucket = agino % XFS_AGI_UNLINKED_BUCKETS; ++ agi = agibp->b_addr; ++ ++ trace_xfs_inode_reload_unlinked_bucket(ip); ++ ++ xfs_info_ratelimited(mp, ++ "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.", ++ agino, agno); ++ ++ prev_agino = NULLAGINO; ++ next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); ++ while (next_agino != NULLAGINO) { ++ struct xfs_inode *next_ip = NULL; ++ ++ if (next_agino == agino) { ++ /* Found this inode, set its backlink. */ ++ next_ip = ip; ++ next_ip->i_prev_unlinked = prev_agino; ++ foundit = true; ++ } ++ if (!next_ip) { ++ /* Inode already in memory. */ ++ next_ip = xfs_iunlink_lookup(pag, next_agino); ++ } ++ if (!next_ip) { ++ /* Inode not in memory, reload. */ ++ error = xfs_iunlink_reload_next(tp, agibp, prev_agino, ++ next_agino); ++ if (error) ++ break; ++ ++ next_ip = xfs_iunlink_lookup(pag, next_agino); ++ } ++ if (!next_ip) { ++ /* No incore inode at all? We reloaded it... */ ++ ASSERT(next_ip != NULL); ++ error = -EFSCORRUPTED; ++ break; ++ } ++ ++ prev_agino = next_agino; ++ next_agino = next_ip->i_next_unlinked; ++ } ++ ++ xfs_trans_brelse(tp, agibp); ++ /* Should have found this inode somewhere in the iunlinked bucket. */ ++ if (!error && !foundit) ++ error = -EFSCORRUPTED; ++ return error; ++} ++ ++/* Decide if this inode is missing its unlinked list and reload it. */ ++int ++xfs_inode_reload_unlinked( ++ struct xfs_inode *ip) ++{ ++ struct xfs_trans *tp; ++ int error; ++ ++ error = xfs_trans_alloc_empty(ip->i_mount, &tp); ++ if (error) ++ return error; ++ ++ xfs_ilock(ip, XFS_ILOCK_SHARED); ++ if (xfs_inode_unlinked_incomplete(ip)) ++ error = xfs_inode_reload_unlinked_bucket(tp, ip); ++ xfs_iunlock(ip, XFS_ILOCK_SHARED); ++ xfs_trans_cancel(tp); ++ ++ return error; ++} +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -593,4 +593,13 @@ void xfs_end_io(struct work_struct *work + int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); + void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); + ++static inline bool ++xfs_inode_unlinked_incomplete( ++ struct xfs_inode *ip) ++{ ++ return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); ++} ++int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); ++int xfs_inode_reload_unlinked(struct xfs_inode *ip); ++ + #endif /* __XFS_INODE_H__ */ +--- a/fs/xfs/xfs_itable.c ++++ b/fs/xfs/xfs_itable.c +@@ -80,6 +80,15 @@ xfs_bulkstat_one_int( + if (error) + goto out; + ++ if (xfs_inode_unlinked_incomplete(ip)) { ++ error = xfs_inode_reload_unlinked_bucket(tp, ip); ++ if (error) { ++ xfs_iunlock(ip, XFS_ILOCK_SHARED); ++ xfs_irele(ip); ++ return error; ++ } ++ } ++ + ASSERT(ip != NULL); + ASSERT(ip->i_imap.im_blkno != 0); + inode = VFS_I(ip); +--- a/fs/xfs/xfs_trace.h ++++ b/fs/xfs/xfs_trace.h +@@ -3704,6 +3704,26 @@ TRACE_EVENT(xfs_iunlink_reload_next, + __entry->next_agino) + ); + ++TRACE_EVENT(xfs_inode_reload_unlinked_bucket, ++ TP_PROTO(struct xfs_inode *ip), ++ TP_ARGS(ip), ++ TP_STRUCT__entry( ++ __field(dev_t, dev) ++ __field(xfs_agnumber_t, agno) ++ __field(xfs_agino_t, agino) ++ ), ++ TP_fast_assign( ++ __entry->dev = ip->i_mount->m_super->s_dev; ++ __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); ++ __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); ++ ), ++ TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), ++ __entry->agno, ++ __entry->agino, ++ __entry->agino % XFS_AGI_UNLINKED_BUCKETS) ++); ++ + DECLARE_EVENT_CLASS(xfs_ag_inode_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), diff --git a/queue-6.1/xfs-remove-warn-when-dquot-cache-insertion-fails.patch b/queue-6.1/xfs-remove-warn-when-dquot-cache-insertion-fails.patch new file mode 100644 index 00000000000..d865fcb4302 --- /dev/null +++ b/queue-6.1/xfs-remove-warn-when-dquot-cache-insertion-fails.patch @@ -0,0 +1,35 @@ +From stable+bounces-77031-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:06 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:41 -0700 +Subject: xfs: remove WARN when dquot cache insertion fails +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, Dave Chinner , syzbot+6ae213503fb12e87934f@syzkaller.appspotmail.com, "Darrick J. Wong" , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-17-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 4b827b3f305d1fcf837265f1e12acc22ee84327c ] + +It just creates unnecessary bot noise these days. + +Reported-by: syzbot+6ae213503fb12e87934f@syzkaller.appspotmail.com +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_dquot.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/xfs/xfs_dquot.c ++++ b/fs/xfs/xfs_dquot.c +@@ -798,7 +798,6 @@ xfs_qm_dqget_cache_insert( + error = radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + /* Duplicate found! Caller must try again. */ +- WARN_ON(error != -EEXIST); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + return error; diff --git a/queue-6.1/xfs-set-bnobt-cntbt-numrecs-correctly-when-formatting-new-ags.patch b/queue-6.1/xfs-set-bnobt-cntbt-numrecs-correctly-when-formatting-new-ags.patch new file mode 100644 index 00000000000..565b1cfad7e --- /dev/null +++ b/queue-6.1/xfs-set-bnobt-cntbt-numrecs-correctly-when-formatting-new-ags.patch @@ -0,0 +1,129 @@ +From stable+bounces-77041-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:54 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:51 -0700 +Subject: xfs: set bnobt/cntbt numrecs correctly when formatting new AGs +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , Dave Chinner , Dave Chinner , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-27-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 8e698ee72c4ecbbf18264568eb310875839fd601 ] + +Through generic/300, I discovered that mkfs.xfs creates corrupt +filesystems when given these parameters: + +Filesystems formatted with --unsupported are not supported!! +meta-data=/dev/sda isize=512 agcount=8, agsize=16352 blks + = sectsz=512 attr=2, projid32bit=1 + = crc=1 finobt=1, sparse=1, rmapbt=1 + = reflink=1 bigtime=1 inobtcount=1 nrext64=1 +data = bsize=4096 blocks=130816, imaxpct=25 + = sunit=32 swidth=128 blks +naming =version 2 bsize=4096 ascii-ci=0, ftype=1 +log =internal log bsize=4096 blocks=8192, version=2 + = sectsz=512 sunit=32 blks, lazy-count=1 +realtime =none extsz=4096 blocks=0, rtextents=0 + = rgcount=0 rgsize=0 blks +Discarding blocks...Done. +Phase 1 - find and verify superblock... + - reporting progress in intervals of 15 minutes +Phase 2 - using internal log + - zero log... + - 16:30:50: zeroing log - 16320 of 16320 blocks done + - scan filesystem freespace and inode maps... +agf_freeblks 25, counted 0 in ag 4 +sb_fdblocks 8823, counted 8798 + +The root cause of this problem is the numrecs handling in +xfs_freesp_init_recs, which is used to initialize a new AG. Prior to +calling the function, we set up the new bnobt block with numrecs == 1 +and rely on _freesp_init_recs to format that new record. If the last +record created has a blockcount of zero, then it sets numrecs = 0. + +That last bit isn't correct if the AG contains the log, the start of the +log is not immediately after the initial blocks due to stripe alignment, +and the end of the log is perfectly aligned with the end of the AG. For +this case, we actually formatted a single bnobt record to handle the +free space before the start of the (stripe aligned) log, and incremented +arec to try to format a second record. That second record turned out to +be unnecessary, so what we really want is to leave numrecs at 1. + +The numrecs handling itself is overly complicated because a different +function sets numrecs == 1. Change the bnobt creation code to start +with numrecs set to zero and only increment it after successfully +formatting a free space extent into the btree block. + +Fixes: f327a00745ff ("xfs: account for log space when formatting new AGs") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ag.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ag.c ++++ b/fs/xfs/libxfs/xfs_ag.c +@@ -415,10 +415,12 @@ xfs_freesp_init_recs( + ASSERT(start >= mp->m_ag_prealloc_blocks); + if (start != mp->m_ag_prealloc_blocks) { + /* +- * Modify first record to pad stripe align of log ++ * Modify first record to pad stripe align of log and ++ * bump the record count. + */ + arec->ar_blockcount = cpu_to_be32(start - + mp->m_ag_prealloc_blocks); ++ be16_add_cpu(&block->bb_numrecs, 1); + nrec = arec + 1; + + /* +@@ -429,7 +431,6 @@ xfs_freesp_init_recs( + be32_to_cpu(arec->ar_startblock) + + be32_to_cpu(arec->ar_blockcount)); + arec = nrec; +- be16_add_cpu(&block->bb_numrecs, 1); + } + /* + * Change record start to after the internal log +@@ -438,15 +439,13 @@ xfs_freesp_init_recs( + } + + /* +- * Calculate the record block count and check for the case where +- * the log might have consumed all available space in the AG. If +- * so, reset the record count to 0 to avoid exposure of an invalid +- * record start block. ++ * Calculate the block count of this record; if it is nonzero, ++ * increment the record count. + */ + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +- if (!arec->ar_blockcount) +- block->bb_numrecs = 0; ++ if (arec->ar_blockcount) ++ be16_add_cpu(&block->bb_numrecs, 1); + } + + /* +@@ -458,7 +457,7 @@ xfs_bnoroot_init( + struct xfs_buf *bp, + struct aghdr_init_data *id) + { +- xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno); ++ xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno); + xfs_freesp_init_recs(mp, bp, id); + } + +@@ -468,7 +467,7 @@ xfs_cntroot_init( + struct xfs_buf *bp, + struct aghdr_init_data *id) + { +- xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno); ++ xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno); + xfs_freesp_init_recs(mp, bp, id); + } + diff --git a/queue-6.1/xfs-use-i_prev_unlinked-to-distinguish-inodes-that-are-not-on-the-unlinked-list.patch b/queue-6.1/xfs-use-i_prev_unlinked-to-distinguish-inodes-that-are-not-on-the-unlinked-list.patch new file mode 100644 index 00000000000..f9c81563a7b --- /dev/null +++ b/queue-6.1/xfs-use-i_prev_unlinked-to-distinguish-inodes-that-are-not-on-the-unlinked-list.patch @@ -0,0 +1,109 @@ +From stable+bounces-77037-greg=kroah.com@vger.kernel.org Tue Sep 24 20:40:23 2024 +From: Leah Rumancik +Date: Tue, 24 Sep 2024 11:38:47 -0700 +Subject: xfs: use i_prev_unlinked to distinguish inodes that are not on the unlinked list +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, cem@kernel.org, catherine.hoang@oracle.com, "Darrick J. Wong" , Leah Rumancik , Chandan Babu R +Message-ID: <20240924183851.1901667-23-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit f12b96683d6976a3a07fdf3323277c79dbe8f6ab ] + +Alter the definition of i_prev_unlinked slightly to make it more obvious +when an inode with 0 link count is not part of the iunlink bucket lists +rooted in the AGI. This distinction is necessary because it is not +sufficient to check inode.i_nlink to decide if an inode is on the +unlinked list. Updates to i_nlink can happen while holding only +ILOCK_EXCL, but updates to an inode's position in the AGI unlinked list +(which happen after the nlink update) requires both ILOCK_EXCL and the +AGI buffer lock. + +The next few patches will make it possible to reload an entire unlinked +bucket list when we're walking the inode table or performing handle +operations and need more than the ability to iget the last inode in the +chain. + +The upcoming directory repair code also needs to be able to make this +distinction to decide if a zero link count directory should be moved to +the orphanage or allowed to inactivate. An upcoming enhancement to the +online AGI fsck code will need this distinction to check and rebuild the +AGI unlinked buckets. + +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Chandan Babu R +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 2 +- + fs/xfs/xfs_inode.c | 3 ++- + fs/xfs/xfs_inode.h | 20 +++++++++++++++++++- + 3 files changed, 22 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -113,7 +113,7 @@ xfs_inode_alloc( + INIT_LIST_HEAD(&ip->i_ioend_list); + spin_lock_init(&ip->i_ioend_lock); + ip->i_next_unlinked = NULLAGINO; +- ip->i_prev_unlinked = NULLAGINO; ++ ip->i_prev_unlinked = 0; + + return ip; + } +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2015,6 +2015,7 @@ xfs_iunlink_insert_inode( + } + + /* Point the head of the list to point to this inode. */ ++ ip->i_prev_unlinked = NULLAGINO; + return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); + } + +@@ -2117,7 +2118,7 @@ xfs_iunlink_remove_inode( + } + + ip->i_next_unlinked = NULLAGINO; +- ip->i_prev_unlinked = NULLAGINO; ++ ip->i_prev_unlinked = 0; + return error; + } + +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -68,8 +68,21 @@ typedef struct xfs_inode { + uint64_t i_diflags2; /* XFS_DIFLAG2_... */ + struct timespec64 i_crtime; /* time created */ + +- /* unlinked list pointers */ ++ /* ++ * Unlinked list pointers. These point to the next and previous inodes ++ * in the AGI unlinked bucket list, respectively. These fields can ++ * only be updated with the AGI locked. ++ * ++ * i_next_unlinked caches di_next_unlinked. ++ */ + xfs_agino_t i_next_unlinked; ++ ++ /* ++ * If the inode is not on an unlinked list, this field is zero. If the ++ * inode is the first element in an unlinked list, this field is ++ * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode ++ * in the unlinked list. ++ */ + xfs_agino_t i_prev_unlinked; + + /* VFS inode */ +@@ -81,6 +94,11 @@ typedef struct xfs_inode { + struct list_head i_ioend_list; + } xfs_inode_t; + ++static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) ++{ ++ return ip->i_prev_unlinked != 0; ++} ++ + static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) + { + return ip->i_forkoff > 0;