From: Greg Kroah-Hartman Date: Fri, 21 Feb 2025 15:23:26 +0000 (+0100) Subject: 6.6-stable patches X-Git-Tag: v6.6.80~29 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=d0c33e3949c703a813e55b0e8790d2a676a89886;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: xfs-assert-a-valid-limit-in-xfs_rtfind_forw.patch xfs-call-xfs_bmap_exact_minlen_extent_alloc-from-xfs_bmap_btalloc.patch xfs-check-for-delayed-allocations-before-setting-extsize.patch xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr3_leaf_split.patch xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr_node_try_addname.patch xfs-don-t-free-cowblocks-from-under-dirty-pagecache-on-unshare.patch xfs-don-t-ifdef-around-the-exact-minlen-allocations.patch xfs-don-t-use-__gfp_retry_mayfail-in-xfs_initialize_perag.patch xfs-error-out-when-a-superblock-buffer-update-reduces-the-agcount.patch xfs-fix-a-sloppy-memory-handling-bug-in-xfs_iroot_realloc.patch xfs-fix-a-typo.patch xfs-fold-xfs_bmap_alloc_userdata-into-xfs_bmapi_allocate.patch xfs-merge-xfs_attr_leaf_try_add-into-xfs_attr_leaf_addname.patch xfs-pass-the-exact-range-to-initialize-to-xfs_initialize_perag.patch xfs-reduce-unnecessary-searches-when-searching-for-the-best-extents.patch xfs-remove-empty-declartion-in-header-file.patch xfs-return-bool-from-xfs_attr3_leaf_add.patch xfs-skip-background-cowblock-trims-on-inodes-open-for-write.patch xfs-streamline-xfs_filestream_pick_ag.patch xfs-support-lowmode-allocations-in-xfs_bmap_exact_minlen_extent_alloc.patch xfs-update-the-file-system-geometry-after-recoverying-superblock-buffers.patch xfs-update-the-pag-for-the-last-ag-at-recovery-time.patch xfs-use-try_cmpxchg-in-xlog_cil_insert_pcp_aggregate.patch xfs-validate-inumber-in-xfs_iget.patch --- diff --git a/queue-6.6/series b/queue-6.6/series index 23ce9dbdbe..414dd126c5 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -1 +1,25 @@ arm64-mte-do-not-allow-prot_mte-on-map_hugetlb-user-mappings.patch 
+xfs-assert-a-valid-limit-in-xfs_rtfind_forw.patch +xfs-validate-inumber-in-xfs_iget.patch +xfs-fix-a-sloppy-memory-handling-bug-in-xfs_iroot_realloc.patch +xfs-fix-a-typo.patch +xfs-skip-background-cowblock-trims-on-inodes-open-for-write.patch +xfs-don-t-free-cowblocks-from-under-dirty-pagecache-on-unshare.patch +xfs-merge-xfs_attr_leaf_try_add-into-xfs_attr_leaf_addname.patch +xfs-return-bool-from-xfs_attr3_leaf_add.patch +xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr3_leaf_split.patch +xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr_node_try_addname.patch +xfs-fold-xfs_bmap_alloc_userdata-into-xfs_bmapi_allocate.patch +xfs-don-t-ifdef-around-the-exact-minlen-allocations.patch +xfs-call-xfs_bmap_exact_minlen_extent_alloc-from-xfs_bmap_btalloc.patch +xfs-support-lowmode-allocations-in-xfs_bmap_exact_minlen_extent_alloc.patch +xfs-use-try_cmpxchg-in-xlog_cil_insert_pcp_aggregate.patch +xfs-remove-empty-declartion-in-header-file.patch +xfs-pass-the-exact-range-to-initialize-to-xfs_initialize_perag.patch +xfs-update-the-file-system-geometry-after-recoverying-superblock-buffers.patch +xfs-error-out-when-a-superblock-buffer-update-reduces-the-agcount.patch +xfs-don-t-use-__gfp_retry_mayfail-in-xfs_initialize_perag.patch +xfs-update-the-pag-for-the-last-ag-at-recovery-time.patch +xfs-reduce-unnecessary-searches-when-searching-for-the-best-extents.patch +xfs-streamline-xfs_filestream_pick_ag.patch +xfs-check-for-delayed-allocations-before-setting-extsize.patch diff --git a/queue-6.6/xfs-assert-a-valid-limit-in-xfs_rtfind_forw.patch b/queue-6.6/xfs-assert-a-valid-limit-in-xfs_rtfind_forw.patch new file mode 100644 index 0000000000..bee8ae8134 --- /dev/null +++ b/queue-6.6/xfs-assert-a-valid-limit-in-xfs_rtfind_forw.patch @@ -0,0 +1,36 @@ +From stable+bounces-113967-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:45 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:02 -0800 +Subject: xfs: assert a valid limit in xfs_rtfind_forw +To: 
stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-2-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 6d2db12d56a389b3e8efa236976f8dc3a8ae00f0 upstream. + +Protect against developers passing stupid limits when refactoring the +RT code once again. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_rtbitmap.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/xfs/libxfs/xfs_rtbitmap.c ++++ b/fs/xfs/libxfs/xfs_rtbitmap.c +@@ -288,6 +288,8 @@ xfs_rtfind_forw( + xfs_rtword_t wdiff; /* difference from wanted value */ + int word; /* word number in the buffer */ + ++ ASSERT(start <= limit); ++ + /* + * Compute and read in starting bitmap block for starting block. + */ diff --git a/queue-6.6/xfs-call-xfs_bmap_exact_minlen_extent_alloc-from-xfs_bmap_btalloc.patch b/queue-6.6/xfs-call-xfs_bmap_exact_minlen_extent_alloc-from-xfs_bmap_btalloc.patch new file mode 100644 index 0000000000..3b8aa6f517 --- /dev/null +++ b/queue-6.6/xfs-call-xfs_bmap_exact_minlen_extent_alloc-from-xfs_bmap_btalloc.patch @@ -0,0 +1,125 @@ +From stable+bounces-113979-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:07 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:14 -0800 +Subject: xfs: call xfs_bmap_exact_minlen_extent_alloc from xfs_bmap_btalloc +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-14-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 405ee87c6938f67e6ab62a3f8f85b3c60a093886 upstream. + +[backport: dependency of 6aac770] + +xfs_bmap_exact_minlen_extent_alloc duplicates the args setup in +xfs_bmap_btalloc. Switch to call it from xfs_bmap_btalloc after +doing the basic setup. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 61 ++++++++++------------------------------------- + 1 file changed, 13 insertions(+), 48 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -3390,28 +3390,17 @@ xfs_bmap_process_allocated_extent( + + static int + xfs_bmap_exact_minlen_extent_alloc( +- struct xfs_bmalloca *ap) ++ struct xfs_bmalloca *ap, ++ struct xfs_alloc_arg *args) + { +- struct xfs_mount *mp = ap->ip->i_mount; +- struct xfs_alloc_arg args = { .tp = ap->tp, .mp = mp }; +- xfs_fileoff_t orig_offset; +- xfs_extlen_t orig_length; +- int error; +- +- ASSERT(ap->length); +- + if (ap->minlen != 1) { +- ap->blkno = NULLFSBLOCK; +- ap->length = 0; ++ args->fsbno = NULLFSBLOCK; + return 0; + } + +- orig_offset = ap->offset; +- orig_length = ap->length; +- +- args.alloc_minlen_only = 1; +- +- xfs_bmap_compute_alignments(ap, &args); ++ args->alloc_minlen_only = 1; ++ args->minlen = args->maxlen = ap->minlen; ++ args->total = ap->total; + + /* + * Unlike the longest extent available in an AG, we don't track +@@ -3421,33 +3410,9 @@ xfs_bmap_exact_minlen_extent_alloc( + * we need not be concerned about a drop in performance in + * "debug only" code paths. 
+ */ +- ap->blkno = XFS_AGB_TO_FSB(mp, 0, 0); +- +- args.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; +- args.minlen = args.maxlen = ap->minlen; +- args.total = ap->total; ++ ap->blkno = XFS_AGB_TO_FSB(ap->ip->i_mount, 0, 0); + +- args.alignment = 1; +- args.minalignslop = 0; +- +- args.minleft = ap->minleft; +- args.wasdel = ap->wasdel; +- args.resv = XFS_AG_RESV_NONE; +- args.datatype = ap->datatype; +- +- error = xfs_alloc_vextent_first_ag(&args, ap->blkno); +- if (error) +- return error; +- +- if (args.fsbno != NULLFSBLOCK) { +- xfs_bmap_process_allocated_extent(ap, &args, orig_offset, +- orig_length); +- } else { +- ap->blkno = NULLFSBLOCK; +- ap->length = 0; +- } +- +- return 0; ++ return xfs_alloc_vextent_first_ag(args, ap->blkno); + } + + /* +@@ -3706,8 +3671,11 @@ xfs_bmap_btalloc( + /* Trim the allocation back to the maximum an AG can fit. */ + args.maxlen = min(ap->length, mp->m_ag_max_usable); + +- if ((ap->datatype & XFS_ALLOC_USERDATA) && +- xfs_inode_is_filestream(ap->ip)) ++ if (unlikely(XFS_TEST_ERROR(false, mp, ++ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) ++ error = xfs_bmap_exact_minlen_extent_alloc(ap, &args); ++ else if ((ap->datatype & XFS_ALLOC_USERDATA) && ++ xfs_inode_is_filestream(ap->ip)) + error = xfs_bmap_btalloc_filestreams(ap, &args, stripe_align); + else + error = xfs_bmap_btalloc_best_length(ap, &args, stripe_align); +@@ -4128,9 +4096,6 @@ xfs_bmapi_allocate( + if ((bma->datatype & XFS_ALLOC_USERDATA) && + XFS_IS_REALTIME_INODE(bma->ip)) + error = xfs_bmap_rtalloc(bma); +- else if (unlikely(XFS_TEST_ERROR(false, mp, +- XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) +- error = xfs_bmap_exact_minlen_extent_alloc(bma); + else + error = xfs_bmap_btalloc(bma); + if (error) diff --git a/queue-6.6/xfs-check-for-delayed-allocations-before-setting-extsize.patch b/queue-6.6/xfs-check-for-delayed-allocations-before-setting-extsize.patch new file mode 100644 index 0000000000..626d3a0525 --- /dev/null +++ 
b/queue-6.6/xfs-check-for-delayed-allocations-before-setting-extsize.patch @@ -0,0 +1,95 @@ +From stable+bounces-113989-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:36 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:25 -0800 +Subject: xfs: Check for delayed allocations before setting extsize +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-25-catherine.hoang@oracle.com> + +From: Ojaswin Mujoo + +commit 2a492ff66673c38a77d0815d67b9a8cce2ef57f8 upstream. + +Extsize should only be allowed to be set on files with no data in it. +For this, we check if the files have extents but miss to check if +delayed extents are present. This patch adds that check. + +While we are at it, also refactor this check into a helper since +it's used in some other places as well like xfs_inactive() or +xfs_ioctl_setattr_xflags() + +**Without the patch (SUCCEEDS)** + +$ xfs_io -c 'open -f testfile' -c 'pwrite 0 1024' -c 'extsize 65536' + +wrote 1024/1024 bytes at offset 0 +1 KiB, 1 ops; 0.0002 sec (4.628 MiB/sec and 4739.3365 ops/sec) + +**With the patch (FAILS as expected)** + +$ xfs_io -c 'open -f testfile' -c 'pwrite 0 1024' -c 'extsize 65536' + +wrote 1024/1024 bytes at offset 0 +1 KiB, 1 ops; 0.0002 sec (4.628 MiB/sec and 4739.3365 ops/sec) +xfs_io: FS_IOC_FSSETXATTR testfile: Invalid argument + +Fixes: e94af02a9cd7 ("[XFS] fix old xfs_setattr mis-merge from irix; mostly harmless esp if not using xfs rt") +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Reviewed-by: John Garry +Signed-off-by: Ojaswin Mujoo +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 2 +- + fs/xfs/xfs_inode.h | 5 +++++ + fs/xfs/xfs_ioctl.c | 4 ++-- + 3 files changed, 8 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1758,7 +1758,7 @@ xfs_inactive( + + if (S_ISREG(VFS_I(ip)->i_mode) && + (ip->i_disk_size != 0 || XFS_ISIZE(ip) != 0 || +- ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) ++ xfs_inode_has_filedata(ip))) + truncate = 1; + + if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { +--- a/fs/xfs/xfs_inode.h ++++ b/fs/xfs/xfs_inode.h +@@ -286,6 +286,11 @@ static inline bool xfs_is_metadata_inode + xfs_is_quota_inode(&mp->m_sb, ip->i_ino); + } + ++static inline bool xfs_inode_has_filedata(const struct xfs_inode *ip) ++{ ++ return ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0; ++} ++ + /* + * Check if an inode has any data in the COW fork. This might be often false + * even for inodes with the reflink flag when there is no pending COW operation. +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -1126,7 +1126,7 @@ xfs_ioctl_setattr_xflags( + + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { + /* Can't change realtime flag if any extents are allocated. 
*/ +- if (ip->i_df.if_nextents || ip->i_delayed_blks) ++ if (xfs_inode_has_filedata(ip)) + return -EINVAL; + + /* +@@ -1247,7 +1247,7 @@ xfs_ioctl_setattr_check_extsize( + if (!fa->fsx_valid) + return 0; + +- if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_df.if_nextents && ++ if (S_ISREG(VFS_I(ip)->i_mode) && xfs_inode_has_filedata(ip) && + XFS_FSB_TO_B(mp, ip->i_extsize) != fa->fsx_extsize) + return -EINVAL; + diff --git a/queue-6.6/xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr3_leaf_split.patch b/queue-6.6/xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr3_leaf_split.patch new file mode 100644 index 0000000000..61746bbb0b --- /dev/null +++ b/queue-6.6/xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr3_leaf_split.patch @@ -0,0 +1,84 @@ +From stable+bounces-113975-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:58 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:10 -0800 +Subject: xfs: distinguish extra split from real ENOSPC from xfs_attr3_leaf_split +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-10-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit a5f73342abe1f796140f6585e43e2aa7bc1b7975 upstream. + +xfs_attr3_leaf_split propagates the need for an extra btree split as +-ENOSPC to its only caller, but the same return value can also be +returned from xfs_da_grow_inode when it fails to find free space. + +Distinguish the two cases by returning 1 for the extra split case instead +of overloading -ENOSPC. + +This can be triggered relatively easily with the pending realtime group +support and a file system with a lot of small zones that use metadata +space on the main device. In this case every about 5-10th run of +xfs/538 runs into the following assert: + + ASSERT(oldblk->magic == XFS_ATTR_LEAF_MAGIC); + +in xfs_attr3_leaf_split caused by an allocation failure. 
Note that +the allocation failure is caused by another bug that will be fixed +subsequently, but this commit at least sorts out the error handling. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr_leaf.c | 5 ++++- + fs/xfs/libxfs/xfs_da_btree.c | 5 +++-- + 2 files changed, 7 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -1340,6 +1340,9 @@ xfs_attr3_leaf_create( + + /* + * Split the leaf node, rebalance, then add the new entry. ++ * ++ * Returns 0 if the entry was added, 1 if a further split is needed or a ++ * negative error number otherwise. + */ + int + xfs_attr3_leaf_split( +@@ -1396,7 +1399,7 @@ xfs_attr3_leaf_split( + oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); + if (!added) +- return -ENOSPC; ++ return 1; + return 0; + } + +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -522,9 +522,8 @@ xfs_da3_split( + switch (oldblk->magic) { + case XFS_ATTR_LEAF_MAGIC: + error = xfs_attr3_leaf_split(state, oldblk, newblk); +- if ((error != 0) && (error != -ENOSPC)) { ++ if (error < 0) + return error; /* GROT: attr is inconsistent */ +- } + if (!error) { + addblk = newblk; + break; +@@ -546,6 +545,8 @@ xfs_da3_split( + error = xfs_attr3_leaf_split(state, newblk, + &state->extrablk); + } ++ if (error == 1) ++ return -ENOSPC; + if (error) + return error; /* GROT: attr inconsistent */ + addblk = newblk; diff --git a/queue-6.6/xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr_node_try_addname.patch b/queue-6.6/xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr_node_try_addname.patch new file mode 100644 index 0000000000..70521c849d --- /dev/null +++ 
b/queue-6.6/xfs-distinguish-extra-split-from-real-enospc-from-xfs_attr_node_try_addname.patch @@ -0,0 +1,66 @@ +From stable+bounces-113976-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:00 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:11 -0800 +Subject: xfs: distinguish extra split from real ENOSPC from xfs_attr_node_try_addname +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-11-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit b3f4e84e2f438a119b7ca8684a25452b3e57c0f0 upstream. + +Just like xfs_attr3_leaf_split, xfs_attr_node_try_addname can return +-ENOSPC both for an actual failure to allocate a disk block, but also +to signal the caller to convert the format of the attr fork. Use magic +1 to ask for the conversion here as well. + +Note that unlike the similar issue in xfs_attr3_leaf_split, this one was +only found by code review. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr.c | 13 ++++++++----- + 1 file changed, 8 insertions(+), 5 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr.c ++++ b/fs/xfs/libxfs/xfs_attr.c +@@ -543,7 +543,7 @@ xfs_attr_node_addname( + return error; + + error = xfs_attr_node_try_addname(attr); +- if (error == -ENOSPC) { ++ if (error == 1) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; +@@ -1380,9 +1380,12 @@ error: + /* + * Add a name to a Btree-format attribute list. + * +- * This will involve walking down the Btree, and may involve splitting +- * leaf nodes and even splitting intermediate nodes up to and including +- * the root node (a special case of an intermediate node). 
++ * This will involve walking down the Btree, and may involve splitting leaf ++ * nodes and even splitting intermediate nodes up to and including the root ++ * node (a special case of an intermediate node). ++ * ++ * If the tree was still in single leaf format and needs to converted to ++ * real node format return 1 and let the caller handle that. + */ + static int + xfs_attr_node_try_addname( +@@ -1404,7 +1407,7 @@ xfs_attr_node_try_addname( + * out-of-line values so it looked like it *might* + * have been a b-tree. Let the caller deal with this. + */ +- error = -ENOSPC; ++ error = 1; + goto out; + } + diff --git a/queue-6.6/xfs-don-t-free-cowblocks-from-under-dirty-pagecache-on-unshare.patch b/queue-6.6/xfs-don-t-free-cowblocks-from-under-dirty-pagecache-on-unshare.patch new file mode 100644 index 0000000000..d2a01e7c56 --- /dev/null +++ b/queue-6.6/xfs-don-t-free-cowblocks-from-under-dirty-pagecache-on-unshare.patch @@ -0,0 +1,125 @@ +From stable+bounces-113972-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:54 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:07 -0800 +Subject: xfs: don't free cowblocks from under dirty pagecache on unshare +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-7-catherine.hoang@oracle.com> + +From: Brian Foster + +commit 4390f019ad7866c3791c3d768d2ff185d89e8ebe upstream. + +fallocate unshare mode explicitly breaks extent sharing. When a +command completes, it checks the data fork for any remaining shared +extents to determine whether the reflink inode flag and COW fork +preallocation can be removed. This logic doesn't consider in-core +pagecache and I/O state, however, which means we can unsafely remove +COW fork blocks that are still needed under certain conditions. 
+ +For example, consider the following command sequence: + +xfs_io -fc "pwrite 0 1k" -c "reflink 0 256k 1k" \ + -c "pwrite 0 32k" -c "funshare 0 1k" + +This allocates a data block at offset 0, shares it, and then +overwrites it with a larger buffered write. The overwrite triggers +COW fork preallocation, 32 blocks by default, which maps the entire +32k write to delalloc in the COW fork. All but the shared block at +offset 0 remains hole mapped in the data fork. The unshare command +redirties and flushes the folio at offset 0, removing the only +shared extent from the inode. Since the inode no longer maps shared +extents, unshare purges the COW fork before the remaining 28k may +have written back. + +This leaves dirty pagecache backed by holes, which writeback quietly +skips, thus leaving clean, non-zeroed pagecache over holes in the +file. To verify, fiemap shows holes in the first 32k of the file and +reads return different data across a remount: + +$ xfs_io -c "fiemap -v" +: + EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS + ... + 1: [8..511]: hole 504 + ... +$ xfs_io -c "pread -v 4k 8" +00001000: cd cd cd cd cd cd cd cd ........ +$ umount ; mount +$ xfs_io -c "pread -v 4k 8" +00001000: 00 00 00 00 00 00 00 00 ........ + +To avoid this problem, make unshare follow the same rules used for +background cowblock scanning and never purge the COW fork for inodes +with dirty pagecache or in-flight I/O. + +Fixes: 46afb0628b86347 ("xfs: only flush the unshared range in xfs_reflink_unshare") +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 8 +------- + fs/xfs/xfs_reflink.c | 3 +++ + fs/xfs/xfs_reflink.h | 19 +++++++++++++++++++ + 3 files changed, 23 insertions(+), 7 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1271,13 +1271,7 @@ xfs_prep_free_cowblocks( + */ + if (!sync && inode_is_open_for_write(VFS_I(ip))) + return false; +- if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || +- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || +- mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || +- atomic_read(&VFS_I(ip)->i_dio_count)) +- return false; +- +- return true; ++ return xfs_can_free_cowblocks(ip); + } + + /* +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -1600,6 +1600,9 @@ xfs_reflink_clear_inode_flag( + + ASSERT(xfs_is_reflink_inode(ip)); + ++ if (!xfs_can_free_cowblocks(ip)) ++ return 0; ++ + error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag); + if (error || needs_flag) + return error; +--- a/fs/xfs/xfs_reflink.h ++++ b/fs/xfs/xfs_reflink.h +@@ -16,6 +16,25 @@ static inline bool xfs_is_cow_inode(stru + return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip); + } + ++/* ++ * Check whether it is safe to free COW fork blocks from an inode. It is unsafe ++ * to do so when an inode has dirty cache or I/O in-flight, even if no shared ++ * extents exist in the data fork, because outstanding I/O may target blocks ++ * that were speculatively allocated to the COW fork. 
++ */ ++static inline bool ++xfs_can_free_cowblocks(struct xfs_inode *ip) ++{ ++ struct inode *inode = VFS_I(ip); ++ ++ if ((inode->i_state & I_DIRTY_PAGES) || ++ mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) || ++ mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK) || ++ atomic_read(&inode->i_dio_count)) ++ return false; ++ return true; ++} ++ + extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip, + struct xfs_bmbt_irec *irec, bool *shared); + int xfs_bmap_trim_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, diff --git a/queue-6.6/xfs-don-t-ifdef-around-the-exact-minlen-allocations.patch b/queue-6.6/xfs-don-t-ifdef-around-the-exact-minlen-allocations.patch new file mode 100644 index 0000000000..74b9049da5 --- /dev/null +++ b/queue-6.6/xfs-don-t-ifdef-around-the-exact-minlen-allocations.patch @@ -0,0 +1,107 @@ +From stable+bounces-113978-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:07 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:13 -0800 +Subject: xfs: don't ifdef around the exact minlen allocations +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-13-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit b611fddc0435738e64453bbf1dadd4b12a801858 upstream. + +Exact minlen allocations only exist as an error injection tool for debug +builds. Currently this is implemented using ifdefs, which means the code +isn't even compiled for non-XFS_DEBUG builds. Enhance the compile test +coverage by always building the code and use the compilers' dead code +elimination to remove it from the generated binary instead. + +The only downside is that the alloc_minlen_only field is unconditionally +added to struct xfs_alloc_args now, but by moving it around and packing +it tightly this doesn't actually increase the size of the structure. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_alloc.c | 7 ++----- + fs/xfs/libxfs/xfs_alloc.h | 4 +--- + fs/xfs/libxfs/xfs_bmap.c | 6 ------ + 3 files changed, 3 insertions(+), 14 deletions(-) + +--- a/fs/xfs/libxfs/xfs_alloc.c ++++ b/fs/xfs/libxfs/xfs_alloc.c +@@ -2581,7 +2581,6 @@ __xfs_free_extent_later( + return 0; + } + +-#ifdef DEBUG + /* + * Check if an AGF has a free extent record whose length is equal to + * args->minlen. +@@ -2620,7 +2619,6 @@ out: + + return error; + } +-#endif + + /* + * Decide whether to use this allocation group for this allocation. +@@ -2694,15 +2692,14 @@ xfs_alloc_fix_freelist( + if (!xfs_alloc_space_available(args, need, alloc_flags)) + goto out_agbp_relse; + +-#ifdef DEBUG +- if (args->alloc_minlen_only) { ++ if (IS_ENABLED(CONFIG_XFS_DEBUG) && args->alloc_minlen_only) { + int stat; + + error = xfs_exact_minlen_extent_available(args, agbp, &stat); + if (error || !stat) + goto out_agbp_relse; + } +-#endif ++ + /* + * Make the freelist shorter if it's too long. 
+ * +--- a/fs/xfs/libxfs/xfs_alloc.h ++++ b/fs/xfs/libxfs/xfs_alloc.h +@@ -53,11 +53,9 @@ typedef struct xfs_alloc_arg { + int datatype; /* mask defining data type treatment */ + char wasdel; /* set if allocation was prev delayed */ + char wasfromfl; /* set if allocation is from freelist */ ++ bool alloc_minlen_only; /* allocate exact minlen extent */ + struct xfs_owner_info oinfo; /* owner of blocks being allocated */ + enum xfs_ag_resv_type resv; /* block reservation to use */ +-#ifdef DEBUG +- bool alloc_minlen_only; /* allocate exact minlen extent */ +-#endif + } xfs_alloc_arg_t; + + /* +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -3388,7 +3388,6 @@ xfs_bmap_process_allocated_extent( + xfs_bmap_btalloc_accounting(ap, args); + } + +-#ifdef DEBUG + static int + xfs_bmap_exact_minlen_extent_alloc( + struct xfs_bmalloca *ap) +@@ -3450,11 +3449,6 @@ xfs_bmap_exact_minlen_extent_alloc( + + return 0; + } +-#else +- +-#define xfs_bmap_exact_minlen_extent_alloc(bma) (-EFSCORRUPTED) +- +-#endif + + /* + * If we are not low on available data blocks and we are allocating at diff --git a/queue-6.6/xfs-don-t-use-__gfp_retry_mayfail-in-xfs_initialize_perag.patch b/queue-6.6/xfs-don-t-use-__gfp_retry_mayfail-in-xfs_initialize_perag.patch new file mode 100644 index 0000000000..763bf4deb3 --- /dev/null +++ b/queue-6.6/xfs-don-t-use-__gfp_retry_mayfail-in-xfs_initialize_perag.patch @@ -0,0 +1,40 @@ +From stable+bounces-113990-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:37 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:21 -0800 +Subject: xfs: don't use __GFP_RETRY_MAYFAIL in xfs_initialize_perag +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-21-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 069cf5e32b700f94c6ac60f6171662bdfb04f325 upstream. 
+ +[backport: uses kmem_zalloc instead of kzalloc] + +__GFP_RETRY_MAYFAIL increases the likelihood of allocations to fail, +which isn't really helpful during log recovery. Remove the flag and +stick to the default GFP_KERNEL policies. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ag.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_ag.c ++++ b/fs/xfs/libxfs/xfs_ag.c +@@ -370,7 +370,7 @@ xfs_initialize_perag( + int error; + + for (index = old_agcount; index < new_agcount; index++) { +- pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); ++ pag = kmem_zalloc(sizeof(*pag), 0); + if (!pag) { + error = -ENOMEM; + goto out_unwind_new_pags; diff --git a/queue-6.6/xfs-error-out-when-a-superblock-buffer-update-reduces-the-agcount.patch b/queue-6.6/xfs-error-out-when-a-superblock-buffer-update-reduces-the-agcount.patch new file mode 100644 index 0000000000..6d081a4a26 --- /dev/null +++ b/queue-6.6/xfs-error-out-when-a-superblock-buffer-update-reduces-the-agcount.patch @@ -0,0 +1,40 @@ +From stable+bounces-113985-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:18 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:20 -0800 +Subject: xfs: error out when a superblock buffer update reduces the agcount +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-20-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit b882b0f8138ffa935834e775953f1630f89bbb62 upstream. + +XFS currently does not support reducing the agcount, so error out if +a logged sb buffer tries to shrink the agcount. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item_recover.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/xfs/xfs_buf_item_recover.c ++++ b/fs/xfs/xfs_buf_item_recover.c +@@ -713,6 +713,11 @@ xlog_recover_do_primary_sb_buffer( + */ + xfs_sb_from_disk(&mp->m_sb, dsb); + ++ if (mp->m_sb.sb_agcount < orig_agcount) { ++ xfs_alert(mp, "Shrinking AG count in log recovery not supported"); ++ return -EFSCORRUPTED; ++ } ++ + /* + * Initialize the new perags, and also update various block and inode + * allocator setting based off the number of AGs or total blocks. diff --git a/queue-6.6/xfs-fix-a-sloppy-memory-handling-bug-in-xfs_iroot_realloc.patch b/queue-6.6/xfs-fix-a-sloppy-memory-handling-bug-in-xfs_iroot_realloc.patch new file mode 100644 index 0000000000..965f5db294 --- /dev/null +++ b/queue-6.6/xfs-fix-a-sloppy-memory-handling-bug-in-xfs_iroot_realloc.patch @@ -0,0 +1,55 @@ +From stable+bounces-113969-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:47 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:04 -0800 +Subject: xfs: fix a sloppy memory handling bug in xfs_iroot_realloc +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-4-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit de55149b6639e903c4d06eb0474ab2c05060e61d upstream. + +While refactoring code, I noticed that when xfs_iroot_realloc tries to +shrink a bmbt root block, it allocates a smaller new block and then +copies "records" and pointers to the new block. However, bmbt root +blocks cannot ever be leaves, which means that it's not technically +correct to copy records. We /should/ be copying keys. + +Note that this has never resulted in actual memory corruption because +sizeof(bmbt_rec) == (sizeof(bmbt_key) + sizeof(bmbt_ptr)). However, +this will no longer be true when we start adding realtime rmap stuff, +so fix this now. + +Signed-off-by: Darrick J. 
Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_inode_fork.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/fs/xfs/libxfs/xfs_inode_fork.c ++++ b/fs/xfs/libxfs/xfs_inode_fork.c +@@ -449,15 +449,15 @@ xfs_iroot_realloc( + } + + /* +- * Only copy the records and pointers if there are any. ++ * Only copy the keys and pointers if there are any. + */ + if (new_max > 0) { + /* +- * First copy the records. ++ * First copy the keys. + */ +- op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1); +- np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1); +- memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t)); ++ op = (char *)XFS_BMBT_KEY_ADDR(mp, ifp->if_broot, 1); ++ np = (char *)XFS_BMBT_KEY_ADDR(mp, new_broot, 1); ++ memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_key_t)); + + /* + * Then copy the pointers. diff --git a/queue-6.6/xfs-fix-a-typo.patch b/queue-6.6/xfs-fix-a-typo.patch new file mode 100644 index 0000000000..25b6ead20a --- /dev/null +++ b/queue-6.6/xfs-fix-a-typo.patch @@ -0,0 +1,35 @@ +From stable+bounces-113970-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:52 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:05 -0800 +Subject: xfs: fix a typo +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-5-catherine.hoang@oracle.com> + +From: Andrew Kreimer + +commit 77bfe1b11ea0c0c4b0ce19b742cd1aa82f60e45d upstream. + +Fix a typo in comments. + +Signed-off-by: Andrew Kreimer +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -1820,7 +1820,7 @@ xlog_find_item_ops( + * from the transaction. 
However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it +- * form the cancelled buffer table. Hence they have tobe done last. ++ * form the cancelled buffer table. Hence they have to be done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the diff --git a/queue-6.6/xfs-fold-xfs_bmap_alloc_userdata-into-xfs_bmapi_allocate.patch b/queue-6.6/xfs-fold-xfs_bmap_alloc_userdata-into-xfs_bmapi_allocate.patch new file mode 100644 index 0000000000..46ed70b81b --- /dev/null +++ b/queue-6.6/xfs-fold-xfs_bmap_alloc_userdata-into-xfs_bmapi_allocate.patch @@ -0,0 +1,118 @@ +From stable+bounces-113977-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:01 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:12 -0800 +Subject: xfs: fold xfs_bmap_alloc_userdata into xfs_bmapi_allocate +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-12-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 865469cd41bce2b04bef9539cbf70676878bc8df upstream. + +[backport: dependency of 6aac770] + +Userdata and metadata allocations end up in the same allocation helpers. +Remove the separate xfs_bmap_alloc_userdata function to make this more +clear. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 73 ++++++++++++++++++----------------------------- + 1 file changed, 28 insertions(+), 45 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4078,43 +4078,6 @@ out: + } + + static int +-xfs_bmap_alloc_userdata( +- struct xfs_bmalloca *bma) +-{ +- struct xfs_mount *mp = bma->ip->i_mount; +- int whichfork = xfs_bmapi_whichfork(bma->flags); +- int error; +- +- /* +- * Set the data type being allocated. For the data fork, the first data +- * in the file is treated differently to all other allocations. For the +- * attribute fork, we only need to ensure the allocated range is not on +- * the busy list. +- */ +- bma->datatype = XFS_ALLOC_NOBUSY; +- if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { +- bma->datatype |= XFS_ALLOC_USERDATA; +- if (bma->offset == 0) +- bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; +- +- if (mp->m_dalign && bma->length >= mp->m_dalign) { +- error = xfs_bmap_isaeof(bma, whichfork); +- if (error) +- return error; +- } +- +- if (XFS_IS_REALTIME_INODE(bma->ip)) +- return xfs_bmap_rtalloc(bma); +- } +- +- if (unlikely(XFS_TEST_ERROR(false, mp, +- XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) +- return xfs_bmap_exact_minlen_extent_alloc(bma); +- +- return xfs_bmap_btalloc(bma); +-} +- +-static int + xfs_bmapi_allocate( + struct xfs_bmalloca *bma) + { +@@ -4147,15 +4110,35 @@ xfs_bmapi_allocate( + else + bma->minlen = 1; + +- if (bma->flags & XFS_BMAPI_METADATA) { +- if (unlikely(XFS_TEST_ERROR(false, mp, +- XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) +- error = xfs_bmap_exact_minlen_extent_alloc(bma); +- else +- error = xfs_bmap_btalloc(bma); +- } else { +- error = xfs_bmap_alloc_userdata(bma); ++ if (!(bma->flags & XFS_BMAPI_METADATA)) { ++ /* ++ * For the data and COW fork, the first data in the file is ++ * treated differently to all other allocations. 
For the ++ * attribute fork, we only need to ensure the allocated range ++ * is not on the busy list. ++ */ ++ bma->datatype = XFS_ALLOC_NOBUSY; ++ if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { ++ bma->datatype |= XFS_ALLOC_USERDATA; ++ if (bma->offset == 0) ++ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; ++ ++ if (mp->m_dalign && bma->length >= mp->m_dalign) { ++ error = xfs_bmap_isaeof(bma, whichfork); ++ if (error) ++ return error; ++ } ++ } + } ++ ++ if ((bma->datatype & XFS_ALLOC_USERDATA) && ++ XFS_IS_REALTIME_INODE(bma->ip)) ++ error = xfs_bmap_rtalloc(bma); ++ else if (unlikely(XFS_TEST_ERROR(false, mp, ++ XFS_ERRTAG_BMAP_ALLOC_MINLEN_EXTENT))) ++ error = xfs_bmap_exact_minlen_extent_alloc(bma); ++ else ++ error = xfs_bmap_btalloc(bma); + if (error) + return error; + if (bma->blkno == NULLFSBLOCK) diff --git a/queue-6.6/xfs-merge-xfs_attr_leaf_try_add-into-xfs_attr_leaf_addname.patch b/queue-6.6/xfs-merge-xfs_attr_leaf_try_add-into-xfs_attr_leaf_addname.patch new file mode 100644 index 0000000000..a1bd576bb2 --- /dev/null +++ b/queue-6.6/xfs-merge-xfs_attr_leaf_try_add-into-xfs_attr_leaf_addname.patch @@ -0,0 +1,260 @@ +From stable+bounces-113973-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:56 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:08 -0800 +Subject: xfs: merge xfs_attr_leaf_try_add into xfs_attr_leaf_addname +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-8-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit b1c649da15c2e4c86344c8e5af69c8afa215efec upstream. + +[backport: dependency of a5f7334 and b3f4e84] + +xfs_attr_leaf_try_add is only called by xfs_attr_leaf_addname, and +merging the two will simplify a following error handling fix. + +To facilitate this move the remote block state save/restore helpers up in +the file so that they don't need forward declarations now. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr.c | 176 +++++++++++++++++++---------------------------- + 1 file changed, 74 insertions(+), 102 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr.c ++++ b/fs/xfs/libxfs/xfs_attr.c +@@ -50,7 +50,6 @@ STATIC int xfs_attr_shortform_addname(xf + STATIC int xfs_attr_leaf_get(xfs_da_args_t *args); + STATIC int xfs_attr_leaf_removename(xfs_da_args_t *args); + STATIC int xfs_attr_leaf_hasname(struct xfs_da_args *args, struct xfs_buf **bp); +-STATIC int xfs_attr_leaf_try_add(struct xfs_da_args *args); + + /* + * Internal routines when attribute list is more than one block. +@@ -401,6 +400,33 @@ out: + return error; + } + ++/* Save the current remote block info and clear the current pointers. */ ++static void ++xfs_attr_save_rmt_blk( ++ struct xfs_da_args *args) ++{ ++ args->blkno2 = args->blkno; ++ args->index2 = args->index; ++ args->rmtblkno2 = args->rmtblkno; ++ args->rmtblkcnt2 = args->rmtblkcnt; ++ args->rmtvaluelen2 = args->rmtvaluelen; ++ args->rmtblkno = 0; ++ args->rmtblkcnt = 0; ++ args->rmtvaluelen = 0; ++} ++ ++/* Set stored info about a remote block */ ++static void ++xfs_attr_restore_rmt_blk( ++ struct xfs_da_args *args) ++{ ++ args->blkno = args->blkno2; ++ args->index = args->index2; ++ args->rmtblkno = args->rmtblkno2; ++ args->rmtblkcnt = args->rmtblkcnt2; ++ args->rmtvaluelen = args->rmtvaluelen2; ++} ++ + /* + * Handle the state change on completion of a multi-state attr operation. + * +@@ -428,49 +454,77 @@ xfs_attr_complete_op( + return XFS_DAS_DONE; + } + ++/* ++ * Try to add an attribute to an inode in leaf form. 
++ */ + static int + xfs_attr_leaf_addname( + struct xfs_attr_intent *attr) + { + struct xfs_da_args *args = attr->xattri_da_args; ++ struct xfs_buf *bp; + int error; + + ASSERT(xfs_attr_is_leaf(args->dp)); + ++ error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); ++ if (error) ++ return error; ++ + /* +- * Use the leaf buffer we may already hold locked as a result of +- * a sf-to-leaf conversion. ++ * Look up the xattr name to set the insertion point for the new xattr. + */ +- error = xfs_attr_leaf_try_add(args); +- +- if (error == -ENOSPC) { +- error = xfs_attr3_leaf_to_node(args); +- if (error) +- return error; ++ error = xfs_attr3_leaf_lookup_int(bp, args); ++ switch (error) { ++ case -ENOATTR: ++ if (args->op_flags & XFS_DA_OP_REPLACE) ++ goto out_brelse; ++ break; ++ case -EEXIST: ++ if (!(args->op_flags & XFS_DA_OP_REPLACE)) ++ goto out_brelse; + ++ trace_xfs_attr_leaf_replace(args); + /* +- * We're not in leaf format anymore, so roll the transaction and +- * retry the add to the newly allocated node block. ++ * Save the existing remote attr state so that the current ++ * values reflect the state of the new attribute we are about to ++ * add, not the attribute we just found and will remove later. + */ +- attr->xattri_dela_state = XFS_DAS_NODE_ADD; +- goto out; ++ xfs_attr_save_rmt_blk(args); ++ break; ++ case 0: ++ break; ++ default: ++ goto out_brelse; + } +- if (error) +- return error; + + /* + * We need to commit and roll if we need to allocate remote xattr blocks + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. 
+ */ +- if (args->rmtblkno) ++ error = xfs_attr3_leaf_add(bp, args); ++ if (error) { ++ if (error != -ENOSPC) ++ return error; ++ error = xfs_attr3_leaf_to_node(args); ++ if (error) ++ return error; ++ ++ attr->xattri_dela_state = XFS_DAS_NODE_ADD; ++ } else if (args->rmtblkno) { + attr->xattri_dela_state = XFS_DAS_LEAF_SET_RMT; +- else +- attr->xattri_dela_state = xfs_attr_complete_op(attr, +- XFS_DAS_LEAF_REPLACE); +-out: ++ } else { ++ attr->xattri_dela_state = ++ xfs_attr_complete_op(attr, XFS_DAS_LEAF_REPLACE); ++ } ++ + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); + return error; ++ ++out_brelse: ++ xfs_trans_brelse(args->trans, bp); ++ return error; + } + + /* +@@ -1164,88 +1218,6 @@ xfs_attr_shortform_addname( + * External routines when attribute list is one block + *========================================================================*/ + +-/* Save the current remote block info and clear the current pointers. */ +-static void +-xfs_attr_save_rmt_blk( +- struct xfs_da_args *args) +-{ +- args->blkno2 = args->blkno; +- args->index2 = args->index; +- args->rmtblkno2 = args->rmtblkno; +- args->rmtblkcnt2 = args->rmtblkcnt; +- args->rmtvaluelen2 = args->rmtvaluelen; +- args->rmtblkno = 0; +- args->rmtblkcnt = 0; +- args->rmtvaluelen = 0; +-} +- +-/* Set stored info about a remote block */ +-static void +-xfs_attr_restore_rmt_blk( +- struct xfs_da_args *args) +-{ +- args->blkno = args->blkno2; +- args->index = args->index2; +- args->rmtblkno = args->rmtblkno2; +- args->rmtblkcnt = args->rmtblkcnt2; +- args->rmtvaluelen = args->rmtvaluelen2; +-} +- +-/* +- * Tries to add an attribute to an inode in leaf form +- * +- * This function is meant to execute as part of a delayed operation and leaves +- * the transaction handling to the caller. On success the attribute is added +- * and the inode and transaction are left dirty. If there is not enough space, +- * the attr data is converted to node format and -ENOSPC is returned. 
Caller is +- * responsible for handling the dirty inode and transaction or adding the attr +- * in node format. +- */ +-STATIC int +-xfs_attr_leaf_try_add( +- struct xfs_da_args *args) +-{ +- struct xfs_buf *bp; +- int error; +- +- error = xfs_attr3_leaf_read(args->trans, args->dp, 0, &bp); +- if (error) +- return error; +- +- /* +- * Look up the xattr name to set the insertion point for the new xattr. +- */ +- error = xfs_attr3_leaf_lookup_int(bp, args); +- switch (error) { +- case -ENOATTR: +- if (args->op_flags & XFS_DA_OP_REPLACE) +- goto out_brelse; +- break; +- case -EEXIST: +- if (!(args->op_flags & XFS_DA_OP_REPLACE)) +- goto out_brelse; +- +- trace_xfs_attr_leaf_replace(args); +- /* +- * Save the existing remote attr state so that the current +- * values reflect the state of the new attribute we are about to +- * add, not the attribute we just found and will remove later. +- */ +- xfs_attr_save_rmt_blk(args); +- break; +- case 0: +- break; +- default: +- goto out_brelse; +- } +- +- return xfs_attr3_leaf_add(bp, args); +- +-out_brelse: +- xfs_trans_brelse(args->trans, bp); +- return error; +-} +- + /* + * Return EEXIST if attr is found, or ENOATTR if not + */ diff --git a/queue-6.6/xfs-pass-the-exact-range-to-initialize-to-xfs_initialize_perag.patch b/queue-6.6/xfs-pass-the-exact-range-to-initialize-to-xfs_initialize_perag.patch new file mode 100644 index 0000000000..6c6e4f39af --- /dev/null +++ b/queue-6.6/xfs-pass-the-exact-range-to-initialize-to-xfs_initialize_perag.patch @@ -0,0 +1,193 @@ +From stable+bounces-113983-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:16 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:18 -0800 +Subject: xfs: pass the exact range to initialize to xfs_initialize_perag +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-18-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 82742f8c3f1a93787a05a00aca50c2a565231f84 upstream. 
+ +[backport: dependency of 6a18765b] + +Currently only the new agcount is passed to xfs_initialize_perag, which +requires lookups of existing AGs to skip them and complicates error +handling. Also pass the previous agcount so that the range that +xfs_initialize_perag operates on is exactly defined. That way the +extra lookups can be avoided, and error handling can clean up the +exact range from the old count to the last added perag structure. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Reviewed-by: Brian Foster +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ag.c | 28 ++++++---------------------- + fs/xfs/libxfs/xfs_ag.h | 5 +++-- + fs/xfs/xfs_fsops.c | 18 ++++++++---------- + fs/xfs/xfs_log_recover.c | 5 +++-- + fs/xfs/xfs_mount.c | 4 ++-- + 5 files changed, 22 insertions(+), 38 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ag.c ++++ b/fs/xfs/libxfs/xfs_ag.c +@@ -360,27 +360,16 @@ xfs_free_unused_perag_range( + int + xfs_initialize_perag( + struct xfs_mount *mp, +- xfs_agnumber_t agcount, ++ xfs_agnumber_t old_agcount, ++ xfs_agnumber_t new_agcount, + xfs_rfsblock_t dblocks, + xfs_agnumber_t *maxagi) + { + struct xfs_perag *pag; + xfs_agnumber_t index; +- xfs_agnumber_t first_initialised = NULLAGNUMBER; + int error; + +- /* +- * Walk the current per-ag tree so we don't try to initialise AGs +- * that already exist (growfs case). Allocate and insert all the +- * AGs we don't find ready for initialisation. +- */ +- for (index = 0; index < agcount; index++) { +- pag = xfs_perag_get(mp, index); +- if (pag) { +- xfs_perag_put(pag); +- continue; +- } +- ++ for (index = old_agcount; index < new_agcount; index++) { + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) { + error = -ENOMEM; +@@ -425,21 +414,17 @@ xfs_initialize_perag( + /* Active ref owned by mount indicates AG is online. 
*/ + atomic_set(&pag->pag_active_ref, 1); + +- /* first new pag is fully initialized */ +- if (first_initialised == NULLAGNUMBER) +- first_initialised = index; +- + /* + * Pre-calculated geometry + */ +- pag->block_count = __xfs_ag_block_count(mp, index, agcount, ++ pag->block_count = __xfs_ag_block_count(mp, index, new_agcount, + dblocks); + pag->min_block = XFS_AGFL_BLOCK(mp); + __xfs_agino_range(mp, pag->block_count, &pag->agino_min, + &pag->agino_max); + } + +- index = xfs_set_inode_alloc(mp, agcount); ++ index = xfs_set_inode_alloc(mp, new_agcount); + + if (maxagi) + *maxagi = index; +@@ -455,8 +440,7 @@ out_remove_pag: + out_free_pag: + kmem_free(pag); + out_unwind_new_pags: +- /* unwind any prior newly initialized pags */ +- xfs_free_unused_perag_range(mp, first_initialised, agcount); ++ xfs_free_unused_perag_range(mp, old_agcount, index); + return error; + } + +--- a/fs/xfs/libxfs/xfs_ag.h ++++ b/fs/xfs/libxfs/xfs_ag.h +@@ -135,8 +135,9 @@ __XFS_AG_OPSTATE(agfl_needs_reset, AGFL_ + + void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart, + xfs_agnumber_t agend); +-int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount, +- xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi); ++int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t old_agcount, ++ xfs_agnumber_t agcount, xfs_rfsblock_t dcount, ++ xfs_agnumber_t *maxagi); + int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); + void xfs_free_perag(struct xfs_mount *mp); + +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -87,6 +87,7 @@ xfs_growfs_data_private( + struct xfs_mount *mp, /* mount point for filesystem */ + struct xfs_growfs_data *in) /* growfs data input struct */ + { ++ xfs_agnumber_t oagcount = mp->m_sb.sb_agcount; + struct xfs_buf *bp; + int error; + xfs_agnumber_t nagcount; +@@ -94,7 +95,6 @@ xfs_growfs_data_private( + xfs_rfsblock_t nb, nb_div, nb_mod; + int64_t delta; + bool lastag_extended = false; +- xfs_agnumber_t 
oagcount; + struct xfs_trans *tp; + struct aghdr_init_data id = {}; + struct xfs_perag *last_pag; +@@ -138,16 +138,14 @@ xfs_growfs_data_private( + if (delta == 0) + return 0; + +- oagcount = mp->m_sb.sb_agcount; +- /* allocate the new per-ag structures */ +- if (nagcount > oagcount) { +- error = xfs_initialize_perag(mp, nagcount, nb, &nagimax); +- if (error) +- return error; +- } else if (nagcount < oagcount) { +- /* TODO: shrinking the entire AGs hasn't yet completed */ ++ /* TODO: shrinking the entire AGs hasn't yet completed */ ++ if (nagcount < oagcount) + return -EINVAL; +- } ++ ++ /* allocate the new per-ag structures */ ++ error = xfs_initialize_perag(mp, oagcount, nagcount, nb, &nagimax); ++ if (error) ++ return error; + + if (delta > 0) + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3317,6 +3317,7 @@ xlog_do_recover( + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp = mp->m_sb_bp; + struct xfs_sb *sbp = &mp->m_sb; ++ xfs_agnumber_t orig_agcount = sbp->sb_agcount; + int error; + + trace_xfs_log_recover(log, head_blk, tail_blk); +@@ -3365,8 +3366,8 @@ xlog_do_recover( + /* re-initialise in-core superblock and geometry structures */ + mp->m_features |= xfs_sb_version_to_features(sbp); + xfs_reinit_percpu_counters(mp); +- error = xfs_initialize_perag(mp, sbp->sb_agcount, sbp->sb_dblocks, +- &mp->m_maxagi); ++ error = xfs_initialize_perag(mp, orig_agcount, sbp->sb_agcount, ++ sbp->sb_dblocks, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); + return error; +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -797,8 +797,8 @@ xfs_mountfs( + /* + * Allocate and initialize the per-ag data. 
+ */ +- error = xfs_initialize_perag(mp, sbp->sb_agcount, mp->m_sb.sb_dblocks, +- &mp->m_maxagi); ++ error = xfs_initialize_perag(mp, 0, sbp->sb_agcount, ++ mp->m_sb.sb_dblocks, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed per-ag init: %d", error); + goto out_free_dir; diff --git a/queue-6.6/xfs-reduce-unnecessary-searches-when-searching-for-the-best-extents.patch b/queue-6.6/xfs-reduce-unnecessary-searches-when-searching-for-the-best-extents.patch new file mode 100644 index 0000000000..60edee04bf --- /dev/null +++ b/queue-6.6/xfs-reduce-unnecessary-searches-when-searching-for-the-best-extents.patch @@ -0,0 +1,61 @@ +From stable+bounces-113987-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:28 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:23 -0800 +Subject: xfs: Reduce unnecessary searches when searching for the best extents +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-23-catherine.hoang@oracle.com> + +From: Chi Zhiling + +commit 3ef22684038aa577c10972ee9c6a2455f5fac941 upstream. + +Recently, we found that the CPU spent a lot of time in +xfs_alloc_ag_vextent_size when the filesystem has millions of fragmented +spaces. + +The reason is that we conducted much extra searching for extents that +could not yield a better result, and these searches would cost a lot of +time when there were millions of extents to search through. Even if we +get the same result length, we don't switch our choice to the new one, +so we can definitely terminate the search early. + +Since the result length cannot exceed the found length, when the found +length equals the best result length we already have, we can conclude +the search. 
+ +We did a test in that filesystem: +[root@localhost ~]# xfs_db -c freesp /dev/vdb + from to extents blocks pct + 1 1 215 215 0.01 + 2 3 994476 1988952 99.99 + +Before this patch: + 0) | xfs_alloc_ag_vextent_size [xfs]() { + 0) * 15597.94 us | } + +After this patch: + 0) | xfs_alloc_ag_vextent_size [xfs]() { + 0) 19.176 us | } + +Signed-off-by: Chi Zhiling +Reviewed-by: Dave Chinner +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_alloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_alloc.c ++++ b/fs/xfs/libxfs/xfs_alloc.c +@@ -1783,7 +1783,7 @@ restart: + error = -EFSCORRUPTED; + goto error0; + } +- if (flen < bestrlen) ++ if (flen <= bestrlen) + break; + busy = xfs_alloc_compute_aligned(args, fbno, flen, + &rbno, &rlen, &busy_gen); diff --git a/queue-6.6/xfs-remove-empty-declartion-in-header-file.patch b/queue-6.6/xfs-remove-empty-declartion-in-header-file.patch new file mode 100644 index 0000000000..38c39df838 --- /dev/null +++ b/queue-6.6/xfs-remove-empty-declartion-in-header-file.patch @@ -0,0 +1,35 @@ +From stable+bounces-113982-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:13 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:17 -0800 +Subject: xfs: Remove empty declartion in header file +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-17-catherine.hoang@oracle.com> + +From: Zhang Zekun + +commit f6225eebd76f371dab98b4d1c1a7c1e255190aef upstream. + +The definition of xfs_attr_use_log_assist() has been removed since +commit d9c61ccb3b09 ("xfs: move xfs_attr_use_log_assist out of xfs_log.c"). +So, Remove the empty declartion in header files. + +Signed-off-by: Zhang Zekun +Reviewed-by: Christoph Hellwig +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.h | 1 - + 1 file changed, 1 deletion(-) + +--- a/fs/xfs/xfs_log.h ++++ b/fs/xfs/xfs_log.h +@@ -161,6 +161,5 @@ bool xlog_force_shutdown(struct xlog * + + void xlog_use_incompat_feat(struct xlog *log); + void xlog_drop_incompat_feat(struct xlog *log); +-int xfs_attr_use_log_assist(struct xfs_mount *mp); + + #endif /* __XFS_LOG_H__ */ diff --git a/queue-6.6/xfs-return-bool-from-xfs_attr3_leaf_add.patch b/queue-6.6/xfs-return-bool-from-xfs_attr3_leaf_add.patch new file mode 100644 index 0000000000..7be996d13b --- /dev/null +++ b/queue-6.6/xfs-return-bool-from-xfs_attr3_leaf_add.patch @@ -0,0 +1,222 @@ +From stable+bounces-113974-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:56 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:09 -0800 +Subject: xfs: return bool from xfs_attr3_leaf_add +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-9-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 346c1d46d4c631c0c88592d371f585214d714da4 upstream. + +[backport: dependency of a5f7334 and b3f4e84] + +xfs_attr3_leaf_add only has two potential return values, indicating if the +entry could be added or not. Replace the errno return with a bool so that +ENOSPC from it can't easily be confused with a real ENOSPC. + +Remove the return value from the xfs_attr3_leaf_add_work helper entirely, +as it always return 0. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr.c | 13 +++++-------- + fs/xfs/libxfs/xfs_attr_leaf.c | 37 +++++++++++++++++++------------------ + fs/xfs/libxfs/xfs_attr_leaf.h | 2 +- + 3 files changed, 25 insertions(+), 27 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr.c ++++ b/fs/xfs/libxfs/xfs_attr.c +@@ -503,10 +503,7 @@ xfs_attr_leaf_addname( + * or perform more xattr manipulations. Otherwise there is nothing more + * to do and we can return success. + */ +- error = xfs_attr3_leaf_add(bp, args); +- if (error) { +- if (error != -ENOSPC) +- return error; ++ if (!xfs_attr3_leaf_add(bp, args)) { + error = xfs_attr3_leaf_to_node(args); + if (error) + return error; +@@ -520,7 +517,7 @@ xfs_attr_leaf_addname( + } + + trace_xfs_attr_leaf_addname_return(attr->xattri_dela_state, args->dp); +- return error; ++ return 0; + + out_brelse: + xfs_trans_brelse(args->trans, bp); +@@ -1393,21 +1390,21 @@ xfs_attr_node_try_addname( + { + struct xfs_da_state *state = attr->xattri_da_state; + struct xfs_da_state_blk *blk; +- int error; ++ int error = 0; + + trace_xfs_attr_node_addname(state->args); + + blk = &state->path.blk[state->path.active-1]; + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); + +- error = xfs_attr3_leaf_add(blk->bp, state->args); +- if (error == -ENOSPC) { ++ if (!xfs_attr3_leaf_add(blk->bp, state->args)) { + if (state->path.active == 1) { + /* + * Its really a single leaf node, but it had + * out-of-line values so it looked like it *might* + * have been a b-tree. Let the caller deal with this. 
+ */ ++ error = -ENOSPC; + goto out; + } + +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -46,7 +46,7 @@ + */ + STATIC int xfs_attr3_leaf_create(struct xfs_da_args *args, + xfs_dablk_t which_block, struct xfs_buf **bpp); +-STATIC int xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, ++STATIC void xfs_attr3_leaf_add_work(struct xfs_buf *leaf_buffer, + struct xfs_attr3_icleaf_hdr *ichdr, + struct xfs_da_args *args, int freemap_index); + STATIC void xfs_attr3_leaf_compact(struct xfs_da_args *args, +@@ -990,10 +990,8 @@ xfs_attr_shortform_to_leaf( + } + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == -ENOATTR); +- error = xfs_attr3_leaf_add(bp, &nargs); +- ASSERT(error != -ENOSPC); +- if (error) +- goto out; ++ if (!xfs_attr3_leaf_add(bp, &nargs)) ++ ASSERT(0); + sfe = xfs_attr_sf_nextentry(sfe); + } + error = 0; +@@ -1349,8 +1347,9 @@ xfs_attr3_leaf_split( + struct xfs_da_state_blk *oldblk, + struct xfs_da_state_blk *newblk) + { +- xfs_dablk_t blkno; +- int error; ++ bool added; ++ xfs_dablk_t blkno; ++ int error; + + trace_xfs_attr_leaf_split(state->args); + +@@ -1385,10 +1384,10 @@ xfs_attr3_leaf_split( + */ + if (state->inleaf) { + trace_xfs_attr_leaf_add_old(state->args); +- error = xfs_attr3_leaf_add(oldblk->bp, state->args); ++ added = xfs_attr3_leaf_add(oldblk->bp, state->args); + } else { + trace_xfs_attr_leaf_add_new(state->args); +- error = xfs_attr3_leaf_add(newblk->bp, state->args); ++ added = xfs_attr3_leaf_add(newblk->bp, state->args); + } + + /* +@@ -1396,13 +1395,15 @@ xfs_attr3_leaf_split( + */ + oldblk->hashval = xfs_attr_leaf_lasthash(oldblk->bp, NULL); + newblk->hashval = xfs_attr_leaf_lasthash(newblk->bp, NULL); +- return error; ++ if (!added) ++ return -ENOSPC; ++ return 0; + } + + /* + * Add a name to the leaf attribute list structure. 
+ */ +-int ++bool + xfs_attr3_leaf_add( + struct xfs_buf *bp, + struct xfs_da_args *args) +@@ -1411,6 +1412,7 @@ xfs_attr3_leaf_add( + struct xfs_attr3_icleaf_hdr ichdr; + int tablesize; + int entsize; ++ bool added = true; + int sum; + int tmp; + int i; +@@ -1439,7 +1441,7 @@ xfs_attr3_leaf_add( + if (ichdr.freemap[i].base < ichdr.firstused) + tmp += sizeof(xfs_attr_leaf_entry_t); + if (ichdr.freemap[i].size >= tmp) { +- tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, i); ++ xfs_attr3_leaf_add_work(bp, &ichdr, args, i); + goto out_log_hdr; + } + sum += ichdr.freemap[i].size; +@@ -1451,7 +1453,7 @@ xfs_attr3_leaf_add( + * no good and we should just give up. + */ + if (!ichdr.holes && sum < entsize) +- return -ENOSPC; ++ return false; + + /* + * Compact the entries to coalesce free space. +@@ -1464,24 +1466,24 @@ xfs_attr3_leaf_add( + * free region, in freemap[0]. If it is not big enough, give up. + */ + if (ichdr.freemap[0].size < (entsize + sizeof(xfs_attr_leaf_entry_t))) { +- tmp = -ENOSPC; ++ added = false; + goto out_log_hdr; + } + +- tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); ++ xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); + + out_log_hdr: + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); + xfs_trans_log_buf(args->trans, bp, + XFS_DA_LOGRANGE(leaf, &leaf->hdr, + xfs_attr3_leaf_hdr_size(leaf))); +- return tmp; ++ return added; + } + + /* + * Add a name to a leaf attribute list structure. 
+ */ +-STATIC int ++STATIC void + xfs_attr3_leaf_add_work( + struct xfs_buf *bp, + struct xfs_attr3_icleaf_hdr *ichdr, +@@ -1599,7 +1601,6 @@ xfs_attr3_leaf_add_work( + } + } + ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index); +- return 0; + } + + /* +--- a/fs/xfs/libxfs/xfs_attr_leaf.h ++++ b/fs/xfs/libxfs/xfs_attr_leaf.h +@@ -78,7 +78,7 @@ int xfs_attr3_leaf_split(struct xfs_da_s + int xfs_attr3_leaf_lookup_int(struct xfs_buf *leaf, + struct xfs_da_args *args); + int xfs_attr3_leaf_getvalue(struct xfs_buf *bp, struct xfs_da_args *args); +-int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, ++bool xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, + struct xfs_da_args *args); + int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, + struct xfs_da_args *args); diff --git a/queue-6.6/xfs-skip-background-cowblock-trims-on-inodes-open-for-write.patch b/queue-6.6/xfs-skip-background-cowblock-trims-on-inodes-open-for-write.patch new file mode 100644 index 0000000000..d567d6c3ee --- /dev/null +++ b/queue-6.6/xfs-skip-background-cowblock-trims-on-inodes-open-for-write.patch @@ -0,0 +1,129 @@ +From stable+bounces-113971-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:53 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:06 -0800 +Subject: xfs: skip background cowblock trims on inodes open for write +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-6-catherine.hoang@oracle.com> + +From: Brian Foster + +commit 90a71daaf73f5d39bb0cbb3c7ab6af942fe6233e upstream. + +The background blockgc scanner runs on a 5m interval by default and +trims preallocation (post-eof and cow fork) from inodes that are +otherwise idle. Idle effectively means that iolock can be acquired +without blocking and that the inode has no dirty pagecache or I/O in +flight. + +This simple mechanism and heuristic has worked fairly well for +post-eof speculative preallocations. 
Support for reflink and COW +fork preallocations came sometime later and plugged into the same +mechanism, with similar heuristics. Some recent testing has shown +that COW fork preallocation may be notably more sensitive to blockgc +processing than post-eof preallocation, however. + +For example, consider an 8GB reflinked file with a COW extent size +hint of 1MB. A worst case fully randomized overwrite of this file +results in ~8k extents of an average size of ~1MB. If the same +workload is interrupted a couple times for blockgc processing +(assuming the file goes idle), the resulting extent count explodes +to over 100k extents with an average size <100kB. This is +significantly worse than ideal and essentially defeats the COW +extent size hint mechanism. + +While this particular test is instrumented, it reflects a fairly +reasonable pattern in practice where random I/Os might spread out +over a large period of time with varying periods of (in)activity. +For example, consider a cloned disk image file for a VM or container +with long uptime and variable and bursty usage. A background blockgc +scan that races and processes the image file when it happens to be +clean and idle can have a significant effect on the future +fragmentation level of the file, even when still in use. + +To help combat this, update the heuristic to skip cowblocks inodes +that are currently opened for write access during non-sync blockgc +scans. This allows COW fork preallocations to persist for as long as +possible unless otherwise needed for functional purposes (i.e. a +sync scan), the file is idle and closed, or the inode is being +evicted from cache. While here, update the comments to help +distinguish performance oriented heuristics from the logic that +exists to maintain functional correctness. + +Suggested-by: Darrick Wong +Signed-off-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 31 +++++++++++++++++++++++-------- + 1 file changed, 23 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1234,14 +1234,17 @@ xfs_inode_clear_eofblocks_tag( + } + + /* +- * Set ourselves up to free CoW blocks from this file. If it's already clean +- * then we can bail out quickly, but otherwise we must back off if the file +- * is undergoing some kind of write. ++ * Prepare to free COW fork blocks from an inode. + */ + static bool + xfs_prep_free_cowblocks( +- struct xfs_inode *ip) ++ struct xfs_inode *ip, ++ struct xfs_icwalk *icw) + { ++ bool sync; ++ ++ sync = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC); ++ + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. +@@ -1253,9 +1256,21 @@ xfs_prep_free_cowblocks( + } + + /* +- * If the mapping is dirty or under writeback we cannot touch the +- * CoW fork. Leave it alone if we're in the midst of a directio. ++ * A cowblocks trim of an inode can have a significant effect on ++ * fragmentation even when a reasonable COW extent size hint is set. ++ * Therefore, we prefer to not process cowblocks unless they are clean ++ * and idle. We can never process a cowblocks inode that is dirty or has ++ * in-flight I/O under any circumstances, because outstanding writeback ++ * or dio expects targeted COW fork blocks exist through write ++ * completion where they can be remapped into the data fork. ++ * ++ * Therefore, the heuristic used here is to never process inodes ++ * currently opened for write from background (i.e. non-sync) scans. For ++ * sync scans, use the pagecache/dio state of the inode to ensure we ++ * never free COW fork blocks out from under pending I/O. 
+ */ ++ if (!sync && inode_is_open_for_write(VFS_I(ip))) ++ return false; + if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || +@@ -1291,7 +1306,7 @@ xfs_inode_free_cowblocks( + if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS)) + return 0; + +- if (!xfs_prep_free_cowblocks(ip)) ++ if (!xfs_prep_free_cowblocks(ip, icw)) + return 0; + + if (!xfs_icwalk_match(ip, icw)) +@@ -1320,7 +1335,7 @@ xfs_inode_free_cowblocks( + * Check again, nobody else should be able to dirty blocks or change + * the reflink iflag now that we have the first two locks held. + */ +- if (xfs_prep_free_cowblocks(ip)) ++ if (xfs_prep_free_cowblocks(ip, icw)) + ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); + return ret; + } diff --git a/queue-6.6/xfs-streamline-xfs_filestream_pick_ag.patch b/queue-6.6/xfs-streamline-xfs_filestream_pick_ag.patch new file mode 100644 index 0000000000..b64ccb9df6 --- /dev/null +++ b/queue-6.6/xfs-streamline-xfs_filestream_pick_ag.patch @@ -0,0 +1,164 @@ +From stable+bounces-113988-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:29 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:24 -0800 +Subject: xfs: streamline xfs_filestream_pick_ag +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-24-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 81a1e1c32ef474c20ccb9f730afe1ac25b1c62a4 upstream. + +Directly return the error from xfs_bmap_longest_free_extent instead +of breaking from the loop and handling it there, and use a done +label to directly jump to the exit when we found a suitable perag +structure to reduce the indentation level and pag/max_pag check +complexity in the tail of the function. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J.
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_filestream.c | 96 +++++++++++++++++++++++------------------------- + 1 file changed, 46 insertions(+), 50 deletions(-) + +--- a/fs/xfs/xfs_filestream.c ++++ b/fs/xfs/xfs_filestream.c +@@ -67,22 +67,28 @@ xfs_filestream_pick_ag( + xfs_extlen_t minfree, maxfree = 0; + xfs_agnumber_t agno; + bool first_pass = true; +- int err; + + /* 2% of an AG's blocks must be free for it to be chosen. */ + minfree = mp->m_sb.sb_agblocks / 50; + + restart: + for_each_perag_wrap(mp, start_agno, agno, pag) { ++ int err; ++ + trace_xfs_filestream_scan(pag, pino); ++ + *longest = 0; + err = xfs_bmap_longest_free_extent(pag, NULL, longest); + if (err) { +- if (err != -EAGAIN) +- break; +- /* Couldn't lock the AGF, skip this AG. */ +- err = 0; +- continue; ++ if (err == -EAGAIN) { ++ /* Couldn't lock the AGF, skip this AG. */ ++ err = 0; ++ continue; ++ } ++ xfs_perag_rele(pag); ++ if (max_pag) ++ xfs_perag_rele(max_pag); ++ return err; + } + + /* Keep track of the AG with the most free blocks. */ +@@ -107,7 +113,9 @@ restart: + !(flags & XFS_PICK_USERDATA) || + (flags & XFS_PICK_LOWSPACE))) { + /* Break out, retaining the reference on the AG. */ +- break; ++ if (max_pag) ++ xfs_perag_rele(max_pag); ++ goto done; + } + } + +@@ -115,56 +123,44 @@ restart: + atomic_dec(&pag->pagf_fstrms); + } + +- if (err) { +- xfs_perag_rele(pag); +- if (max_pag) +- xfs_perag_rele(max_pag); +- return err; ++ /* ++ * Allow a second pass to give xfs_bmap_longest_free_extent() another ++ * attempt at locking AGFs that it might have skipped over before we ++ * fail. ++ */ ++ if (first_pass) { ++ first_pass = false; ++ goto restart; + } + +- if (!pag) { +- /* +- * Allow a second pass to give xfs_bmap_longest_free_extent() +- * another attempt at locking AGFs that it might have skipped +- * over before we fail. 
+- */ +- if (first_pass) { +- first_pass = false; +- goto restart; +- } +- +- /* +- * We must be low on data space, so run a final lowspace +- * optimised selection pass if we haven't already. +- */ +- if (!(flags & XFS_PICK_LOWSPACE)) { +- flags |= XFS_PICK_LOWSPACE; +- goto restart; +- } +- +- /* +- * No unassociated AGs are available, so select the AG with the +- * most free space, regardless of whether it's already in use by +- * another filestream. It none suit, just use whatever AG we can +- * grab. +- */ +- if (!max_pag) { +- for_each_perag_wrap(args->mp, 0, start_agno, pag) { +- max_pag = pag; +- break; +- } ++ /* ++ * We must be low on data space, so run a final lowspace optimised ++ * selection pass if we haven't already. ++ */ ++ if (!(flags & XFS_PICK_LOWSPACE)) { ++ flags |= XFS_PICK_LOWSPACE; ++ goto restart; ++ } + +- /* Bail if there are no AGs at all to select from. */ +- if (!max_pag) +- return -ENOSPC; ++ /* ++ * No unassociated AGs are available, so select the AG with the most ++ * free space, regardless of whether it's already in use by another ++ * filestream. It none suit, just use whatever AG we can grab. ++ */ ++ if (!max_pag) { ++ for_each_perag_wrap(args->mp, 0, start_agno, pag) { ++ max_pag = pag; ++ break; + } + +- pag = max_pag; +- atomic_inc(&pag->pagf_fstrms); +- } else if (max_pag) { +- xfs_perag_rele(max_pag); ++ /* Bail if there are no AGs at all to select from. 
*/ ++ if (!max_pag) ++ return -ENOSPC; + } + ++ pag = max_pag; ++ atomic_inc(&pag->pagf_fstrms); ++done: + trace_xfs_filestream_pick(pag, pino); + args->pag = pag; + return 0; diff --git a/queue-6.6/xfs-support-lowmode-allocations-in-xfs_bmap_exact_minlen_extent_alloc.patch b/queue-6.6/xfs-support-lowmode-allocations-in-xfs_bmap_exact_minlen_extent_alloc.patch new file mode 100644 index 0000000000..5488296e35 --- /dev/null +++ b/queue-6.6/xfs-support-lowmode-allocations-in-xfs_bmap_exact_minlen_extent_alloc.patch @@ -0,0 +1,46 @@ +From stable+bounces-113980-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:12 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:15 -0800 +Subject: xfs: support lowmode allocations in xfs_bmap_exact_minlen_extent_alloc +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-15-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 6aac77059881e4419df499392c995bf02fb9630b upstream. + +Currently the debug-only xfs_bmap_exact_minlen_extent_alloc allocation +variant fails to drop into the lowmode last resort allocator, and +thus can sometimes fail allocations for which the caller has a +transaction block reservation. + +Fix this by using xfs_bmap_btalloc_low_space to do the actual allocation. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -3412,7 +3412,13 @@ xfs_bmap_exact_minlen_extent_alloc( + */ + ap->blkno = XFS_AGB_TO_FSB(ap->ip->i_mount, 0, 0); + +- return xfs_alloc_vextent_first_ag(args, ap->blkno); ++ /* ++ * Call xfs_bmap_btalloc_low_space here as it first does a "normal" AG ++ * iteration and then drops args->total to args->minlen, which might be ++ * required to find an allocation for the transaction reservation when ++ * the file system is very full. ++ */ ++ return xfs_bmap_btalloc_low_space(ap, args); + } + + /* diff --git a/queue-6.6/xfs-update-the-file-system-geometry-after-recoverying-superblock-buffers.patch b/queue-6.6/xfs-update-the-file-system-geometry-after-recoverying-superblock-buffers.patch new file mode 100644 index 0000000000..fc22ce86a4 --- /dev/null +++ b/queue-6.6/xfs-update-the-file-system-geometry-after-recoverying-superblock-buffers.patch @@ -0,0 +1,133 @@ +From stable+bounces-113984-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:18 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:19 -0800 +Subject: xfs: update the file system geometry after recoverying superblock buffers +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-19-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 6a18765b54e2e52aebcdb84c3b4f4d1f7cb2c0ca upstream. + +Primary superblock buffers that change the file system geometry after a +growfs operation can affect the operation of later CIL checkpoints that +make use of the newly added space and allocation groups. + +Apply the changes to the in-memory structures as part of recovery pass 2, +to ensure recovery works fine for such cases. + +In the future we should apply the logic to other updates such as features +bits as well. 
+ +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Reviewed-by: Darrick J. Wong +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item_recover.c | 52 ++++++++++++++++++++++++++++++++++++++++++ + fs/xfs/xfs_log_recover.c | 8 ------ + 2 files changed, 52 insertions(+), 8 deletions(-) + +--- a/fs/xfs/xfs_buf_item_recover.c ++++ b/fs/xfs/xfs_buf_item_recover.c +@@ -22,6 +22,9 @@ + #include "xfs_inode.h" + #include "xfs_dir2.h" + #include "xfs_quota.h" ++#include "xfs_alloc.h" ++#include "xfs_ag.h" ++#include "xfs_sb.h" + + /* + * This is the number of entries in the l_buf_cancel_table used during +@@ -685,6 +688,49 @@ xlog_recover_do_inode_buffer( + } + + /* ++ * Update the in-memory superblock and perag structures from the primary SB ++ * buffer. ++ * ++ * This is required because transactions running after growfs may require the ++ * updated values to be set in a previous fully commit transaction. ++ */ ++static int ++xlog_recover_do_primary_sb_buffer( ++ struct xfs_mount *mp, ++ struct xlog_recover_item *item, ++ struct xfs_buf *bp, ++ struct xfs_buf_log_format *buf_f, ++ xfs_lsn_t current_lsn) ++{ ++ struct xfs_dsb *dsb = bp->b_addr; ++ xfs_agnumber_t orig_agcount = mp->m_sb.sb_agcount; ++ int error; ++ ++ xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); ++ ++ /* ++ * Update the in-core super block from the freshly recovered on-disk one. ++ */ ++ xfs_sb_from_disk(&mp->m_sb, dsb); ++ ++ /* ++ * Initialize the new perags, and also update various block and inode ++ * allocator setting based off the number of AGs or total blocks. ++ * Because of the latter this also needs to happen if the agcount did ++ * not change. 
++ */ ++ error = xfs_initialize_perag(mp, orig_agcount, ++ mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks, ++ &mp->m_maxagi); ++ if (error) { ++ xfs_warn(mp, "Failed recovery per-ag init: %d", error); ++ return error; ++ } ++ mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); ++ return 0; ++} ++ ++/* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents +@@ -967,6 +1013,12 @@ xlog_recover_buf_commit_pass2( + dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); + if (!dirty) + goto out_release; ++ } else if ((xfs_blft_from_flags(buf_f) & XFS_BLFT_SB_BUF) && ++ xfs_buf_daddr(bp) == 0) { ++ error = xlog_recover_do_primary_sb_buffer(mp, item, bp, buf_f, ++ current_lsn); ++ if (error) ++ goto out_release; + } else { + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + } +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3317,7 +3317,6 @@ xlog_do_recover( + struct xfs_mount *mp = log->l_mp; + struct xfs_buf *bp = mp->m_sb_bp; + struct xfs_sb *sbp = &mp->m_sb; +- xfs_agnumber_t orig_agcount = sbp->sb_agcount; + int error; + + trace_xfs_log_recover(log, head_blk, tail_blk); +@@ -3366,13 +3365,6 @@ xlog_do_recover( + /* re-initialise in-core superblock and geometry structures */ + mp->m_features |= xfs_sb_version_to_features(sbp); + xfs_reinit_percpu_counters(mp); +- error = xfs_initialize_perag(mp, orig_agcount, sbp->sb_agcount, +- sbp->sb_dblocks, &mp->m_maxagi); +- if (error) { +- xfs_warn(mp, "Failed post-recovery per-ag init: %d", error); +- return error; +- } +- mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + + /* Normal transactions can now occur */ + clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate); diff --git a/queue-6.6/xfs-update-the-pag-for-the-last-ag-at-recovery-time.patch b/queue-6.6/xfs-update-the-pag-for-the-last-ag-at-recovery-time.patch 
new file mode 100644 index 0000000000..187571e480 --- /dev/null +++ b/queue-6.6/xfs-update-the-pag-for-the-last-ag-at-recovery-time.patch @@ -0,0 +1,107 @@ +From stable+bounces-113986-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:27 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:22 -0800 +Subject: xfs: update the pag for the last AG at recovery time +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-22-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 4a201dcfa1ff0dcfe4348c40f3ad8bd68b97eb6c upstream. + +Currently log recovery never updates the in-core perag values for the +last allocation group when they were grown by growfs. This leads to +btree record validation failures for the alloc, ialloc or finotbt +trees if a transaction references this new space. + +Found by Brian's new growfs recovery stress test. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Brian Foster +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ag.c | 17 +++++++++++++++++ + fs/xfs/libxfs/xfs_ag.h | 1 + + fs/xfs/xfs_buf_item_recover.c | 19 ++++++++++++++++--- + 3 files changed, 34 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_ag.c ++++ b/fs/xfs/libxfs/xfs_ag.c +@@ -358,6 +358,23 @@ xfs_free_unused_perag_range( + } + + int ++xfs_update_last_ag_size( ++ struct xfs_mount *mp, ++ xfs_agnumber_t prev_agcount) ++{ ++ struct xfs_perag *pag = xfs_perag_grab(mp, prev_agcount - 1); ++ ++ if (!pag) ++ return -EFSCORRUPTED; ++ pag->block_count = __xfs_ag_block_count(mp, prev_agcount - 1, ++ mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks); ++ __xfs_agino_range(mp, pag->block_count, &pag->agino_min, ++ &pag->agino_max); ++ xfs_perag_rele(pag); ++ return 0; ++} ++ ++int + xfs_initialize_perag( + struct xfs_mount *mp, + xfs_agnumber_t old_agcount, +--- a/fs/xfs/libxfs/xfs_ag.h ++++ b/fs/xfs/libxfs/xfs_ag.h +@@ -140,6 +140,7 @@ int xfs_initialize_perag(struct xfs_moun + xfs_agnumber_t *maxagi); + int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno); + void xfs_free_perag(struct xfs_mount *mp); ++int xfs_update_last_ag_size(struct xfs_mount *mp, xfs_agnumber_t prev_agcount); + + /* Passive AG references */ + struct xfs_perag *xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno); +--- a/fs/xfs/xfs_buf_item_recover.c ++++ b/fs/xfs/xfs_buf_item_recover.c +@@ -708,6 +708,11 @@ xlog_recover_do_primary_sb_buffer( + + xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn); + ++ if (orig_agcount == 0) { ++ xfs_alert(mp, "Trying to grow file system without AGs"); ++ return -EFSCORRUPTED; ++ } ++ + /* + * Update the in-core super block from the freshly recovered on-disk one. + */ +@@ -719,14 +724,22 @@ xlog_recover_do_primary_sb_buffer( + } + + /* ++ * Growfs can also grow the last existing AG. In this case we also need ++ * to update the length in the in-core perag structure and values ++ * depending on it. 
++ */ ++ error = xfs_update_last_ag_size(mp, orig_agcount); ++ if (error) ++ return error; ++ ++ /* + * Initialize the new perags, and also update various block and inode + * allocator setting based off the number of AGs or total blocks. + * Because of the latter this also needs to happen if the agcount did + * not change. + */ +- error = xfs_initialize_perag(mp, orig_agcount, +- mp->m_sb.sb_agcount, mp->m_sb.sb_dblocks, +- &mp->m_maxagi); ++ error = xfs_initialize_perag(mp, orig_agcount, mp->m_sb.sb_agcount, ++ mp->m_sb.sb_dblocks, &mp->m_maxagi); + if (error) { + xfs_warn(mp, "Failed recovery per-ag init: %d", error); + return error; diff --git a/queue-6.6/xfs-use-try_cmpxchg-in-xlog_cil_insert_pcp_aggregate.patch b/queue-6.6/xfs-use-try_cmpxchg-in-xlog_cil_insert_pcp_aggregate.patch new file mode 100644 index 0000000000..4bbf5c8212 --- /dev/null +++ b/queue-6.6/xfs-use-try_cmpxchg-in-xlog_cil_insert_pcp_aggregate.patch @@ -0,0 +1,65 @@ +From stable+bounces-113981-greg=kroah.com@vger.kernel.org Wed Feb 5 22:41:10 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:16 -0800 +Subject: xfs: Use try_cmpxchg() in xlog_cil_insert_pcp_aggregate() +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-16-catherine.hoang@oracle.com> + +From: Uros Bizjak + +commit 20195d011c840b01fa91a85ebcd099ca95fbf8fc upstream. + +Use !try_cmpxchg instead of cmpxchg (*ptr, old, new) != old in +xlog_cil_insert_pcp_aggregate(). x86 CMPXCHG instruction returns +success in ZF flag, so this change saves a compare after cmpxchg. + +Also, try_cmpxchg implicitly assigns old *ptr value to "old" when +cmpxchg fails. There is no need to re-read the value in the loop. + +Note that the value from *ptr should be read using READ_ONCE to +prevent the compiler from merging, refetching or reordering the read. + +No functional change intended. + +Signed-off-by: Uros Bizjak +Reviewed-by: Christoph Hellwig +Cc: Chandan Babu R +Cc: Darrick J. 
Wong +Reviewed-by: Dave Chinner +Signed-off-by: Carlos Maiolino +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_cil.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +--- a/fs/xfs/xfs_log_cil.c ++++ b/fs/xfs/xfs_log_cil.c +@@ -156,7 +156,6 @@ xlog_cil_insert_pcp_aggregate( + struct xfs_cil *cil, + struct xfs_cil_ctx *ctx) + { +- struct xlog_cil_pcp *cilpcp; + int cpu; + int count = 0; + +@@ -171,13 +170,11 @@ xlog_cil_insert_pcp_aggregate( + * structures that could have a nonzero space_used. + */ + for_each_cpu(cpu, &ctx->cil_pcpmask) { +- int old, prev; ++ struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); ++ int old = READ_ONCE(cilpcp->space_used); + +- cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); +- do { +- old = cilpcp->space_used; +- prev = cmpxchg(&cilpcp->space_used, old, 0); +- } while (old != prev); ++ while (!try_cmpxchg(&cilpcp->space_used, &old, 0)) ++ ; + count += old; + } + atomic_add(count, &ctx->space_used); diff --git a/queue-6.6/xfs-validate-inumber-in-xfs_iget.patch b/queue-6.6/xfs-validate-inumber-in-xfs_iget.patch new file mode 100644 index 0000000000..cdf1bfe8c4 --- /dev/null +++ b/queue-6.6/xfs-validate-inumber-in-xfs_iget.patch @@ -0,0 +1,35 @@ +From stable+bounces-113968-greg=kroah.com@vger.kernel.org Wed Feb 5 22:40:46 2025 +From: Catherine Hoang +Date: Wed, 5 Feb 2025 13:40:03 -0800 +Subject: xfs: validate inumber in xfs_iget +To: stable@vger.kernel.org +Cc: xfs-stable@lists.linux.dev +Message-ID: <20250205214025.72516-3-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 05aba1953f4a6e2b48e13c610e8a4545ba4ef509 upstream. + +Actually use the inumber validator to check the argument passed in here. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Reviewed-by: Dave Chinner +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -748,7 +748,7 @@ xfs_iget( + ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); + + /* reject inode numbers outside existing AGs */ +- if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) ++ if (!xfs_verify_ino(mp, ino)) + return -EINVAL; + + XFS_STATS_INC(mp, xs_ig_attempts);