From: Greg Kroah-Hartman Date: Fri, 18 Oct 2024 08:44:33 +0000 (+0200) Subject: 6.6-stable patches X-Git-Tag: v5.10.228~63 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=53eda5ab3dbf98c563d8ed9a7b43af2bce5a9bcc;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: xfs-allow-symlinks-with-short-remote-targets.patch xfs-allow-unlinked-symlinks-and-dirs-with-zero-size.patch xfs-check-opcode-and-iovec-count-match-in-xlog_recover_attri_commit_pass2.patch xfs-check-shortform-attr-entry-flags-specifically.patch xfs-convert-delayed-extents-to-unwritten-when-zeroing-post-eof-blocks.patch xfs-enforce-one-namespace-per-attribute.patch xfs-fix-error-returns-from-xfs_bmapi_write.patch xfs-fix-freeing-speculative-preallocations-for-preallocated-files.patch xfs-fix-missing-check-for-invalid-attr-flags.patch xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch xfs-fix-xfs_bmap_add_extent_delay_real-for-partial-conversions.patch xfs-make-sure-sb_fdblocks-is-non-negative.patch xfs-make-the-seq-argument-to-xfs_bmapi_convert_delalloc-optional.patch xfs-make-xfs_bmapi_convert_delalloc-to-allocate-the-target-offset.patch xfs-match-lock-mode-in-xfs_buffered_write_iomap_begin.patch xfs-remove-a-racy-if_bytes-check-in-xfs_reflink_end_cow_extent.patch xfs-require-xfs_sb_feat_incompat_log_xattrs-for-attr-log-intent-item-recovery.patch xfs-restrict-when-we-try-to-align-cow-fork-delalloc-to-cowextsz-hints.patch xfs-revert-commit-44af6c7e59b12.patch xfs-use-dontcache-for-grabbing-inodes-during-scrub.patch xfs-validate-recovered-name-buffers-when-recovering-xattr-items.patch --- diff --git a/queue-6.6/series b/queue-6.6/series index 2852ec3c626..171f49fe4dc 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -21,3 +21,24 @@ selftests-mm-fix-deadlock-for-fork-after-pthread_create-on-arm.patch mm-mremap-fix-move_normal_pmd-retract_page_tables-race.patch mm-mglru-only-clear-kswapd_failures-if-reclaimable.patch 
mm-swapfile-skip-hugetlb-pages-for-unuse_vma.patch +xfs-fix-error-returns-from-xfs_bmapi_write.patch +xfs-fix-xfs_bmap_add_extent_delay_real-for-partial-conversions.patch +xfs-remove-a-racy-if_bytes-check-in-xfs_reflink_end_cow_extent.patch +xfs-require-xfs_sb_feat_incompat_log_xattrs-for-attr-log-intent-item-recovery.patch +xfs-check-opcode-and-iovec-count-match-in-xlog_recover_attri_commit_pass2.patch +xfs-fix-missing-check-for-invalid-attr-flags.patch +xfs-check-shortform-attr-entry-flags-specifically.patch +xfs-validate-recovered-name-buffers-when-recovering-xattr-items.patch +xfs-enforce-one-namespace-per-attribute.patch +xfs-revert-commit-44af6c7e59b12.patch +xfs-use-dontcache-for-grabbing-inodes-during-scrub.patch +xfs-match-lock-mode-in-xfs_buffered_write_iomap_begin.patch +xfs-make-the-seq-argument-to-xfs_bmapi_convert_delalloc-optional.patch +xfs-make-xfs_bmapi_convert_delalloc-to-allocate-the-target-offset.patch +xfs-convert-delayed-extents-to-unwritten-when-zeroing-post-eof-blocks.patch +xfs-allow-symlinks-with-short-remote-targets.patch +xfs-make-sure-sb_fdblocks-is-non-negative.patch +xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch +xfs-fix-freeing-speculative-preallocations-for-preallocated-files.patch +xfs-allow-unlinked-symlinks-and-dirs-with-zero-size.patch +xfs-restrict-when-we-try-to-align-cow-fork-delalloc-to-cowextsz-hints.patch diff --git a/queue-6.6/xfs-allow-symlinks-with-short-remote-targets.patch b/queue-6.6/xfs-allow-symlinks-with-short-remote-targets.patch new file mode 100644 index 00000000000..665ffe115e2 --- /dev/null +++ b/queue-6.6/xfs-allow-symlinks-with-short-remote-targets.patch @@ -0,0 +1,114 @@ +From stable+bounces-86419-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:43 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:21 -0700 +Subject: xfs: allow symlinks with short remote targets +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-17-catherine.hoang@oracle.com> + 
+From: "Darrick J. Wong" + +commit 38de567906d95c397d87f292b892686b7ec6fbc3 upstream. + +An internal user complained about log recovery failing on a symlink +("Bad dinode after recovery") with the following (excerpted) format: + +core.magic = 0x494e +core.mode = 0120777 +core.version = 3 +core.format = 2 (extents) +core.nlinkv2 = 1 +core.nextents = 1 +core.size = 297 +core.nblocks = 1 +core.naextents = 0 +core.forkoff = 0 +core.aformat = 2 (extents) +u3.bmx[0] = [startoff,startblock,blockcount,extentflag] +0:[0,12,1,0] + +This is a symbolic link with a 297-byte target stored in a disk block, +which is to say this is a symlink with a remote target. The forkoff is +0, which is to say that there's 512 - 176 == 336 bytes in the inode core +to store the data fork. + +Eventually, testing of generic/388 failed with the same inode corruption +message during inode recovery. In writing a debugging patch to call +xfs_dinode_verify on dirty inode log items when we're committing +transactions, I observed that xfs/298 can reproduce the problem quite +quickly. + +xfs/298 creates a symbolic link, adds some extended attributes, then +deletes them all. The test failure occurs when the final removexattr +also deletes the attr fork because that does not convert the remote +symlink back into a shortform symlink. That is how we trip this test. +The only reason why xfs/298 only triggers with the debug patch added is +that it deletes the symlink, so the final iflush shows the inode as +free. + +I wrote a quick fstest to emulate the behavior of xfs/298, except that +it leaves the symlinks on the filesystem after inducing the "corrupt" +state. Kernels going back at least as far as 4.18 have written out +symlink inodes in this manner and prior to 1eb70f54c445f they did not +object to reading them back in. + +Because we've been writing out inodes this way for quite some time, the +only way to fix this is to relax the check for symbolic links. 
+Directories don't have this problem because di_size is bumped to +blocksize during the sf->data conversion. + +Fixes: 1eb70f54c445f ("xfs: validate inode fork size against fork format") +Signed-off-by: "Darrick J. Wong" +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_inode_buf.c | 28 ++++++++++++++++++++++++---- + 1 file changed, 24 insertions(+), 4 deletions(-) + +--- a/fs/xfs/libxfs/xfs_inode_buf.c ++++ b/fs/xfs/libxfs/xfs_inode_buf.c +@@ -366,17 +366,37 @@ xfs_dinode_verify_fork( + /* + * For fork types that can contain local data, check that the fork + * format matches the size of local data contained within the fork. +- * +- * For all types, check that when the size says the should be in extent +- * or btree format, the inode isn't claiming it is in local format. + */ + if (whichfork == XFS_DATA_FORK) { +- if (S_ISDIR(mode) || S_ISLNK(mode)) { ++ /* ++ * A directory small enough to fit in the inode must be stored ++ * in local format. The directory sf <-> extents conversion ++ * code updates the directory size accordingly. ++ */ ++ if (S_ISDIR(mode)) { + if (be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } + ++ /* ++ * A symlink with a target small enough to fit in the inode can ++ * be stored in extents format if xattrs were added (thus ++ * converting the data fork from shortform to remote format) ++ * and then removed. ++ */ ++ if (S_ISLNK(mode)) { ++ if (be64_to_cpu(dip->di_size) <= fork_size && ++ fork_format != XFS_DINODE_FMT_EXTENTS && ++ fork_format != XFS_DINODE_FMT_LOCAL) ++ return __this_address; ++ } ++ ++ /* ++ * For all types, check that when the size says the fork should ++ * be in extent or btree format, the inode isn't claiming to be ++ * in local format. 
++ */ + if (be64_to_cpu(dip->di_size) > fork_size && + fork_format == XFS_DINODE_FMT_LOCAL) + return __this_address; diff --git a/queue-6.6/xfs-allow-unlinked-symlinks-and-dirs-with-zero-size.patch b/queue-6.6/xfs-allow-unlinked-symlinks-and-dirs-with-zero-size.patch new file mode 100644 index 00000000000..6db3dcfd614 --- /dev/null +++ b/queue-6.6/xfs-allow-unlinked-symlinks-and-dirs-with-zero-size.patch @@ -0,0 +1,75 @@ +From stable+bounces-86420-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:44 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:26 -0700 +Subject: xfs: allow unlinked symlinks and dirs with zero size +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-22-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 1ec9307fc066dd8a140d5430f8a7576aa9d78cd3 upstream. + +For a very very long time, inode inactivation has set the inode size to +zero before unmapping the extents associated with the data fork. +Unfortunately, commit 3c6f46eacd876 changed the inode verifier to +prohibit zero-length symlinks and directories. If an inode happens to +get logged in this state and the system crashes before freeing the +inode, log recovery will also fail on the broken inode. + +Therefore, allow zero-size symlinks and directories as long as the link +count is zero; nobody will be able to open these files by handle so +there isn't any risk of data exposure. + +Fixes: 3c6f46eacd876 ("xfs: sanity check directory inode di_size") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_inode_buf.c | 23 ++++++++++++++++++----- + 1 file changed, 18 insertions(+), 5 deletions(-) + +--- a/fs/xfs/libxfs/xfs_inode_buf.c ++++ b/fs/xfs/libxfs/xfs_inode_buf.c +@@ -371,10 +371,13 @@ xfs_dinode_verify_fork( + /* + * A directory small enough to fit in the inode must be stored + * in local format. The directory sf <-> extents conversion +- * code updates the directory size accordingly. ++ * code updates the directory size accordingly. Directories ++ * being truncated have zero size and are not subject to this ++ * check. + */ + if (S_ISDIR(mode)) { +- if (be64_to_cpu(dip->di_size) <= fork_size && ++ if (dip->di_size && ++ be64_to_cpu(dip->di_size) <= fork_size && + fork_format != XFS_DINODE_FMT_LOCAL) + return __this_address; + } +@@ -512,9 +515,19 @@ xfs_dinode_verify( + if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) + return __this_address; + +- /* No zero-length symlinks/dirs. */ +- if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) +- return __this_address; ++ /* ++ * No zero-length symlinks/dirs unless they're unlinked and hence being ++ * inactivated. 
++ */ ++ if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) { ++ if (dip->di_version > 1) { ++ if (dip->di_nlink) ++ return __this_address; ++ } else { ++ if (dip->di_onlink) ++ return __this_address; ++ } ++ } + + fa = xfs_dinode_verify_nrext64(mp, dip); + if (fa) diff --git a/queue-6.6/xfs-check-opcode-and-iovec-count-match-in-xlog_recover_attri_commit_pass2.patch b/queue-6.6/xfs-check-opcode-and-iovec-count-match-in-xlog_recover_attri_commit_pass2.patch new file mode 100644 index 00000000000..a8688e5dd4e --- /dev/null +++ b/queue-6.6/xfs-check-opcode-and-iovec-count-match-in-xlog_recover_attri_commit_pass2.patch @@ -0,0 +1,67 @@ +From stable+bounces-86405-greg=kroah.com@vger.kernel.org Wed Oct 16 02:11:55 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:10 -0700 +Subject: xfs: check opcode and iovec count match in xlog_recover_attri_commit_pass2 +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-6-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit ad206ae50eca62836c5460ab5bbf2a6c59a268e7 upstream. + +Check that the number of recovered log iovecs is what is expected for +the xattri opcode is expecting. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_attr_item.c | 27 +++++++++++++++++++++++++++ + 1 file changed, 27 insertions(+) + +--- a/fs/xfs/xfs_attr_item.c ++++ b/fs/xfs/xfs_attr_item.c +@@ -719,6 +719,7 @@ xlog_recover_attri_commit_pass2( + const void *attr_value = NULL; + const void *attr_name; + size_t len; ++ unsigned int op; + + attri_formatp = item->ri_buf[0].i_addr; + attr_name = item->ri_buf[1].i_addr; +@@ -737,6 +738,32 @@ xlog_recover_attri_commit_pass2( + return -EFSCORRUPTED; + } + ++ /* Check the number of log iovecs makes sense for the op code. 
*/ ++ op = attri_formatp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK; ++ switch (op) { ++ case XFS_ATTRI_OP_FLAGS_SET: ++ case XFS_ATTRI_OP_FLAGS_REPLACE: ++ /* Log item, attr name, attr value */ ++ if (item->ri_total != 3) { ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, ++ attri_formatp, len); ++ return -EFSCORRUPTED; ++ } ++ break; ++ case XFS_ATTRI_OP_FLAGS_REMOVE: ++ /* Log item, attr name */ ++ if (item->ri_total != 2) { ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, ++ attri_formatp, len); ++ return -EFSCORRUPTED; ++ } ++ break; ++ default: ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, ++ attri_formatp, len); ++ return -EFSCORRUPTED; ++ } ++ + /* Validate the attr name */ + if (item->ri_buf[1].i_len != + xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { diff --git a/queue-6.6/xfs-check-shortform-attr-entry-flags-specifically.patch b/queue-6.6/xfs-check-shortform-attr-entry-flags-specifically.patch new file mode 100644 index 00000000000..894f393fb8f --- /dev/null +++ b/queue-6.6/xfs-check-shortform-attr-entry-flags-specifically.patch @@ -0,0 +1,44 @@ +From stable+bounces-86409-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:06 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:12 -0700 +Subject: xfs: check shortform attr entry flags specifically +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-8-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 309dc9cbbb4379241bcc9b5a6a42c04279a0e5a7 upstream. + +While reviewing flag checking in the attr scrub functions, we noticed +that the shortform attr scanner didn't catch entries that have the LOCAL +or INCOMPLETE bits set. Neither of these flags can ever be set on a +shortform attr, so we need to check this narrower set of valid flags. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/attr.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/xfs/scrub/attr.c ++++ b/fs/xfs/scrub/attr.c +@@ -566,6 +566,15 @@ xchk_xattr_check_sf( + break; + } + ++ /* ++ * Shortform entries do not set LOCAL or INCOMPLETE, so the ++ * only valid flag bits here are for namespaces. ++ */ ++ if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) { ++ xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0); ++ break; ++ } ++ + if (!xchk_xattr_set_map(sc, ab->usedmap, + (char *)sfe - (char *)sf, + sizeof(struct xfs_attr_sf_entry))) { diff --git a/queue-6.6/xfs-convert-delayed-extents-to-unwritten-when-zeroing-post-eof-blocks.patch b/queue-6.6/xfs-convert-delayed-extents-to-unwritten-when-zeroing-post-eof-blocks.patch new file mode 100644 index 00000000000..f51e1a51969 --- /dev/null +++ b/queue-6.6/xfs-convert-delayed-extents-to-unwritten-when-zeroing-post-eof-blocks.patch @@ -0,0 +1,105 @@ +From stable+bounces-86416-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:26 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:20 -0700 +Subject: xfs: convert delayed extents to unwritten when zeroing post eof blocks +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-16-catherine.hoang@oracle.com> + +From: Zhang Yi + +commit 5ce5674187c345dc31534d2024c09ad8ef29b7ba upstream. + +Current clone operation could be non-atomic if the destination of a file +is beyond EOF, user could get a file with corrupted (zeroed) data on +crash. + +The problem is about preallocations. If you write some data into a file: + + [A...B) + +and XFS decides to preallocate some post-eof blocks, then it can create +a delayed allocation reservation: + + [A.........D) + +The writeback path tries to convert delayed extents to real ones by +allocating blocks. 
If there aren't enough contiguous free space, we can +end up with two extents, the first real and the second still delalloc: + + [A....C)[C.D) + +After that, both the in-memory and the on-disk file sizes are still B. +If we clone into the range [E...F) from another file: + + [A....C)[C.D) [E...F) + +then xfs_reflink_zero_posteof() calls iomap_zero_range() to zero out the +range [B, E) beyond EOF and flush it. Since [C, D) is still a delalloc +extent, its pagecache will be zeroed and both the in-memory and on-disk +size will be updated to D after flushing but before cloning. This is +wrong, because the user can see the size change and read the zeroes +while the clone operation is ongoing. + +We need to keep the in-memory and on-disk size before the clone +operation starts, so instead of writing zeroes through the page cache +for delayed ranges beyond EOF, we convert these ranges to unwritten and +invalidate any cached data over that range beyond EOF. + +Suggested-by: Dave Chinner +Signed-off-by: Zhang Yi +Reviewed-by: "Darrick J. Wong" +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 29 +++++++++++++++++++++++++++++ + 1 file changed, 29 insertions(+) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1006,6 +1006,24 @@ xfs_buffered_write_iomap_begin( + } + + /* ++ * For zeroing, trim a delalloc extent that extends beyond the EOF ++ * block. If it starts beyond the EOF block, convert it to an ++ * unwritten extent. 
++ */ ++ if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb && ++ isnullstartblock(imap.br_startblock)) { ++ xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)); ++ ++ if (offset_fsb >= eof_fsb) ++ goto convert_delay; ++ if (end_fsb > eof_fsb) { ++ end_fsb = eof_fsb; ++ xfs_trim_extent(&imap, offset_fsb, ++ end_fsb - offset_fsb); ++ } ++ } ++ ++ /* + * Search the COW fork extent list even if we did not find a data fork + * extent. This serves two purposes: first this implements the + * speculative preallocation using cowextsize, so that we also unshare +@@ -1150,6 +1168,17 @@ found_imap: + xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + ++convert_delay: ++ xfs_iunlock(ip, lockmode); ++ truncate_pagecache(inode, offset); ++ error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset, ++ iomap, NULL); ++ if (error) ++ return error; ++ ++ trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap); ++ return 0; ++ + found_cow: + seq = xfs_iomap_inode_sequence(ip, 0); + if (imap.br_startoff <= offset_fsb) { diff --git a/queue-6.6/xfs-enforce-one-namespace-per-attribute.patch b/queue-6.6/xfs-enforce-one-namespace-per-attribute.patch new file mode 100644 index 00000000000..3a4db1ddde8 --- /dev/null +++ b/queue-6.6/xfs-enforce-one-namespace-per-attribute.patch @@ -0,0 +1,196 @@ +From stable+bounces-86408-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:03 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:14 -0700 +Subject: xfs: enforce one namespace per attribute +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-10-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit ea0b3e814741fb64e7785b564ea619578058e0b0 upstream. + +[backport: fix conflicts due to various xattr refactoring] + +Create a standardized helper function to enforce one namespace bit per +extended attribute, and refactor all the open-coded hweight logic. 
This +function is not a static inline to avoid porting hassles in userspace. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr.c | 11 +++++++++++ + fs/xfs/libxfs/xfs_attr.h | 4 +++- + fs/xfs/libxfs/xfs_attr_leaf.c | 6 +++++- + fs/xfs/scrub/attr.c | 12 +++++------- + fs/xfs/xfs_attr_item.c | 10 ++++++++-- + fs/xfs/xfs_attr_list.c | 11 +++++++---- + 6 files changed, 39 insertions(+), 15 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr.c ++++ b/fs/xfs/libxfs/xfs_attr.c +@@ -1565,12 +1565,23 @@ out_release: + return error; + } + ++/* Enforce that there is at most one namespace bit per attr. */ ++inline bool xfs_attr_check_namespace(unsigned int attr_flags) ++{ ++ return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2; ++} ++ + /* Returns true if the attribute entry name is valid. */ + bool + xfs_attr_namecheck( ++ unsigned int attr_flags, + const void *name, + size_t length) + { ++ /* Only one namespace bit allowed. */ ++ if (!xfs_attr_check_namespace(attr_flags)) ++ return false; ++ + /* + * MAXNAMELEN includes the trailing null, but (name/length) leave it + * out, so use >= for the length check. 
+--- a/fs/xfs/libxfs/xfs_attr.h ++++ b/fs/xfs/libxfs/xfs_attr.h +@@ -547,7 +547,9 @@ int xfs_attr_get(struct xfs_da_args *arg + int xfs_attr_set(struct xfs_da_args *args); + int xfs_attr_set_iter(struct xfs_attr_intent *attr); + int xfs_attr_remove_iter(struct xfs_attr_intent *attr); +-bool xfs_attr_namecheck(const void *name, size_t length); ++bool xfs_attr_check_namespace(unsigned int attr_flags); ++bool xfs_attr_namecheck(unsigned int attr_flags, const void *name, ++ size_t length); + int xfs_attr_calc_size(struct xfs_da_args *args, int *local); + void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres, + unsigned int *total); +--- a/fs/xfs/libxfs/xfs_attr_leaf.c ++++ b/fs/xfs/libxfs/xfs_attr_leaf.c +@@ -984,6 +984,10 @@ xfs_attr_shortform_to_leaf( + nargs.hashval = xfs_da_hashname(sfe->nameval, + sfe->namelen); + nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK; ++ if (!xfs_attr_check_namespace(sfe->flags)) { ++ error = -EFSCORRUPTED; ++ goto out; ++ } + error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */ + ASSERT(error == -ENOATTR); + error = xfs_attr3_leaf_add(bp, &nargs); +@@ -1105,7 +1109,7 @@ xfs_attr_shortform_verify( + * one namespace flag per xattr, so we can just count the + * bits (i.e. hweight) here. + */ +- if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) ++ if (!xfs_attr_check_namespace(sfep->flags)) + return __this_address; + + sfep = next_sfep; +--- a/fs/xfs/scrub/attr.c ++++ b/fs/xfs/scrub/attr.c +@@ -193,14 +193,8 @@ xchk_xattr_listent( + return; + } + +- /* Only one namespace bit allowed. */ +- if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) { +- xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); +- goto fail_xref; +- } +- + /* Does this name make sense? 
*/ +- if (!xfs_attr_namecheck(name, namelen)) { ++ if (!xfs_attr_namecheck(flags, name, namelen)) { + xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); + goto fail_xref; + } +@@ -501,6 +495,10 @@ xchk_xattr_rec( + xchk_da_set_corrupt(ds, level); + return 0; + } ++ if (!xfs_attr_check_namespace(ent->flags)) { ++ xchk_da_set_corrupt(ds, level); ++ return 0; ++ } + + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = (struct xfs_attr_leaf_name_local *) +--- a/fs/xfs/xfs_attr_item.c ++++ b/fs/xfs/xfs_attr_item.c +@@ -522,6 +522,10 @@ xfs_attri_validate( + if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK) + return false; + ++ if (!xfs_attr_check_namespace(attrp->alfi_attr_filter & ++ XFS_ATTR_NSP_ONDISK_MASK)) ++ return false; ++ + /* alfi_op_flags should be either a set or remove */ + switch (op) { + case XFS_ATTRI_OP_FLAGS_SET: +@@ -572,7 +576,8 @@ xfs_attri_item_recover( + */ + attrp = &attrip->attri_format; + if (!xfs_attri_validate(mp, attrp) || +- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len)) ++ !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr, ++ nv->name.i_len)) + return -EFSCORRUPTED; + + error = xlog_recover_iget(mp, attrp->alfi_ino, &ip); +@@ -772,7 +777,8 @@ xlog_recover_attri_commit_pass2( + } + + attr_name = item->ri_buf[i].i_addr; +- if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { ++ if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, attr_name, ++ attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + attri_formatp, len); + return -EFSCORRUPTED; +--- a/fs/xfs/xfs_attr_list.c ++++ b/fs/xfs/xfs_attr_list.c +@@ -82,7 +82,8 @@ xfs_attr_shortform_list( + (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) { + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (XFS_IS_CORRUPT(context->dp->i_mount, +- !xfs_attr_namecheck(sfe->nameval, ++ !xfs_attr_namecheck(sfe->flags, ++ sfe->nameval, + sfe->namelen))) + return -EFSCORRUPTED; + 
context->put_listent(context, +@@ -120,7 +121,8 @@ xfs_attr_shortform_list( + for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { + if (unlikely( + ((char *)sfe < (char *)sf) || +- ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) { ++ ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) || ++ !xfs_attr_check_namespace(sfe->flags))) { + XFS_CORRUPTION_ERROR("xfs_attr_shortform_list", + XFS_ERRLEVEL_LOW, + context->dp->i_mount, sfe, +@@ -174,7 +176,7 @@ xfs_attr_shortform_list( + cursor->offset = 0; + } + if (XFS_IS_CORRUPT(context->dp->i_mount, +- !xfs_attr_namecheck(sbp->name, ++ !xfs_attr_namecheck(sbp->flags, sbp->name, + sbp->namelen))) { + error = -EFSCORRUPTED; + goto out; +@@ -465,7 +467,8 @@ xfs_attr3_leaf_list_int( + } + + if (XFS_IS_CORRUPT(context->dp->i_mount, +- !xfs_attr_namecheck(name, namelen))) ++ !xfs_attr_namecheck(entry->flags, name, ++ namelen))) + return -EFSCORRUPTED; + context->put_listent(context, entry->flags, + name, namelen, valuelen); diff --git a/queue-6.6/xfs-fix-error-returns-from-xfs_bmapi_write.patch b/queue-6.6/xfs-fix-error-returns-from-xfs_bmapi_write.patch new file mode 100644 index 00000000000..372c75324a3 --- /dev/null +++ b/queue-6.6/xfs-fix-error-returns-from-xfs_bmapi_write.patch @@ -0,0 +1,328 @@ +From stable+bounces-86410-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:08 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:06 -0700 +Subject: xfs: fix error returns from xfs_bmapi_write +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-2-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 6773da870ab89123d1b513da63ed59e32a29cb77 upstream. 
+ +[backport: resolve conflicts due to missing quota_repair.c, +rtbitmap_repair.c, xfs_bmap_mark_sick()] + +xfs_bmapi_write can return 0 without actually returning a mapping in +mval in two different cases: + + 1) when there is absolutely no space available to do an allocation + 2) when converting delalloc space, and the allocation is so small + that it only covers parts of the delalloc extent before the + range requested by the caller + +Callers at best can handle one of these cases, but in many cases can't +cope with either one. Switch xfs_bmapi_write to always return a +mapping or return an error code instead. For case 1) above ENOSPC is +the obvious choice which is very much what the callers expect anyway. +For case 2) there is no really good error code, so pick a funky one +from the SysV streams portfolio. + +This fixes the reproducer here: + + https://lore.kernel.org/linux-xfs/CAEJPjCvT3Uag-pMTYuigEjWZHn1sGMZ0GCjVVCv29tNHK76Cgg@mail.gmail.com0/ + +which uses reserved blocks to create file systems that are gravely +out of space and thus cause at least xfs_file_alloc_space to hang +and trigger the lack of ENOSPC handling in xfs_dquot_disk_alloc. + +Note that this patch does not actually make any caller but +xfs_alloc_file_space deal intelligently with case 2) above. + +Signed-off-by: Christoph Hellwig +Reported-by: 刘通 +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_attr_remote.c | 1 + fs/xfs/libxfs/xfs_bmap.c | 46 ++++++++++++++++++++++++++++++++-------- + fs/xfs/libxfs/xfs_da_btree.c | 20 ++++------------- + fs/xfs/xfs_bmap_util.c | 31 +++++++++++++------------- + fs/xfs/xfs_dquot.c | 1 + fs/xfs/xfs_iomap.c | 8 ------ + fs/xfs/xfs_reflink.c | 14 ------------ + fs/xfs/xfs_rtalloc.c | 2 - + 8 files changed, 57 insertions(+), 66 deletions(-) + +--- a/fs/xfs/libxfs/xfs_attr_remote.c ++++ b/fs/xfs/libxfs/xfs_attr_remote.c +@@ -619,7 +619,6 @@ xfs_attr_rmtval_set_blk( + if (error) + return error; + +- ASSERT(nmap == 1); + ASSERT((map->br_startblock != DELAYSTARTBLOCK) && + (map->br_startblock != HOLESTARTBLOCK)); + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4128,8 +4128,10 @@ xfs_bmapi_allocate( + } else { + error = xfs_bmap_alloc_userdata(bma); + } +- if (error || bma->blkno == NULLFSBLOCK) ++ if (error) + return error; ++ if (bma->blkno == NULLFSBLOCK) ++ return -ENOSPC; + + if (bma->flags & XFS_BMAPI_ZERO) { + error = xfs_zero_extent(bma->ip, bma->blkno, bma->length); +@@ -4309,6 +4311,15 @@ xfs_bmapi_finish( + * extent state if necessary. Details behaviour is controlled by the flags + * parameter. Only allocates blocks from a single allocation group, to avoid + * locking problems. ++ * ++ * Returns 0 on success and places the extent mappings in mval. nmaps is used ++ * as an input/output parameter where the caller specifies the maximum number ++ * of mappings that may be returned and xfs_bmapi_write passes back the number ++ * of mappings (including existing mappings) it found. ++ * ++ * Returns a negative error code on failure, including -ENOSPC when it could not ++ * allocate any blocks and -ENOSR when it did allocate blocks to convert a ++ * delalloc range, but those blocks were before the passed in range. 
+ */ + int + xfs_bmapi_write( +@@ -4436,10 +4447,16 @@ xfs_bmapi_write( + ASSERT(len > 0); + ASSERT(bma.length > 0); + error = xfs_bmapi_allocate(&bma); +- if (error) ++ if (error) { ++ /* ++ * If we already allocated space in a previous ++ * iteration return what we go so far when ++ * running out of space. ++ */ ++ if (error == -ENOSPC && bma.nallocs) ++ break; + goto error0; +- if (bma.blkno == NULLFSBLOCK) +- break; ++ } + + /* + * If this is a CoW allocation, record the data in +@@ -4477,7 +4494,6 @@ xfs_bmapi_write( + if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got)) + eof = true; + } +- *nmap = n; + + error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags, + whichfork); +@@ -4488,7 +4504,22 @@ xfs_bmapi_write( + ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork)); + xfs_bmapi_finish(&bma, whichfork, 0); + xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval, +- orig_nmap, *nmap); ++ orig_nmap, n); ++ ++ /* ++ * When converting delayed allocations, xfs_bmapi_allocate ignores ++ * the passed in bno and always converts from the start of the found ++ * delalloc extent. ++ * ++ * To avoid a successful return with *nmap set to 0, return the magic ++ * -ENOSR error code for this particular case so that the caller can ++ * handle it. 
++ */ ++ if (!n) { ++ ASSERT(bma.nallocs >= *nmap); ++ return -ENOSR; ++ } ++ *nmap = n; + return 0; + error0: + xfs_bmapi_finish(&bma, whichfork, error); +@@ -4595,9 +4626,6 @@ xfs_bmapi_convert_delalloc( + if (error) + goto out_finish; + +- error = -ENOSPC; +- if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK)) +- goto out_finish; + error = -EFSCORRUPTED; + if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) + goto out_finish; +--- a/fs/xfs/libxfs/xfs_da_btree.c ++++ b/fs/xfs/libxfs/xfs_da_btree.c +@@ -2158,8 +2158,8 @@ xfs_da_grow_inode_int( + struct xfs_inode *dp = args->dp; + int w = args->whichfork; + xfs_rfsblock_t nblks = dp->i_nblocks; +- struct xfs_bmbt_irec map, *mapp; +- int nmap, error, got, i, mapi; ++ struct xfs_bmbt_irec map, *mapp = &map; ++ int nmap, error, got, i, mapi = 1; + + /* + * Find a spot in the file space to put the new block. +@@ -2175,14 +2175,7 @@ xfs_da_grow_inode_int( + error = xfs_bmapi_write(tp, dp, *bno, count, + xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG, + args->total, &map, &nmap); +- if (error) +- return error; +- +- ASSERT(nmap <= 1); +- if (nmap == 1) { +- mapp = &map; +- mapi = 1; +- } else if (nmap == 0 && count > 1) { ++ if (error == -ENOSPC && count > 1) { + xfs_fileoff_t b; + int c; + +@@ -2199,16 +2192,13 @@ xfs_da_grow_inode_int( + args->total, &mapp[mapi], &nmap); + if (error) + goto out_free_map; +- if (nmap < 1) +- break; + mapi += nmap; + b = mapp[mapi - 1].br_startoff + + mapp[mapi - 1].br_blockcount; + } +- } else { +- mapi = 0; +- mapp = NULL; + } ++ if (error) ++ goto out_free_map; + + /* + * Count the blocks we got, make sure it matches the total. 
+--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -868,33 +868,32 @@ xfs_alloc_file_space( + if (error) + goto error; + +- error = xfs_bmapi_write(tp, ip, startoffset_fsb, +- allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, +- &nimaps); +- if (error) +- goto error; +- +- ip->i_diflags |= XFS_DIFLAG_PREALLOC; +- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); +- +- error = xfs_trans_commit(tp); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); +- if (error) +- break; +- + /* + * If the allocator cannot find a single free extent large + * enough to cover the start block of the requested range, +- * xfs_bmapi_write will return 0 but leave *nimaps set to 0. ++ * xfs_bmapi_write will return -ENOSR. + * + * In that case we simply need to keep looping with the same + * startoffset_fsb so that one of the following allocations + * will eventually reach the requested range. + */ +- if (nimaps) { ++ error = xfs_bmapi_write(tp, ip, startoffset_fsb, ++ allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp, ++ &nimaps); ++ if (error) { ++ if (error != -ENOSR) ++ goto error; ++ error = 0; ++ } else { + startoffset_fsb += imapp->br_blockcount; + allocatesize_fsb -= imapp->br_blockcount; + } ++ ++ ip->i_diflags |= XFS_DIFLAG_PREALLOC; ++ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); ++ ++ error = xfs_trans_commit(tp); ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + return error; +--- a/fs/xfs/xfs_dquot.c ++++ b/fs/xfs/xfs_dquot.c +@@ -333,7 +333,6 @@ xfs_dquot_disk_alloc( + goto err_cancel; + + ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); +- ASSERT(nmaps == 1); + ASSERT((map.br_startblock != DELAYSTARTBLOCK) && + (map.br_startblock != HOLESTARTBLOCK)); + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -317,14 +317,6 @@ xfs_iomap_write_direct( + if (error) + goto out_unlock; + +- /* +- * Copy any maps to caller's array and return any error. 
+- */ +- if (nimaps == 0) { +- error = -ENOSPC; +- goto out_unlock; +- } +- + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) + error = xfs_alert_fsblock_zero(ip, imap); + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -429,13 +429,6 @@ xfs_reflink_fill_cow_hole( + if (error) + return error; + +- /* +- * Allocation succeeded but the requested range was not even partially +- * satisfied? Bail out! +- */ +- if (nimaps == 0) +- return -ENOSPC; +- + convert: + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); + +@@ -498,13 +491,6 @@ xfs_reflink_fill_delalloc( + error = xfs_trans_commit(tp); + if (error) + return error; +- +- /* +- * Allocation succeeded but the requested range was not even +- * partially satisfied? Bail out! +- */ +- if (nimaps == 0) +- return -ENOSPC; + } while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff); + + return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now); +--- a/fs/xfs/xfs_rtalloc.c ++++ b/fs/xfs/xfs_rtalloc.c +@@ -840,8 +840,6 @@ xfs_growfs_rt_alloc( + nmap = 1; + error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, + XFS_BMAPI_METADATA, 0, &map, &nmap); +- if (!error && nmap < 1) +- error = -ENOSPC; + if (error) + goto out_trans_cancel; + /* diff --git a/queue-6.6/xfs-fix-freeing-speculative-preallocations-for-preallocated-files.patch b/queue-6.6/xfs-fix-freeing-speculative-preallocations-for-preallocated-files.patch new file mode 100644 index 00000000000..f307c77dcdd --- /dev/null +++ b/queue-6.6/xfs-fix-freeing-speculative-preallocations-for-preallocated-files.patch @@ -0,0 +1,169 @@ +From stable+bounces-86417-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:38 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:24 -0700 +Subject: xfs: fix freeing speculative preallocations for preallocated files +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-20-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit 
610b29161b0aa9feb59b78dc867553274f17fb01 upstream. + +xfs_can_free_eofblocks returns false for files that have persistent +preallocations unless the force flag is passed and there are delayed +blocks. This means it won't free delalloc reservations for files +with persistent preallocations unless the force flag is set, and it +will also free the persistent preallocations if the force flag is +set and the file happens to have delayed allocations. + +Both of these are bad, so do away with the force flag and always free +only post-EOF delayed allocations for files with the XFS_DIFLAG_PREALLOC +or APPEND flags set. + +Signed-off-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_bmap_util.c | 30 ++++++++++++++++++++++-------- + fs/xfs/xfs_bmap_util.h | 2 +- + fs/xfs/xfs_icache.c | 2 +- + fs/xfs/xfs_inode.c | 14 ++++---------- + 4 files changed, 28 insertions(+), 20 deletions(-) + +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -636,13 +636,11 @@ out_unlock: + + /* + * Test whether it is appropriate to check an inode for and free post EOF +- * blocks. The 'force' parameter determines whether we should also consider +- * regular files that are marked preallocated or append-only. ++ * blocks. + */ + bool + xfs_can_free_eofblocks( +- struct xfs_inode *ip, +- bool force) ++ struct xfs_inode *ip) + { + struct xfs_bmbt_irec imap; + struct xfs_mount *mp = ip->i_mount; +@@ -676,11 +674,11 @@ xfs_can_free_eofblocks( + return false; + + /* +- * Do not free real preallocated or append-only files unless the file +- * has delalloc blocks and we are forced to remove them. ++ * Only free real extents for inodes with persistent preallocations or ++ * the append-only flag. 
+ */ + if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) +- if (!force || ip->i_delayed_blks == 0) ++ if (ip->i_delayed_blks == 0) + return false; + + /* +@@ -734,6 +732,22 @@ xfs_free_eofblocks( + /* Wait on dio to ensure i_size has settled. */ + inode_dio_wait(VFS_I(ip)); + ++ /* ++ * For preallocated files only free delayed allocations. ++ * ++ * Note that this means we also leave speculative preallocations in ++ * place for preallocated files. ++ */ ++ if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) { ++ if (ip->i_delayed_blks) { ++ xfs_bmap_punch_delalloc_range(ip, ++ round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), ++ LLONG_MAX); ++ } ++ xfs_inode_clear_eofblocks_tag(ip); ++ return 0; ++ } ++ + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + if (error) { + ASSERT(xfs_is_shutdown(mp)); +@@ -1048,7 +1062,7 @@ xfs_prepare_shift( + * Trim eofblocks to avoid shifting uninitialized post-eof preallocation + * into the accessible region of the file. 
+ */ +- if (xfs_can_free_eofblocks(ip, true)) { ++ if (xfs_can_free_eofblocks(ip)) { + error = xfs_free_eofblocks(ip); + if (error) + return error; +--- a/fs/xfs/xfs_bmap_util.h ++++ b/fs/xfs/xfs_bmap_util.h +@@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_ino + xfs_off_t len); + + /* EOF block manipulation functions */ +-bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); ++bool xfs_can_free_eofblocks(struct xfs_inode *ip); + int xfs_free_eofblocks(struct xfs_inode *ip); + + int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -1149,7 +1149,7 @@ xfs_inode_free_eofblocks( + } + *lockflags |= XFS_IOLOCK_EXCL; + +- if (xfs_can_free_eofblocks(ip, false)) ++ if (xfs_can_free_eofblocks(ip)) + return xfs_free_eofblocks(ip); + + /* inode could be preallocated or append-only */ +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1469,7 +1469,7 @@ xfs_release( + if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) + return 0; + +- if (xfs_can_free_eofblocks(ip, false)) { ++ if (xfs_can_free_eofblocks(ip)) { + /* + * Check if the inode is being opened, written and closed + * frequently and we have delayed allocation blocks outstanding +@@ -1685,15 +1685,13 @@ xfs_inode_needs_inactive( + + /* + * This file isn't being freed, so check if there are post-eof blocks +- * to free. @force is true because we are evicting an inode from the +- * cache. Post-eof blocks must be freed, lest we end up with broken +- * free space accounting. ++ * to free. + * + * Note: don't bother with iolock here since lockdep complains about + * acquiring it in reclaim context. We have the only reference to the + * inode at this point anyways. + */ +- return xfs_can_free_eofblocks(ip, true); ++ return xfs_can_free_eofblocks(ip); + } + + /* +@@ -1741,15 +1739,11 @@ xfs_inactive( + + if (VFS_I(ip)->i_nlink != 0) { + /* +- * force is true because we are evicting an inode from the +- * cache. 
Post-eof blocks must be freed, lest we end up with +- * broken free space accounting. +- * + * Note: don't bother with iolock here since lockdep complains + * about acquiring it in reclaim context. We have the only + * reference to the inode at this point anyways. + */ +- if (xfs_can_free_eofblocks(ip, true)) ++ if (xfs_can_free_eofblocks(ip)) + error = xfs_free_eofblocks(ip); + + goto out; diff --git a/queue-6.6/xfs-fix-missing-check-for-invalid-attr-flags.patch b/queue-6.6/xfs-fix-missing-check-for-invalid-attr-flags.patch new file mode 100644 index 00000000000..9bd0e6c957c --- /dev/null +++ b/queue-6.6/xfs-fix-missing-check-for-invalid-attr-flags.patch @@ -0,0 +1,89 @@ +From stable+bounces-86406-greg=kroah.com@vger.kernel.org Wed Oct 16 02:11:58 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:11 -0700 +Subject: xfs: fix missing check for invalid attr flags +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-7-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit f660ec8eaeb50d0317c29601aacabdb15e5f2203 upstream. + +[backport: fix build errors in xchk_xattr_listent] + +The xattr scrubber doesn't check for undefined flags in shortform attr +entries. Therefore, define a mask XFS_ATTR_ONDISK_MASK that has all +possible XFS_ATTR_* flags in it, and use that to check for unknown bits +in xchk_xattr_actor. + +Refactor the check in the dabtree scanner function to use the new mask +as well. The redundant checks need to be in place because the dabtree +check examines the hash mappings and therefore needs to decode the attr +leaf entries to compute the namehash. This happens before the walk of +the xattr entries themselves. + +Fixes: ae0506eba78fd ("xfs: check used space of shortform xattr structures") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_da_format.h | 5 +++++ + fs/xfs/scrub/attr.c | 13 +++++++++---- + 2 files changed, 14 insertions(+), 4 deletions(-) + +--- a/fs/xfs/libxfs/xfs_da_format.h ++++ b/fs/xfs/libxfs/xfs_da_format.h +@@ -703,8 +703,13 @@ struct xfs_attr3_leafblock { + #define XFS_ATTR_ROOT (1u << XFS_ATTR_ROOT_BIT) + #define XFS_ATTR_SECURE (1u << XFS_ATTR_SECURE_BIT) + #define XFS_ATTR_INCOMPLETE (1u << XFS_ATTR_INCOMPLETE_BIT) ++ + #define XFS_ATTR_NSP_ONDISK_MASK (XFS_ATTR_ROOT | XFS_ATTR_SECURE) + ++#define XFS_ATTR_ONDISK_MASK (XFS_ATTR_NSP_ONDISK_MASK | \ ++ XFS_ATTR_LOCAL | \ ++ XFS_ATTR_INCOMPLETE) ++ + /* + * Alignment for namelist and valuelist entries (since they are mixed + * there can be only one alignment value) +--- a/fs/xfs/scrub/attr.c ++++ b/fs/xfs/scrub/attr.c +@@ -182,6 +182,11 @@ xchk_xattr_listent( + return; + } + ++ if (flags & ~XFS_ATTR_ONDISK_MASK) { ++ xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); ++ goto fail_xref; ++ } ++ + if (flags & XFS_ATTR_INCOMPLETE) { + /* Incomplete attr key, just mark the inode for preening. */ + xchk_ino_set_preen(sx->sc, context->dp->i_ino); +@@ -463,7 +468,6 @@ xchk_xattr_rec( + xfs_dahash_t hash; + int nameidx; + int hdrsize; +- unsigned int badflags; + int error; + + ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC); +@@ -493,10 +497,11 @@ xchk_xattr_rec( + + /* Retrieve the entry and check it. 
*/ + hash = be32_to_cpu(ent->hashval); +- badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE | +- XFS_ATTR_INCOMPLETE); +- if ((ent->flags & badflags) != 0) ++ if (ent->flags & ~XFS_ATTR_ONDISK_MASK) { + xchk_da_set_corrupt(ds, level); ++ return 0; ++ } ++ + if (ent->flags & XFS_ATTR_LOCAL) { + lentry = (struct xfs_attr_leaf_name_local *) + (((char *)bp->b_addr) + nameidx); diff --git a/queue-6.6/xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch b/queue-6.6/xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch new file mode 100644 index 00000000000..370e74be3e1 --- /dev/null +++ b/queue-6.6/xfs-fix-unlink-vs-cluster-buffer-instantiation-race.patch @@ -0,0 +1,180 @@ +From stable+bounces-86418-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:41 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:23 -0700 +Subject: xfs: fix unlink vs cluster buffer instantiation race +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-19-catherine.hoang@oracle.com> + +From: Dave Chinner + +commit 348a1983cf4cf5099fc398438a968443af4c9f65 upstream. + +Luis has been reporting an assert failure when freeing an inode +cluster during inode inactivation for a while. The assert looks +like: + + XFS: Assertion failed: bp->b_flags & XBF_DONE, file: fs/xfs/xfs_trans_buf.c, line: 241 + ------------[ cut here ]------------ + kernel BUG at fs/xfs/xfs_message.c:102! 
+ Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI + CPU: 4 PID: 73 Comm: kworker/4:1 Not tainted 6.10.0-rc1 #4 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 + Workqueue: xfs-inodegc/loop5 xfs_inodegc_worker [xfs] + RIP: 0010:assfail (fs/xfs/xfs_message.c:102) xfs + RSP: 0018:ffff88810188f7f0 EFLAGS: 00010202 + RAX: 0000000000000000 RBX: ffff88816e748250 RCX: 1ffffffff844b0e7 + RDX: 0000000000000004 RSI: ffff88810188f558 RDI: ffffffffc2431fa0 + RBP: 1ffff11020311f01 R08: 0000000042431f9f R09: ffffed1020311e9b + R10: ffff88810188f4df R11: ffffffffac725d70 R12: ffff88817a3f4000 + R13: ffff88812182f000 R14: ffff88810188f998 R15: ffffffffc2423f80 + FS: 0000000000000000(0000) GS:ffff8881c8400000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000055fe9d0f109c CR3: 000000014426c002 CR4: 0000000000770ef0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 + PKRU: 55555554 + Call Trace: + + xfs_trans_read_buf_map (fs/xfs/xfs_trans_buf.c:241 (discriminator 1)) xfs + xfs_imap_to_bp (fs/xfs/xfs_trans.h:210 fs/xfs/libxfs/xfs_inode_buf.c:138) xfs + xfs_inode_item_precommit (fs/xfs/xfs_inode_item.c:145) xfs + xfs_trans_run_precommits (fs/xfs/xfs_trans.c:931) xfs + __xfs_trans_commit (fs/xfs/xfs_trans.c:966) xfs + xfs_inactive_ifree (fs/xfs/xfs_inode.c:1811) xfs + xfs_inactive (fs/xfs/xfs_inode.c:2013) xfs + xfs_inodegc_worker (fs/xfs/xfs_icache.c:1841 fs/xfs/xfs_icache.c:1886) xfs + process_one_work (kernel/workqueue.c:3231) + worker_thread (kernel/workqueue.c:3306 (discriminator 2) kernel/workqueue.c:3393 (discriminator 2)) + kthread (kernel/kthread.c:389) + ret_from_fork (arch/x86/kernel/process.c:147) + ret_from_fork_asm (arch/x86/entry/entry_64.S:257) + + +It occurs when the inode precommit handler attempts to look +up the inode cluster buffer to attach the inode for writeback.
+ +The trail of logic that I can reconstruct is as follows. + + 1. the inode is clean when inodegc runs, so it is not + attached to a cluster buffer when precommit runs. + + 2. #1 implies the inode cluster buffer may be clean and not + pinned by dirty inodes when inodegc runs. + + 3. #2 implies that the inode cluster buffer can be reclaimed + by memory pressure at any time. + + 4. The assert failure implies that the cluster buffer was + attached to the transaction, but not marked done. It had + been accessed earlier in the transaction, but not marked + done. + + 5. #4 implies the cluster buffer has been invalidated (i.e. + marked stale). + + 6. #5 implies that the inode cluster buffer was instantiated + uninitialised in the transaction in xfs_ifree_cluster(), + which only instantiates the buffers to invalidate them + and never marks them as done. + +Given factors 1-3, this issue is highly dependent on timing and +environmental factors. Hence the issue can be very difficult to +reproduce in some situations, but highly reliable in others. Luis +has an environment where it can be reproduced easily by g/531 but, +OTOH, I've reproduced it only once in ~2000 cycles of g/531. + +I think the fix is to have xfs_ifree_cluster() set the XBF_DONE flag +on the cluster buffers, even though they may not be initialised. The +reasons why I think this is safe are: + + 1. A buffer cache lookup hit on a XBF_STALE buffer will + clear the XBF_DONE flag. Hence all future users of the + buffer know they have to re-initialise the contents + before use and mark it done themselves. + + 2. xfs_trans_binval() sets the XFS_BLI_STALE flag, which + means the buffer remains locked until the journal commit + completes and the buffer is unpinned. Hence once marked + XBF_STALE/XFS_BLI_STALE by xfs_ifree_cluster(), the only + context that can access the freed buffer is the currently + running transaction. + + 3. 
#2 implies that future buffer lookups in the currently + running transaction will hit the transaction match code + and not the buffer cache. Hence XBF_STALE and + XFS_BLI_STALE will not be cleared unless the transaction + initialises and logs the buffer with valid contents + again. At which point, the buffer will be marked + XBF_DONE again, so having XBF_DONE already set on the + stale buffer is a moot point. + + 4. #2 also implies that any concurrent access to that + cluster buffer will block waiting on the buffer lock + until the inode cluster has been fully freed and is no + longer an active inode cluster buffer. + + 5. #4 + #1 means that any future user of the disk range of + that buffer will always see the range of disk blocks + covered by the cluster buffer as not done, and hence must + initialise the contents themselves. + + 6. Setting XBF_DONE in xfs_ifree_cluster() then means the + unlinked inode precommit code will see a XBF_DONE buffer + from the transaction match as it expects. It can then + attach the stale but newly dirtied inode to the stale + but newly dirtied cluster buffer without unexpected + failures. The stale buffer will then sail through the + journal and do the right thing with the attached stale + inode during unpin. + +Hence the fix is just one line of extra code. The explanation of +why we have to set XBF_DONE in xfs_ifree_cluster, OTOH, is long and +complex.... + +Fixes: 82842fee6e59 ("xfs: fix AGF vs inode cluster buffer deadlock") +Signed-off-by: Dave Chinner +Tested-by: Luis Chamberlain +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 23 +++++++++++++++++++---- + 1 file changed, 19 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2329,11 +2329,26 @@ xfs_ifree_cluster( + * This buffer may not have been correctly initialised as we + * didn't read it from disk. That's not important because we are + * only using to mark the buffer as stale in the log, and to +- * attach stale cached inodes on it. That means it will never be +- * dispatched for IO. If it is, we want to know about it, and we +- * want it to fail. We can acheive this by adding a write +- * verifier to the buffer. ++ * attach stale cached inodes on it. ++ * ++ * For the inode that triggered the cluster freeing, this ++ * attachment may occur in xfs_inode_item_precommit() after we ++ * have marked this buffer stale. If this buffer was not in ++ * memory before xfs_ifree_cluster() started, it will not be ++ * marked XBF_DONE and this will cause problems later in ++ * xfs_inode_item_precommit() when we trip over a (stale, !done) ++ * buffer to attached to the transaction. ++ * ++ * Hence we have to mark the buffer as XFS_DONE here. This is ++ * safe because we are also marking the buffer as XBF_STALE and ++ * XFS_BLI_STALE. That means it will never be dispatched for ++ * IO and it won't be unlocked until the cluster freeing has ++ * been committed to the journal and the buffer unpinned. If it ++ * is written, we want to know about it, and we want it to ++ * fail. We can acheive this by adding a write verifier to the ++ * buffer. 
+ */ ++ bp->b_flags |= XBF_DONE; + bp->b_ops = &xfs_inode_buf_ops; + + /* diff --git a/queue-6.6/xfs-fix-xfs_bmap_add_extent_delay_real-for-partial-conversions.patch b/queue-6.6/xfs-fix-xfs_bmap_add_extent_delay_real-for-partial-conversions.patch new file mode 100644 index 00000000000..5a8ea7eb987 --- /dev/null +++ b/queue-6.6/xfs-fix-xfs_bmap_add_extent_delay_real-for-partial-conversions.patch @@ -0,0 +1,122 @@ +From stable+bounces-86402-greg=kroah.com@vger.kernel.org Wed Oct 16 02:11:49 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:07 -0700 +Subject: xfs: fix xfs_bmap_add_extent_delay_real for partial conversions +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-3-catherine.hoang@oracle.com> + +From: Christoph Hellwig + +commit d69bee6a35d3c5e4873b9e164dd1a9711351a97c upstream. + +[backport: resolve conflict due to xfs_mod_freecounter refactor] + +xfs_bmap_add_extent_delay_real takes parts or all of a delalloc extent +and converts them to a real extent. It is written to deal with any +potential overlap of the to be converted range with the delalloc extent, +but it turns out that currently only converting the entire extents, or a +part starting at the beginning is actually exercised, as the only caller +always tries to convert the entire delalloc extent, and either succeeds +or at least progresses partially from the start. + +If it only converts a tiny part of a delalloc extent, the indirect block +calculation for the new delalloc extent (da_new) might be equivalent to that +of the existing delalloc extent (da_old). If this extent conversion now +requires allocating an indirect block, that block gets accounted into da_new, +causing the assert that da_new must be smaller than or equal to da_old +unless we split the extent to trigger. 
+ +Except for the assert that case is actually handled by just trying to +allocate more space, as that already handled for the split case (which +currently can't be reached at all), so just reusing it should be fine. +Except that without dipping into the reserved block pool that would make +it a bit too easy to trigger a fs shutdown due to ENOSPC. So in addition +to adjusting the assert, also dip into the reserved block pool. + +Note that I could only reproduce the assert with a change to only convert +the actually asked range instead of the full delalloc extent from +xfs_bmapi_write. + +Signed-off-by: Christoph Hellwig +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 13 +++++++++---- + 1 file changed, 9 insertions(+), 4 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -1549,6 +1549,7 @@ xfs_bmap_add_extent_delay_real( + if (error) + goto done; + } ++ ASSERT(da_new <= da_old); + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG: +@@ -1578,6 +1579,7 @@ xfs_bmap_add_extent_delay_real( + if (error) + goto done; + } ++ ASSERT(da_new <= da_old); + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG: +@@ -1611,6 +1613,7 @@ xfs_bmap_add_extent_delay_real( + if (error) + goto done; + } ++ ASSERT(da_new <= da_old); + break; + + case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING: +@@ -1643,6 +1646,7 @@ xfs_bmap_add_extent_delay_real( + goto done; + } + } ++ ASSERT(da_new <= da_old); + break; + + case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG: +@@ -1680,6 +1684,7 @@ xfs_bmap_add_extent_delay_real( + if (error) + goto done; + } ++ ASSERT(da_new <= da_old); + break; + + case BMAP_LEFT_FILLING: +@@ -1767,6 +1772,7 @@ xfs_bmap_add_extent_delay_real( + xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV); + xfs_iext_next(ifp, &bma->icur); + 
xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT); ++ ASSERT(da_new <= da_old); + break; + + case BMAP_RIGHT_FILLING: +@@ -1814,6 +1820,7 @@ xfs_bmap_add_extent_delay_real( + PREV.br_blockcount = temp; + xfs_iext_insert(bma->ip, &bma->icur, &PREV, state); + xfs_iext_next(ifp, &bma->icur); ++ ASSERT(da_new <= da_old); + break; + + case 0: +@@ -1934,11 +1941,9 @@ xfs_bmap_add_extent_delay_real( + } + + /* adjust for changes in reserved delayed indirect blocks */ +- if (da_new != da_old) { +- ASSERT(state == 0 || da_new < da_old); ++ if (da_new != da_old) + error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), +- false); +- } ++ true); + + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); + done: diff --git a/queue-6.6/xfs-make-sure-sb_fdblocks-is-non-negative.patch b/queue-6.6/xfs-make-sure-sb_fdblocks-is-non-negative.patch new file mode 100644 index 00000000000..86f2becb9c4 --- /dev/null +++ b/queue-6.6/xfs-make-sure-sb_fdblocks-is-non-negative.patch @@ -0,0 +1,73 @@ +From stable+bounces-86422-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:51 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:22 -0700 +Subject: xfs: make sure sb_fdblocks is non-negative +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-18-catherine.hoang@oracle.com> + +From: Wengang Wang + +commit 58f880711f2ba53fd5e959875aff5b3bf6d5c32e upstream. + +A user with a completely full filesystem experienced an unexpected +shutdown when the filesystem tried to write the superblock during +runtime. +kernel shows the following dmesg: + +[ 8.176281] XFS (dm-4): Metadata corruption detected at xfs_sb_write_verify+0x60/0x120 [xfs], xfs_sb block 0x0 +[ 8.177417] XFS (dm-4): Unmount and run xfs_repair +[ 8.178016] XFS (dm-4): First 128 bytes of corrupted metadata buffer: +[ 8.178703] 00000000: 58 46 53 42 00 00 10 00 00 00 00 00 01 90 00 00 XFSB............ 
+[ 8.179487] 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +[ 8.180312] 00000020: cf 12 dc 89 ca 26 45 29 92 e6 e3 8d 3b b8 a2 c3 .....&E)....;... +[ 8.181150] 00000030: 00 00 00 00 01 00 00 06 00 00 00 00 00 00 00 80 ................ +[ 8.182003] 00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ +[ 8.182004] 00000050: 00 00 00 01 00 64 00 00 00 00 00 04 00 00 00 00 .....d.......... +[ 8.182004] 00000060: 00 00 64 00 b4 a5 02 00 02 00 00 08 00 00 00 00 ..d............. +[ 8.182005] 00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 17 00 00 19 ................ +[ 8.182008] XFS (dm-4): Corruption of in-memory data detected. Shutting down filesystem +[ 8.182010] XFS (dm-4): Please unmount the filesystem and rectify the problem(s) + +When xfs_log_sb writes super block to disk, sb_fdblocks is fetched from +m_fdblocks without any lock. As m_fdblocks can experience a positive -> +negative -> positive change when the FS reaches fullness (see +xfs_mod_fdblocks), there is a chance that sb_fdblocks is negative, and +because sb_fdblocks is of type unsigned long long, it reads super big. +And sb_fdblocks being bigger than sb_dblocks is a problem during log +recovery, xfs_validate_sb_write() complains. + +Fix: +As sb_fdblocks will be re-calculated during mount when lazysbcount is +enabled, we just need to make xfs_validate_sb_write() happy -- make sure +sb_fdblocks is not negative. This patch also takes care of other percpu +counters in xfs_log_sb. + +Signed-off-by: Wengang Wang +Reviewed-by: Darrick J. Wong +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_sb.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +--- a/fs/xfs/libxfs/xfs_sb.c ++++ b/fs/xfs/libxfs/xfs_sb.c +@@ -1031,11 +1031,12 @@ xfs_log_sb( + * and hence we don't need have to update it here. 
+ */ + if (xfs_has_lazysbcount(mp)) { +- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); ++ mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount); + mp->m_sb.sb_ifree = min_t(uint64_t, +- percpu_counter_sum(&mp->m_ifree), ++ percpu_counter_sum_positive(&mp->m_ifree), + mp->m_sb.sb_icount); +- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); ++ mp->m_sb.sb_fdblocks = ++ percpu_counter_sum_positive(&mp->m_fdblocks); + } + + xfs_sb_to_disk(bp->b_addr, &mp->m_sb); diff --git a/queue-6.6/xfs-make-the-seq-argument-to-xfs_bmapi_convert_delalloc-optional.patch b/queue-6.6/xfs-make-the-seq-argument-to-xfs_bmapi_convert_delalloc-optional.patch new file mode 100644 index 00000000000..2ad1fd73620 --- /dev/null +++ b/queue-6.6/xfs-make-the-seq-argument-to-xfs_bmapi_convert_delalloc-optional.patch @@ -0,0 +1,48 @@ +From stable+bounces-86413-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:18 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:18 -0700 +Subject: xfs: make the seq argument to xfs_bmapi_convert_delalloc() optional +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-14-catherine.hoang@oracle.com> + +From: Zhang Yi + +commit fc8d0ba0ff5fe4700fa02008b7751ec6b84b7677 upstream. + +Allow callers to pass a NULL seq argument if they don't care about +the fork sequence number. + +Signed-off-by: Zhang Yi +Reviewed-by: Christoph Hellwig +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4595,7 +4595,8 @@ xfs_bmapi_convert_delalloc( + if (!isnullstartblock(bma.got.br_startblock)) { + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); +- *seq = READ_ONCE(ifp->if_seq); ++ if (seq) ++ *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } + +@@ -4641,7 +4642,8 @@ xfs_bmapi_convert_delalloc( + ASSERT(!isnullstartblock(bma.got.br_startblock)); + xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, + xfs_iomap_inode_sequence(ip, flags)); +- *seq = READ_ONCE(ifp->if_seq); ++ if (seq) ++ *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) + xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length); diff --git a/queue-6.6/xfs-make-xfs_bmapi_convert_delalloc-to-allocate-the-target-offset.patch b/queue-6.6/xfs-make-xfs_bmapi_convert_delalloc-to-allocate-the-target-offset.patch new file mode 100644 index 00000000000..065b350b54c --- /dev/null +++ b/queue-6.6/xfs-make-xfs_bmapi_convert_delalloc-to-allocate-the-target-offset.patch @@ -0,0 +1,159 @@ +From stable+bounces-86415-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:23 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:19 -0700 +Subject: xfs: make xfs_bmapi_convert_delalloc() to allocate the target offset +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-15-catherine.hoang@oracle.com> + +From: Zhang Yi + +commit 2e08371a83f1c06fd85eea8cd37c87a224cc4cc4 upstream. + +xfs_bmapi_convert_delalloc() only attempts to allocate the entire +delalloc extent and may require multiple invocations to allocate the target +offset. xfs_convert_blocks() therefore adds a loop to do this job and we call it +in the write back path, but xfs_convert_blocks() isn't a common helper. 
+Let's do it in xfs_bmapi_convert_delalloc() and drop +xfs_convert_blocks(), preparing for the post EOF delalloc blocks +converting in the buffered write begin path. + +Signed-off-by: Zhang Yi +Reviewed-by: Christoph Hellwig +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 34 +++++++++++++++++++++++++++-- + fs/xfs/xfs_aops.c | 54 ++++++++++++----------------------------------- + 2 files changed, 46 insertions(+), 42 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4537,8 +4537,8 @@ error0: + * invocations to allocate the target offset if a large enough physical extent + * is not available. + */ +-int +-xfs_bmapi_convert_delalloc( ++static int ++xfs_bmapi_convert_one_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_off_t offset, +@@ -4666,6 +4666,36 @@ out_trans_cancel: + return error; + } + ++/* ++ * Pass in a dellalloc extent and convert it to real extents, return the real ++ * extent that maps offset_fsb in iomap. ++ */ ++int ++xfs_bmapi_convert_delalloc( ++ struct xfs_inode *ip, ++ int whichfork, ++ loff_t offset, ++ struct iomap *iomap, ++ unsigned int *seq) ++{ ++ int error; ++ ++ /* ++ * Attempt to allocate whatever delalloc extent currently backs offset ++ * and put the result into iomap. Allocate in a loop because it may ++ * take several attempts to allocate real blocks for a contiguous ++ * delalloc extent if free space is sufficiently fragmented. 
++ */ ++ do { ++ error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset, ++ iomap, seq); ++ if (error) ++ return error; ++ } while (iomap->offset + iomap->length <= offset); ++ ++ return 0; ++} ++ + int + xfs_bmapi_remap( + struct xfs_trans *tp, +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -233,45 +233,6 @@ xfs_imap_valid( + return true; + } + +-/* +- * Pass in a dellalloc extent and convert it to real extents, return the real +- * extent that maps offset_fsb in wpc->iomap. +- * +- * The current page is held locked so nothing could have removed the block +- * backing offset_fsb, although it could have moved from the COW to the data +- * fork by another thread. +- */ +-static int +-xfs_convert_blocks( +- struct iomap_writepage_ctx *wpc, +- struct xfs_inode *ip, +- int whichfork, +- loff_t offset) +-{ +- int error; +- unsigned *seq; +- +- if (whichfork == XFS_COW_FORK) +- seq = &XFS_WPC(wpc)->cow_seq; +- else +- seq = &XFS_WPC(wpc)->data_seq; +- +- /* +- * Attempt to allocate whatever delalloc extent currently backs offset +- * and put the result into wpc->iomap. Allocate in a loop because it +- * may take several attempts to allocate real blocks for a contiguous +- * delalloc extent if free space is sufficiently fragmented. +- */ +- do { +- error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, +- &wpc->iomap, seq); +- if (error) +- return error; +- } while (wpc->iomap.offset + wpc->iomap.length <= offset); +- +- return 0; +-} +- + static int + xfs_map_blocks( + struct iomap_writepage_ctx *wpc, +@@ -289,6 +250,7 @@ xfs_map_blocks( + struct xfs_iext_cursor icur; + int retries = 0; + int error = 0; ++ unsigned int *seq; + + if (xfs_is_shutdown(mp)) + return -EIO; +@@ -386,7 +348,19 @@ retry: + trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); + return 0; + allocate_blocks: +- error = xfs_convert_blocks(wpc, ip, whichfork, offset); ++ /* ++ * Convert a dellalloc extent to a real one. 
The current page is held ++ * locked so nothing could have removed the block backing offset_fsb, ++ * although it could have moved from the COW to the data fork by another ++ * thread. ++ */ ++ if (whichfork == XFS_COW_FORK) ++ seq = &XFS_WPC(wpc)->cow_seq; ++ else ++ seq = &XFS_WPC(wpc)->data_seq; ++ ++ error = xfs_bmapi_convert_delalloc(ip, whichfork, offset, ++ &wpc->iomap, seq); + if (error) { + /* + * If we failed to find the extent in the COW fork we might have diff --git a/queue-6.6/xfs-match-lock-mode-in-xfs_buffered_write_iomap_begin.patch b/queue-6.6/xfs-match-lock-mode-in-xfs_buffered_write_iomap_begin.patch new file mode 100644 index 00000000000..ab6d3ab9f22 --- /dev/null +++ b/queue-6.6/xfs-match-lock-mode-in-xfs_buffered_write_iomap_begin.patch @@ -0,0 +1,70 @@ +From stable+bounces-86412-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:13 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:17 -0700 +Subject: xfs: match lock mode in xfs_buffered_write_iomap_begin() +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-13-catherine.hoang@oracle.com> + +From: Zhang Yi + +commit bb712842a85d595525e72f0e378c143e620b3ea2 upstream. + +Commit 1aa91d9c9933 ("xfs: Add async buffered write support") replace +xfs_ilock(XFS_ILOCK_EXCL) with xfs_ilock_for_iomap() when locking the +writing inode, and a new variable lockmode is used to indicate the lock +mode. Although the lockmode should always be XFS_ILOCK_EXCL, it's still +better to use this variable instead of useing XFS_ILOCK_EXCL directly +when unlocking the inode. + +Fixes: 1aa91d9c9933 ("xfs: Add async buffered write support") +Signed-off-by: Zhang Yi +Reviewed-by: "Darrick J. Wong" +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1141,13 +1141,13 @@ retry: + * them out if the write happens to fail. + */ + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_iunlock(ip, lockmode); + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); + + found_imap: + seq = xfs_iomap_inode_sequence(ip, 0); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + + found_cow: +@@ -1157,17 +1157,17 @@ found_cow: + if (error) + goto out_unlock; + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED, seq); + } + + xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); +- xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_iunlock(ip, lockmode); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); + + out_unlock: +- xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ xfs_iunlock(ip, lockmode); + return error; + } + diff --git a/queue-6.6/xfs-remove-a-racy-if_bytes-check-in-xfs_reflink_end_cow_extent.patch b/queue-6.6/xfs-remove-a-racy-if_bytes-check-in-xfs_reflink_end_cow_extent.patch new file mode 100644 index 00000000000..23689068d9f --- /dev/null +++ b/queue-6.6/xfs-remove-a-racy-if_bytes-check-in-xfs_reflink_end_cow_extent.patch @@ -0,0 +1,41 @@ +From stable+bounces-86403-greg=kroah.com@vger.kernel.org Wed Oct 16 02:11:53 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:08 -0700 +Subject: xfs: remove a racy if_bytes check in xfs_reflink_end_cow_extent +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-4-catherine.hoang@oracle.com> + +From: Christoph Hellwig + 
+commit 86de848403abda05bf9c16dcdb6bef65a8d88c41 upstream. + +Accessing if_bytes without the ilock is racy. Remove the initial +if_bytes == 0 check in xfs_reflink_end_cow_extent and let +ext_iext_lookup_extent fail for this case after we've taken the ilock. + +Signed-off-by: Christoph Hellwig +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_reflink.c | 6 ------ + 1 file changed, 6 deletions(-) + +--- a/fs/xfs/xfs_reflink.c ++++ b/fs/xfs/xfs_reflink.c +@@ -716,12 +716,6 @@ xfs_reflink_end_cow_extent( + int nmaps; + int error; + +- /* No COW extents? That's easy! */ +- if (ifp->if_bytes == 0) { +- *offset_fsb = end_fsb; +- return 0; +- } +- + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE, &tp); diff --git a/queue-6.6/xfs-require-xfs_sb_feat_incompat_log_xattrs-for-attr-log-intent-item-recovery.patch b/queue-6.6/xfs-require-xfs_sb_feat_incompat_log_xattrs-for-attr-log-intent-item-recovery.patch new file mode 100644 index 00000000000..1c908370f7d --- /dev/null +++ b/queue-6.6/xfs-require-xfs_sb_feat_incompat_log_xattrs-for-attr-log-intent-item-recovery.patch @@ -0,0 +1,47 @@ +From stable+bounces-86404-greg=kroah.com@vger.kernel.org Wed Oct 16 02:11:53 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:09 -0700 +Subject: xfs: require XFS_SB_FEAT_INCOMPAT_LOG_XATTRS for attr log intent item recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-5-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 8ef1d96a985e4dc07ffbd71bd7fc5604a80cc644 upstream. + +The XFS_SB_FEAT_INCOMPAT_LOG_XATTRS feature bit protects a filesystem +from old kernels that do not know how to recover extended attribute log +intent items. Make this check mandatory instead of a debugging assert. 
+ +Fixes: fd920008784ea ("xfs: Set up infrastructure for log attribute replay") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_attr_item.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_attr_item.c ++++ b/fs/xfs/xfs_attr_item.c +@@ -510,6 +510,9 @@ xfs_attri_validate( + unsigned int op = attrp->alfi_op_flags & + XFS_ATTRI_OP_FLAGS_TYPE_MASK; + ++ if (!xfs_sb_version_haslogxattrs(&mp->m_sb)) ++ return false; ++ + if (attrp->__pad != 0) + return false; + +@@ -602,8 +605,6 @@ xfs_attri_item_recover( + args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT | + XFS_DA_OP_LOGGED; + +- ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb)); +- + switch (attr->xattri_op_flags) { + case XFS_ATTRI_OP_FLAGS_SET: + case XFS_ATTRI_OP_FLAGS_REPLACE: diff --git a/queue-6.6/xfs-restrict-when-we-try-to-align-cow-fork-delalloc-to-cowextsz-hints.patch b/queue-6.6/xfs-restrict-when-we-try-to-align-cow-fork-delalloc-to-cowextsz-hints.patch new file mode 100644 index 00000000000..25ddf0a6101 --- /dev/null +++ b/queue-6.6/xfs-restrict-when-we-try-to-align-cow-fork-delalloc-to-cowextsz-hints.patch @@ -0,0 +1,175 @@ +From stable+bounces-86421-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:49 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:25 -0700 +Subject: xfs: restrict when we try to align cow fork delalloc to cowextsz hints +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-21-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 288e1f693f04e66be99f27e7cbe4a45936a66745 upstream. + +xfs/205 produces the following failure when always_cow is enabled: + +# --- a/tests/xfs/205.out 2024-02-28 16:20:24.437887970 -0800 +# +++ b/tests/xfs/205.out.bad 2024-06-03 21:13:40.584000000 -0700 +# @@ -1,4 +1,5 @@ +# QA output created by 205 +# *** one file +# + !!! 
disk full (expected) +# *** one file, a few bytes at a time +# *** done + +This is the result of overly aggressive attempts to align cow fork +delalloc reservations to the CoW extent size hint. Looking at the trace +data, we're trying to append a single fsblock to the "fred" file. +Trying to create a speculative post-eof reservation fails because +there's not enough space. + +We then set @prealloc_blocks to zero and try again, but the cowextsz +alignment code triggers, which expands our request for a 1-fsblock +reservation into a 39-block reservation. There's not enough space for +that, so the whole write fails with ENOSPC even though there's +sufficient space in the filesystem to allocate the single block that we +need to land the write. + +There are two things wrong here -- first, we shouldn't be attempting +speculative preallocations beyond what was requested when we're low on +space. Second, if we've already computed a posteof preallocation, we +shouldn't bother trying to align that to the cowextsize hint. + +Fix both of these problems by adding a flag that only enables the +expansion of the delalloc reservation to the cowextsize if we're doing a +non-extending write, and only if we're not doing an ENOSPC retry. This +requires us to move the ENOSPC retry logic to xfs_bmapi_reserve_delalloc. + +I probably should have caught this six years ago when 6ca30729c206d was +being reviewed, but oh well. Update the comments to reflect what the +code does now. + +Fixes: 6ca30729c206d ("xfs: bmap code cleanup") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 31 +++++++++++++++++++++++++++---- + fs/xfs/xfs_iomap.c | 34 ++++++++++++---------------------- + 2 files changed, 39 insertions(+), 26 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -3974,20 +3974,32 @@ xfs_bmapi_reserve_delalloc( + xfs_extlen_t alen; + xfs_extlen_t indlen; + int error; +- xfs_fileoff_t aoff = off; ++ xfs_fileoff_t aoff; ++ bool use_cowextszhint = ++ whichfork == XFS_COW_FORK && !prealloc; + ++retry: + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. + */ ++ aoff = off; + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; + +- /* Figure out the extent size, adjust alen */ +- if (whichfork == XFS_COW_FORK) { ++ /* ++ * If we're targetting the COW fork but aren't creating a speculative ++ * posteof preallocation, try to expand the reservation to align with ++ * the COW extent size hint if there's sufficient free space. ++ * ++ * Unlike the data fork, the CoW cancellation functions will free all ++ * the reservations at inactivation, so we don't require that every ++ * delalloc reservation have a dirty pagecache. 
++ */ ++ if (use_cowextszhint) { + struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); + +@@ -4006,7 +4018,7 @@ xfs_bmapi_reserve_delalloc( + */ + error = xfs_quota_reserve_blkres(ip, alen); + if (error) +- return error; ++ goto out; + + /* + * Split changing sb for alen and indlen since they could be coming +@@ -4051,6 +4063,17 @@ out_unreserve_blocks: + out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_quota_unreserve_blkres(ip, alen); ++out: ++ if (error == -ENOSPC || error == -EDQUOT) { ++ trace_xfs_delalloc_enospc(ip, off, len); ++ ++ if (prealloc || use_cowextszhint) { ++ /* retry without any preallocation */ ++ use_cowextszhint = false; ++ prealloc = 0; ++ goto retry; ++ } ++ } + return error; + } + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1127,33 +1127,23 @@ xfs_buffered_write_iomap_begin( + } + } + +-retry: +- error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, +- end_fsb - offset_fsb, prealloc_blocks, +- allocfork == XFS_DATA_FORK ? &imap : &cmap, +- allocfork == XFS_DATA_FORK ? &icur : &ccur, +- allocfork == XFS_DATA_FORK ? eof : cow_eof); +- switch (error) { +- case 0: +- break; +- case -ENOSPC: +- case -EDQUOT: +- /* retry without any preallocation */ +- trace_xfs_delalloc_enospc(ip, offset, count); +- if (prealloc_blocks) { +- prealloc_blocks = 0; +- goto retry; +- } +- fallthrough; +- default: +- goto out_unlock; +- } +- + if (allocfork == XFS_COW_FORK) { ++ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, ++ end_fsb - offset_fsb, prealloc_blocks, &cmap, ++ &ccur, cow_eof); ++ if (error) ++ goto out_unlock; ++ + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); + goto found_cow; + } + ++ error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, ++ end_fsb - offset_fsb, prealloc_blocks, &imap, &icur, ++ eof); ++ if (error) ++ goto out_unlock; ++ + /* + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. 
diff --git a/queue-6.6/xfs-revert-commit-44af6c7e59b12.patch b/queue-6.6/xfs-revert-commit-44af6c7e59b12.patch new file mode 100644 index 00000000000..77f3df1fbb7 --- /dev/null +++ b/queue-6.6/xfs-revert-commit-44af6c7e59b12.patch @@ -0,0 +1,58 @@ +From stable+bounces-86411-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:14 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:15 -0700 +Subject: xfs: revert commit 44af6c7e59b12 +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-11-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 2a009397eb5ae178670cbd7101e9635cf6412b35 upstream. + +[backport: resolve conflicts due to new xattr walk helper] + +In my haste to fix what I thought was a performance problem in the attr +scrub code, I neglected to notice that the xfs_attr_get_ilocked also had +the effect of checking that attributes can actually be looked up through +the attr dabtree. Fix this. + +Fixes: 44af6c7e59b12 ("xfs: don't load local xattr values during scrub") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/attr.c | 13 +++++-------- + 1 file changed, 5 insertions(+), 8 deletions(-) + +--- a/fs/xfs/scrub/attr.c ++++ b/fs/xfs/scrub/attr.c +@@ -200,14 +200,6 @@ xchk_xattr_listent( + } + + /* +- * Local xattr values are stored in the attr leaf block, so we don't +- * need to retrieve the value from a remote block to detect corruption +- * problems. +- */ +- if (flags & XFS_ATTR_LOCAL) +- goto fail_xref; +- +- /* + * Try to allocate enough memory to extrat the attr value. If that + * doesn't work, we overload the seen_enough variable to convey + * the error message back to the main scrub function. 
+@@ -222,6 +214,11 @@ xchk_xattr_listent( + + args.value = ab->value; + ++ /* ++ * Get the attr value to ensure that lookup can find this attribute ++ * through the dabtree indexing and that remote value retrieval also ++ * works correctly. ++ */ + error = xfs_attr_get_ilocked(&args); + /* ENODATA means the hash lookup failed and the attr is bad */ + if (error == -ENODATA) diff --git a/queue-6.6/xfs-use-dontcache-for-grabbing-inodes-during-scrub.patch b/queue-6.6/xfs-use-dontcache-for-grabbing-inodes-during-scrub.patch new file mode 100644 index 00000000000..0486baa884b --- /dev/null +++ b/queue-6.6/xfs-use-dontcache-for-grabbing-inodes-during-scrub.patch @@ -0,0 +1,110 @@ +From stable+bounces-86414-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:23 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:16 -0700 +Subject: xfs: use dontcache for grabbing inodes during scrub +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-12-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit b27ce0da60a523fc32e3795f96b2de5490642235 upstream. + +[backport: resolve conflict due to missing iscan.c] + +Back when I wrote commit a03297a0ca9f2, I had thought that we'd be doing +users a favor by only marking inodes dontcache at the end of a scrub +operation, and only if there's only one reference to that inode. This +was more or less true back when I_DONTCACHE was an XFS iflag and the +only thing it did was change the outcome of xfs_fs_drop_inode to 1. + +Note: If there are dentries pointing to the inode when scrub finishes, +the inode will have positive i_count and stay around in cache until +dentry reclaim. + +But now we have d_mark_dontcache, which cause the inode *and* the +dentries attached to it all to be marked I_DONTCACHE, which means that +we drop the dentries ASAP, which drops the inode ASAP. 
+ +This is bad if scrub found problems with the inode, because now they can +be scheduled for inactivation, which can cause inodegc to trip on it and +shut down the filesystem. + +Even if the inode isn't bad, this is still suboptimal because phases 3-7 +each initiate inode scans. Dropping the inode immediately during phase +3 is silly because phase 5 will reload it and drop it immediately, etc. +It's fine to mark the inodes dontcache, but if there have been accesses +to the file that set up dentries, we should keep them. + +I validated this by setting up ftrace to capture xfs_iget_recycle* +tracepoints and ran xfs/285 for 30 seconds. With current djwong-wtf I +saw ~30,000 recycle events. I then dropped the d_mark_dontcache calls +and set XFS_IGET_DONTCACHE, and the recycle events dropped to ~5,000 per +30 seconds. + +Therefore, grab the inode with XFS_IGET_DONTCACHE, which only has the +effect of setting I_DONTCACHE for cache misses. Remove the +d_mark_dontcache call that can happen in xchk_irele. + +Fixes: a03297a0ca9f2 ("xfs: manage inode DONTCACHE status at irele time") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/common.c | 12 +++--------- + fs/xfs/scrub/scrub.h | 7 +++++++ + 2 files changed, 10 insertions(+), 9 deletions(-) + +--- a/fs/xfs/scrub/common.c ++++ b/fs/xfs/scrub/common.c +@@ -735,7 +735,7 @@ xchk_iget( + { + ASSERT(sc->tp != NULL); + +- return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp); ++ return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp); + } + + /* +@@ -786,8 +786,8 @@ again: + if (error) + return error; + +- error = xfs_iget(mp, tp, inum, +- XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp); ++ error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0, ++ ipp); + if (error == -EAGAIN) { + /* + * The inode may be in core but temporarily unavailable and may +@@ -994,12 +994,6 @@ xchk_irele( + spin_lock(&VFS_I(ip)->i_lock); + VFS_I(ip)->i_state &= ~I_DONTCACHE; + spin_unlock(&VFS_I(ip)->i_lock); +- } else if (atomic_read(&VFS_I(ip)->i_count) == 1) { +- /* +- * If this is the last reference to the inode and the caller +- * permits it, set DONTCACHE to avoid thrashing. +- */ +- d_mark_dontcache(VFS_I(ip)); + } + + xfs_irele(ip); +--- a/fs/xfs/scrub/scrub.h ++++ b/fs/xfs/scrub/scrub.h +@@ -17,6 +17,13 @@ struct xfs_scrub; + #define XCHK_GFP_FLAGS ((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \ + __GFP_RETRY_MAYFAIL)) + ++/* ++ * For opening files by handle for fsck operations, we don't trust the inumber ++ * or the allocation state; therefore, perform an untrusted lookup. We don't ++ * want these inodes to pollute the cache, so mark them for immediate removal. ++ */ ++#define XCHK_IGET_FLAGS (XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE) ++ + /* Type info and names for the scrub types. 
*/ + enum xchk_type { + ST_NONE = 1, /* disabled */ diff --git a/queue-6.6/xfs-validate-recovered-name-buffers-when-recovering-xattr-items.patch b/queue-6.6/xfs-validate-recovered-name-buffers-when-recovering-xattr-items.patch new file mode 100644 index 00000000000..6b5bc8299de --- /dev/null +++ b/queue-6.6/xfs-validate-recovered-name-buffers-when-recovering-xattr-items.patch @@ -0,0 +1,129 @@ +From stable+bounces-86407-greg=kroah.com@vger.kernel.org Wed Oct 16 02:12:01 2024 +From: Catherine Hoang +Date: Tue, 15 Oct 2024 17:11:13 -0700 +Subject: xfs: validate recovered name buffers when recovering xattr items +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20241016001126.3256-9-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit 1c7f09d210aba2f2bb206e2e8c97c9f11a3fd880 upstream. + +Strengthen the xattri log item recovery code by checking that we +actually have the required name and newname buffers for whatever +operation we're replaying. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_attr_item.c | 58 +++++++++++++++++++++++++++++++++++++++---------- + 1 file changed, 47 insertions(+), 11 deletions(-) + +--- a/fs/xfs/xfs_attr_item.c ++++ b/fs/xfs/xfs_attr_item.c +@@ -719,22 +719,20 @@ xlog_recover_attri_commit_pass2( + const void *attr_value = NULL; + const void *attr_name; + size_t len; +- unsigned int op; +- +- attri_formatp = item->ri_buf[0].i_addr; +- attr_name = item->ri_buf[1].i_addr; ++ unsigned int op, i = 0; + + /* Validate xfs_attri_log_format before the large memory allocation */ + len = sizeof(struct xfs_attri_log_format); +- if (item->ri_buf[0].i_len != len) { ++ if (item->ri_buf[i].i_len != len) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + ++ attri_formatp = item->ri_buf[i].i_addr; + if (!xfs_attri_validate(mp, attri_formatp)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +- item->ri_buf[0].i_addr, item->ri_buf[0].i_len); ++ attri_formatp, len); + return -EFSCORRUPTED; + } + +@@ -763,31 +761,69 @@ xlog_recover_attri_commit_pass2( + attri_formatp, len); + return -EFSCORRUPTED; + } ++ i++; + + /* Validate the attr name */ +- if (item->ri_buf[1].i_len != ++ if (item->ri_buf[i].i_len != + xlog_calc_iovec_len(attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +- item->ri_buf[0].i_addr, item->ri_buf[0].i_len); ++ attri_formatp, len); + return -EFSCORRUPTED; + } + ++ attr_name = item->ri_buf[i].i_addr; + if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, +- item->ri_buf[1].i_addr, item->ri_buf[1].i_len); ++ attri_formatp, len); + return -EFSCORRUPTED; + } ++ i++; + + /* Validate the attr value, if present */ + if (attri_formatp->alfi_value_len != 0) { +- if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { ++ if (item->ri_buf[i].i_len != 
xlog_calc_iovec_len(attri_formatp->alfi_value_len)) { + XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, + item->ri_buf[0].i_addr, + item->ri_buf[0].i_len); + return -EFSCORRUPTED; + } + +- attr_value = item->ri_buf[2].i_addr; ++ attr_value = item->ri_buf[i].i_addr; ++ i++; ++ } ++ ++ /* ++ * Make sure we got the correct number of buffers for the operation ++ * that we just loaded. ++ */ ++ if (i != item->ri_total) { ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, ++ attri_formatp, len); ++ return -EFSCORRUPTED; ++ } ++ ++ switch (op) { ++ case XFS_ATTRI_OP_FLAGS_REMOVE: ++ /* Regular remove operations operate only on names. */ ++ if (attr_value != NULL || attri_formatp->alfi_value_len != 0) { ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, ++ attri_formatp, len); ++ return -EFSCORRUPTED; ++ } ++ fallthrough; ++ case XFS_ATTRI_OP_FLAGS_SET: ++ case XFS_ATTRI_OP_FLAGS_REPLACE: ++ /* ++ * Regular xattr set/remove/replace operations require a name ++ * and do not take a newname. Values are optional for set and ++ * replace. ++ */ ++ if (attr_name == NULL || attri_formatp->alfi_name_len == 0) { ++ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, ++ attri_formatp, len); ++ return -EFSCORRUPTED; ++ } ++ break; + } + + /*