From: Greg Kroah-Hartman Date: Wed, 19 Jun 2024 08:43:59 +0000 (+0200) Subject: 6.6-stable patches X-Git-Tag: v6.1.95~60 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=c94270a510561e864561fc2e73ad2bb51d7684cd;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: xfs-allow-cross-linking-special-files-without-project-quota.patch xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch xfs-don-t-use-current-journal_info.patch xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch xfs-fix-scrub-stats-file-permissions.patch xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch xfs-shrink-failure-needs-to-hold-agi-buffer.patch --- diff --git a/queue-6.6/series b/queue-6.6/series index 95420a18c2d..c98ef3d30ed 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -238,3 +238,11 @@ btrfs-zoned-factor-out-per-zone-logic-from-btrfs_load_block_group_zone_info.patc btrfs-zoned-factor-out-single-bg-handling-from-btrfs_load_block_group_zone_info.patch btrfs-zoned-factor-out-dup-bg-handling-from-btrfs_load_block_group_zone_info.patch btrfs-zoned-fix-use-after-free-due-to-race-with-dev-replace.patch +xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch +xfs-fix-scrub-stats-file-permissions.patch +xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch +xfs-shrink-failure-needs-to-hold-agi-buffer.patch +xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch +xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch +xfs-don-t-use-current-journal_info.patch +xfs-allow-cross-linking-special-files-without-project-quota.patch diff --git a/queue-6.6/xfs-allow-cross-linking-special-files-without-project-quota.patch b/queue-6.6/xfs-allow-cross-linking-special-files-without-project-quota.patch new file mode 100644 index 00000000000..651029cfe16 --- /dev/null +++ 
b/queue-6.6/xfs-allow-cross-linking-special-files-without-project-quota.patch @@ -0,0 +1,73 @@ +From stable+bounces-52621-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:33 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:55 -0700 +Subject: xfs: allow cross-linking special files without project quota +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-9-catherine.hoang@oracle.com> + +From: Andrey Albershteyn + +commit e23d7e82b707d1d0a627e334fb46370e4f772c11 upstream. + +There's an issue that if special files is created before quota +project is enabled, then it's not possible to link this file. This +works fine for normal files. This happens because xfs_quota skips +special files (no ioctls to set necessary flags). The check for +having the same project ID for source and destination then fails as +source file doesn't have any ID. + +mkfs.xfs -f /dev/sda +mount -o prjquota /dev/sda /mnt/test + +mkdir /mnt/test/foo +mkfifo /mnt/test/foo/fifo1 + +xfs_quota -xc "project -sp /mnt/test/foo 9" /mnt/test +> Setting up project 9 (path /mnt/test/foo)... +> xfs_quota: skipping special file /mnt/test/foo/fifo1 +> Processed 1 (/etc/projects and cmdline) paths for project 9 with recursion depth infinite (-1). + +ln /mnt/test/foo/fifo1 /mnt/test/foo/fifo1_link +> ln: failed to create hard link '/mnt/test/testdir/fifo1_link' => '/mnt/test/testdir/fifo1': Invalid cross-device link + +mkfifo /mnt/test/foo/fifo2 +ln /mnt/test/foo/fifo2 /mnt/test/foo/fifo2_link + +Fix this by allowing linking of special files to the project quota +if special files doesn't have any ID set (ID = 0). + +Signed-off-by: Andrey Albershteyn +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1239,8 +1239,19 @@ xfs_link( + */ + if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) && + tdp->i_projid != sip->i_projid)) { +- error = -EXDEV; +- goto error_return; ++ /* ++ * Project quota setup skips special files which can ++ * leave inodes in a PROJINHERIT directory without a ++ * project ID set. We need to allow links to be made ++ * to these "project-less" inodes because userspace ++ * expects them to succeed after project ID setup, ++ * but everything else should be rejected. ++ */ ++ if (!special_file(VFS_I(sip)->i_mode) || ++ sip->i_projid != 0) { ++ error = -EXDEV; ++ goto error_return; ++ } + } + + if (!resblks) { diff --git a/queue-6.6/xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch b/queue-6.6/xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch new file mode 100644 index 00000000000..89baa7c06b1 --- /dev/null +++ b/queue-6.6/xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch @@ -0,0 +1,170 @@ +From stable+bounces-52619-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:27 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:53 -0700 +Subject: xfs: allow sunit mount option to repair bad primary sb stripe values +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-7-catherine.hoang@oracle.com> + +From: Dave Chinner + +commit 15922f5dbf51dad334cde888ce6835d377678dc9 upstream. + +If a filesystem has a busted stripe alignment configuration on disk +(e.g. because broken RAID firmware told mkfs that swidth was smaller +than sunit), then the filesystem will refuse to mount due to the +stripe validation failing. 
This failure is triggering during distro +upgrades from old kernels lacking this check to newer kernels with +this check, and currently the only way to fix it is with offline +xfs_db surgery. + +This runtime validity checking occurs when we read the superblock +for the first time and causes the mount to fail immediately. This +prevents the rewrite of stripe unit/width via +mount options that occurs later in the mount process. Hence there is +no way to recover this situation without resorting to offline xfs_db +rewrite of the values. + +However, we parse the mount options long before we read the +superblock, and we know if the mount has been asked to re-write the +stripe alignment configuration when we are reading the superblock +and verifying it for the first time. Hence we can conditionally +ignore stripe verification failures if the mount options specified +will correct the issue. + +We validate that the new stripe unit/width are valid before we +overwrite the superblock values, so we can ignore the invalid config +at verification and fail the mount later if the new values are not +valid. This, at least, gives users the chance of correcting the +issue after a kernel upgrade without having to resort to xfs-db +hacks. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_sb.c | 40 +++++++++++++++++++++++++++++++--------- + fs/xfs/libxfs/xfs_sb.h | 5 +++-- + 2 files changed, 34 insertions(+), 11 deletions(-) + +--- a/fs/xfs/libxfs/xfs_sb.c ++++ b/fs/xfs/libxfs/xfs_sb.c +@@ -530,7 +530,8 @@ xfs_validate_sb_common( + } + + if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), +- XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) ++ XFS_FSB_TO_B(mp, sbp->sb_width), 0, ++ xfs_buf_daddr(bp) == XFS_SB_DADDR, false)) + return -EFSCORRUPTED; + + /* +@@ -1319,8 +1320,10 @@ xfs_sb_get_secondary( + } + + /* +- * sunit, swidth, sectorsize(optional with 0) should be all in bytes, +- * so users won't be confused by values in error messages. ++ * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users ++ * won't be confused by values in error messages. This function returns false ++ * if the stripe geometry is invalid and the caller is unable to repair the ++ * stripe configuration later in the mount process. 
+ */ + bool + xfs_validate_stripe_geometry( +@@ -1328,20 +1331,21 @@ xfs_validate_stripe_geometry( + __s64 sunit, + __s64 swidth, + int sectorsize, ++ bool may_repair, + bool silent) + { + if (swidth > INT_MAX) { + if (!silent) + xfs_notice(mp, + "stripe width (%lld) is too large", swidth); +- return false; ++ goto check_override; + } + + if (sunit > swidth) { + if (!silent) + xfs_notice(mp, + "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth); +- return false; ++ goto check_override; + } + + if (sectorsize && (int)sunit % sectorsize) { +@@ -1349,21 +1353,21 @@ xfs_validate_stripe_geometry( + xfs_notice(mp, + "stripe unit (%lld) must be a multiple of the sector size (%d)", + sunit, sectorsize); +- return false; ++ goto check_override; + } + + if (sunit && !swidth) { + if (!silent) + xfs_notice(mp, + "invalid stripe unit (%lld) and stripe width of 0", sunit); +- return false; ++ goto check_override; + } + + if (!sunit && swidth) { + if (!silent) + xfs_notice(mp, + "invalid stripe width (%lld) and stripe unit of 0", swidth); +- return false; ++ goto check_override; + } + + if (sunit && (int)swidth % (int)sunit) { +@@ -1371,9 +1375,27 @@ xfs_validate_stripe_geometry( + xfs_notice(mp, + "stripe width (%lld) must be a multiple of the stripe unit (%lld)", + swidth, sunit); +- return false; ++ goto check_override; + } + return true; ++ ++check_override: ++ if (!may_repair) ++ return false; ++ /* ++ * During mount, mp->m_dalign will not be set unless the sunit mount ++ * option was set. If it was set, ignore the bad stripe alignment values ++ * and allow the validation and overwrite later in the mount process to ++ * attempt to overwrite the bad stripe alignment values with the values ++ * supplied by mount options. 
++ */ ++ if (!mp->m_dalign) ++ return false; ++ if (!silent) ++ xfs_notice(mp, ++"Will try to correct with specified mount options sunit (%d) and swidth (%d)", ++ BBTOB(mp->m_dalign), BBTOB(mp->m_swidth)); ++ return true; + } + + /* +--- a/fs/xfs/libxfs/xfs_sb.h ++++ b/fs/xfs/libxfs/xfs_sb.h +@@ -35,8 +35,9 @@ extern int xfs_sb_get_secondary(struct x + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); + +-extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp, +- __s64 sunit, __s64 swidth, int sectorsize, bool silent); ++bool xfs_validate_stripe_geometry(struct xfs_mount *mp, ++ __s64 sunit, __s64 swidth, int sectorsize, bool may_repair, ++ bool silent); + + uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents); + diff --git a/queue-6.6/xfs-don-t-use-current-journal_info.patch b/queue-6.6/xfs-don-t-use-current-journal_info.patch new file mode 100644 index 00000000000..afb3902b489 --- /dev/null +++ b/queue-6.6/xfs-don-t-use-current-journal_info.patch @@ -0,0 +1,168 @@ +From stable+bounces-52620-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:32 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:54 -0700 +Subject: xfs: don't use current->journal_info +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-8-catherine.hoang@oracle.com> + +From: Dave Chinner + +commit f2e812c1522dab847912309b00abcc762dd696da upstream. + +syzbot reported an ext4 panic during a page fault where found a +journal handle when it didn't expect to find one. The structure +it tripped over had a value of 'TRAN' in the first entry in the +structure, and that indicates it tripped over a struct xfs_trans +instead of a jbd2 handle. + +The reason for this is that the page fault was taken during a +copy-out to a user buffer from an xfs bulkstat operation. 
XFS uses +an "empty" transaction context for bulkstat to do automated metadata +buffer cleanup, and so the transaction context is valid across the +copyout of the bulkstat info into the user buffer. + +We are using empty transaction contexts like this in XFS to reduce +the risk of failing to release objects we reference during the +operation, especially during error handling. Hence we really need to +ensure that we can take page faults from these contexts without +leaving landmines for the code processing the page fault to trip +over. + +However, this same behaviour could happen from any other filesystem +that triggers a page fault or any other exception that is handled +on-stack from within a task context that has current->journal_info +set. Having a page fault from some other filesystem bounce into XFS +where we have to run a transaction isn't a bug at all, but the usage +of current->journal_info means that this could result in corruption of +the outer task's journal_info structure. + +The problem is purely that we now have two different contexts that +now think they own current->journal_info. IOWs, no filesystem can +allow page faults or on-stack exceptions while current->journal_info +is set by the filesystem because the exception processing might use +current->journal_info itself. + +If we end up with nested XFS transactions whilst holding an empty +transaction, then it isn't an issue as the outer transaction does +not hold a log reservation. If we ignore the current->journal_info +usage, then the only problem that might occur is a deadlock if the +exception tries to take the same locks the upper context holds. +That, however, is not a problem that setting current->journal_info +would solve, so it's largely an irrelevant concern here. + +IOWs, we really only use current->journal_info for a warning check +in xfs_vm_writepages() to ensure we aren't doing writeback from a +transaction context. 
Writeback might need to do allocation, so it +can need to run transactions itself. Hence it's a debug check to +warn us that we've done something silly, and largely it is not all +that useful. + +So let's just remove all the use of current->journal_info in XFS and +get rid of all the potential issues from nested contexts where +current->journal_info might get misused by another filesystem +context. + +Reported-by: syzbot+cdee56dbcdf0096ef605@syzkaller.appspotmail.com +Signed-off-by: Dave Chinner +Reviewed-by: "Darrick J. Wong" +Reviewed-by: Mark Tinguely +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/common.c | 4 +--- + fs/xfs/xfs_aops.c | 7 ------- + fs/xfs/xfs_icache.c | 8 +++++--- + fs/xfs/xfs_trans.h | 9 +-------- + 4 files changed, 7 insertions(+), 21 deletions(-) + +--- a/fs/xfs/scrub/common.c ++++ b/fs/xfs/scrub/common.c +@@ -978,9 +978,7 @@ xchk_irele( + struct xfs_scrub *sc, + struct xfs_inode *ip) + { +- if (current->journal_info != NULL) { +- ASSERT(current->journal_info == sc->tp); +- ++ if (sc->tp) { + /* + * If we are in a transaction, we /cannot/ drop the inode + * ourselves, because the VFS will trigger writeback, which +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -502,13 +502,6 @@ xfs_vm_writepages( + { + struct xfs_writepage_ctx wpc = { }; + +- /* +- * Writing back data in a transaction context can result in recursive +- * transactions. This is bad, so issue a warning and get out of here. +- */ +- if (WARN_ON_ONCE(current->journal_info)) +- return 0; +- + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); + } +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -2031,8 +2031,10 @@ xfs_inodegc_want_queue_work( + * - Memory shrinkers queued the inactivation worker and it hasn't finished. 
+ * - The queue depth exceeds the maximum allowable percpu backlog. + * +- * Note: If the current thread is running a transaction, we don't ever want to +- * wait for other transactions because that could introduce a deadlock. ++ * Note: If we are in a NOFS context here (e.g. current thread is running a ++ * transaction) the we don't want to block here as inodegc progress may require ++ * filesystem resources we hold to make progress and that could result in a ++ * deadlock. Hence we skip out of here if we are in a scoped NOFS context. + */ + static inline bool + xfs_inodegc_want_flush_work( +@@ -2040,7 +2042,7 @@ xfs_inodegc_want_flush_work( + unsigned int items, + unsigned int shrinker_hits) + { +- if (current->journal_info) ++ if (current->flags & PF_MEMALLOC_NOFS) + return false; + + if (shrinker_hits > 0) +--- a/fs/xfs/xfs_trans.h ++++ b/fs/xfs/xfs_trans.h +@@ -277,19 +277,14 @@ static inline void + xfs_trans_set_context( + struct xfs_trans *tp) + { +- ASSERT(current->journal_info == NULL); + tp->t_pflags = memalloc_nofs_save(); +- current->journal_info = tp; + } + + static inline void + xfs_trans_clear_context( + struct xfs_trans *tp) + { +- if (current->journal_info == tp) { +- memalloc_nofs_restore(tp->t_pflags); +- current->journal_info = NULL; +- } ++ memalloc_nofs_restore(tp->t_pflags); + } + + static inline void +@@ -297,10 +292,8 @@ xfs_trans_switch_context( + struct xfs_trans *old_tp, + struct xfs_trans *new_tp) + { +- ASSERT(current->journal_info == old_tp); + new_tp->t_pflags = old_tp->t_pflags; + old_tp->t_pflags = 0; +- current->journal_info = new_tp; + } + + #endif /* __XFS_TRANS_H__ */ diff --git a/queue-6.6/xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch b/queue-6.6/xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch new file mode 100644 index 00000000000..25c11729e61 --- /dev/null +++ b/queue-6.6/xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch @@ -0,0 +1,175 @@ +From 
stable+bounces-52618-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:26 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:52 -0700 +Subject: xfs: ensure submit buffers on LSN boundaries in error handlers +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-6-catherine.hoang@oracle.com> + +From: Long Li + +commit e4c3b72a6ea93ed9c1815c74312eee9305638852 upstream. + +While performing the IO fault injection test, I caught the following data +corruption report: + + XFS (dm-0): Internal error ltbno + ltlen > bno at line 1957 of file fs/xfs/libxfs/xfs_alloc.c. Caller xfs_free_ag_extent+0x79c/0x1130 + CPU: 3 PID: 33 Comm: kworker/3:0 Not tainted 6.5.0-rc7-next-20230825-00001-g7f8666926889 #214 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 + Workqueue: xfs-inodegc/dm-0 xfs_inodegc_worker + Call Trace: + + dump_stack_lvl+0x50/0x70 + xfs_corruption_error+0x134/0x150 + xfs_free_ag_extent+0x7d3/0x1130 + __xfs_free_extent+0x201/0x3c0 + xfs_trans_free_extent+0x29b/0xa10 + xfs_extent_free_finish_item+0x2a/0xb0 + xfs_defer_finish_noroll+0x8d1/0x1b40 + xfs_defer_finish+0x21/0x200 + xfs_itruncate_extents_flags+0x1cb/0x650 + xfs_free_eofblocks+0x18f/0x250 + xfs_inactive+0x485/0x570 + xfs_inodegc_worker+0x207/0x530 + process_scheduled_works+0x24a/0xe10 + worker_thread+0x5ac/0xc60 + kthread+0x2cd/0x3c0 + ret_from_fork+0x4a/0x80 + ret_from_fork_asm+0x11/0x20 + + XFS (dm-0): Corruption detected. Unmount and run xfs_repair + +After analyzing the disk image, it was found that the corruption was +triggered by the fact that extent was recorded in both inode datafork +and AGF btree blocks. After a long time of reproduction and analysis, +we found that the reason of free space btree corruption was that the +AGF btree was not recovered correctly. 
+ +Consider the following situation, Checkpoint A and Checkpoint B are in +the same record and share the same start LSN1, buf items of same object +(AGF btree block) is included in both Checkpoint A and Checkpoint B. If +the buf item in Checkpoint A has been recovered and updates metadata LSN +permanently, then the buf item in Checkpoint B cannot be recovered, +because log recovery skips items with a metadata LSN >= the current LSN +of the recovery item. If there is still an inode item in Checkpoint B +that records the Extent X, the Extent X will be recorded in both inode +datafork and AGF btree block after Checkpoint B is recovered. Such +transaction can be seen when allocating an extent for inode bmap, it records +both the addition of extent to the inode extent list and the removing +extent from the AGF. + + |------------Record (LSN1)------------------|---Record (LSN2)---| + |-------Checkpoint A----------|----------Checkpoint B-----------| + | Buf Item(Extent X) | Buf Item / Inode item(Extent X) | + | Extent X is freed | Extent X is allocated | + +After commit 12818d24db8a ("xfs: rework log recovery to submit buffers +on LSN boundaries") was introduced, we submit buffers on lsn boundaries +during log recovery. The above problem can be avoided under normal paths, +but it's not guaranteed under abnormal paths. Consider the following +process, if an error was encountered after recover buf item in Checkpoint +A and before recover buf item in Checkpoint B, buffers that have been +added to the buffer_list will still be submitted, this violates the +submits rule on lsn boundaries. So buf item in Checkpoint B cannot be +recovered on the next mount due to current lsn of transaction equal to +metadata lsn on disk. The detailed process of the problem is as follows. + +First Mount: + + xlog_do_recovery_pass + error = xlog_recover_process + xlog_recover_process_data + xlog_recover_process_ophdr + xlog_recovery_process_trans + ... 
+ /* recover buf item in Checkpoint A */ + xlog_recover_buf_commit_pass2 + xlog_recover_do_reg_buffer + /* add buffer of agf btree block to buffer_list */ + xfs_buf_delwri_queue(bp, buffer_list) + ... + ==> Encounter read IO error and return + /* submit buffers regardless of error */ + if (!list_empty(&buffer_list)) + xfs_buf_delwri_submit(&buffer_list); + + + +Second Mount: + + xlog_do_recovery_pass + error = xlog_recover_process + xlog_recover_process_data + xlog_recover_process_ophdr + xlog_recovery_process_trans + ... + /* recover buf item in Checkpoint B */ + xlog_recover_buf_commit_pass2 + /* buffer of agf btree block wouldn't be added to + buffer_list due to lsn equal to current_lsn */ + if (XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release + + + +In order to make sure that submits buffers on lsn boundaries in the +abnormal paths, we need to check error status before submit buffers that +have been added from the last record processed. If error status exist, +buffers in the buffer_list should not be written to disk. + +Canceling the buffers in the buffer_list directly isn't correct, unlike +any other place where write list was canceled, these buffers has been +initialized by xfs_buf_item_init() during recovery and held by buf item, +buf items will not be released in xfs_buf_delwri_cancel(), it's not easy +to solve. + +If the filesystem has been shut down, then delwri list submission will +error out all buffers on the list via IO submission/completion and do +all the correct cleanup automatically. So shutting down the filesystem +could prevent buffers in the buffer_list from being written to disk. + +Fixes: 50d5c8d8e938 ("xfs: check LSN ordering for v5 superblocks during recovery") +Signed-off-by: Long Li +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log_recover.c | 23 ++++++++++++++++++++--- + 1 file changed, 20 insertions(+), 3 deletions(-) + +--- a/fs/xfs/xfs_log_recover.c ++++ b/fs/xfs/xfs_log_recover.c +@@ -3203,11 +3203,28 @@ xlog_do_recovery_pass( + kmem_free(hbp); + + /* +- * Submit buffers that have been added from the last record processed, +- * regardless of error status. ++ * Submit buffers that have been dirtied by the last record recovered. + */ +- if (!list_empty(&buffer_list)) ++ if (!list_empty(&buffer_list)) { ++ if (error) { ++ /* ++ * If there has been an item recovery error then we ++ * cannot allow partial checkpoint writeback to ++ * occur. We might have multiple checkpoints with the ++ * same start LSN in this buffer list, and partial ++ * writeback of a checkpoint in this situation can ++ * prevent future recovery of all the changes in the ++ * checkpoints at this start LSN. ++ * ++ * Note: Shutting down the filesystem will result in the ++ * delwri submission marking all the buffers stale, ++ * completing them and cleaning up _XBF_LOGRECOVERY ++ * state without doing any IO. ++ */ ++ xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); ++ } + error2 = xfs_buf_delwri_submit(&buffer_list); ++ } + + if (error && first_bad) + *first_bad = rhead_blk; diff --git a/queue-6.6/xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch b/queue-6.6/xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch new file mode 100644 index 00000000000..4162ff583c1 --- /dev/null +++ b/queue-6.6/xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch @@ -0,0 +1,54 @@ +From stable+bounces-52614-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:15 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:48 -0700 +Subject: xfs: fix imprecise logic in xchk_btree_check_block_owner +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-2-catherine.hoang@oracle.com> + +From: "Darrick J. 
Wong" + +commit c0afba9a8363f17d4efed22a8764df33389aebe8 upstream. + +A reviewer was confused by the init_sa logic in this function. Upon +checking the logic, I discovered that the code is imprecise. What we +want to do here is check that there is an ownership record in the rmap +btree for the AG that contains a btree block. + +For an inode-rooted btree (e.g. the bmbt) the per-AG btree cursors have +not been initialized because inode btrees can span multiple AGs. +Therefore, we must initialize the per-AG btree cursors in sc->sa before +proceeding. That is what init_sa controls, and hence the logic should +be gated on XFS_BTREE_ROOT_IN_INODE, not XFS_BTREE_LONG_PTRS. + +In practice, ROOT_IN_INODE and LONG_PTRS are coincident so this hasn't +mattered. However, we're about to refactor both of those flags into +separate btree_ops fields so we want the logic to make sense +afterwards. + +Fixes: 858333dcf021a ("xfs: check btree block ownership with bnobt/rmapbt when scrubbing btree") +Signed-off-by: Darrick J. Wong +Reviewed-by: Christoph Hellwig +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/btree.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +--- a/fs/xfs/scrub/btree.c ++++ b/fs/xfs/scrub/btree.c +@@ -385,7 +385,12 @@ xchk_btree_check_block_owner( + agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr); + agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr); + +- init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS; ++ /* ++ * If the btree being examined is not itself a per-AG btree, initialize ++ * sc->sa so that we can check for the presence of an ownership record ++ * in the rmap btree for the AG containing the block. 
++ */ ++ init_sa = bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE; + if (init_sa) { + error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa); + if (!xchk_btree_xref_process_error(bs->sc, bs->cur, diff --git a/queue-6.6/xfs-fix-scrub-stats-file-permissions.patch b/queue-6.6/xfs-fix-scrub-stats-file-permissions.patch new file mode 100644 index 00000000000..8951db7ab9d --- /dev/null +++ b/queue-6.6/xfs-fix-scrub-stats-file-permissions.patch @@ -0,0 +1,48 @@ +From stable+bounces-52615-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:17 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:49 -0700 +Subject: xfs: fix scrub stats file permissions +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-3-catherine.hoang@oracle.com> + +From: "Darrick J. Wong" + +commit e610e856b938a1fc86e7ee83ad2f39716082bca7 upstream. + +When the kernel is in lockdown mode, debugfs will only show files that +are world-readable and cannot be written, mmaped, or used with ioctl. +That more or less describes the scrub stats file, except that the +permissions are wrong -- they should be 0444, not 0644. You can't write +the stats file, so the 0200 makes no sense. + +Meanwhile, the clear_stats file is only writable, but it got mode 0400 +instead of 0200, which would make more sense. + +Fix both files so that they make sense. + +Fixes: d7a74cad8f451 ("xfs: track usage statistics of online fsck") +Signed-off-by: "Darrick J. Wong" +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/stats.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/xfs/scrub/stats.c ++++ b/fs/xfs/scrub/stats.c +@@ -329,9 +329,9 @@ xchk_stats_register( + if (!cs->cs_debugfs) + return; + +- debugfs_create_file("stats", 0644, cs->cs_debugfs, cs, ++ debugfs_create_file("stats", 0444, cs->cs_debugfs, cs, + &scrub_stats_fops); +- debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs, ++ debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs, + &clear_scrub_stats_fops); + } + diff --git a/queue-6.6/xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch b/queue-6.6/xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch new file mode 100644 index 00000000000..c13317cd25a --- /dev/null +++ b/queue-6.6/xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch @@ -0,0 +1,73 @@ +From stable+bounces-52616-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:18 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:50 -0700 +Subject: xfs: fix SEEK_HOLE/DATA for regions with active COW extents +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-4-catherine.hoang@oracle.com> + +From: Dave Chinner + +commit 4b2f459d86252619448455013f581836c8b1b7da upstream. + +A data corruption problem was reported by CoreOS image builders +when using reflink based disk image copies and then converting +them to qcow2 images. The converted images failed the conversion +verification step, and it was isolated down to the fact that +qemu-img uses SEEK_HOLE/SEEK_DATA to find the data it is supposed to +copy. + +The reproducer allowed me to isolate the issue down to a region of +the file that had overlapping data and COW fork extents, and the +problem was that the COW fork extent was being reported in its +entirety by xfs_seek_iomap_begin() and so skipping over the real +data fork extents in that range. 
+ +This was somewhat hidden by the fact that 'xfs_bmap -vvp' reported +all the extents correctly, and reading the file completely (i.e. not +using seek to skip holes) would map the file correctly and all the +correct data extents are read. Hence the problem is isolated to just +the xfs_seek_iomap_begin() implementation. + +Instrumentation with trace_printk made the problem obvious: we are +passing the wrong length to xfs_trim_extent() in +xfs_seek_iomap_begin(). We are passing the end_fsb, not the +maximum length of the extent we want to trim the map to. Hence the +COW extent map never gets trimmed to the start of the next data fork +extent, and so the seek code treats the entire COW fork extent as +unwritten and skips entirely over the data fork extents in that +range. + +Link: https://github.com/coreos/coreos-assembler/issues/3728 +Fixes: 60271ab79d40 ("xfs: fix SEEK_DATA for speculative COW fork preallocation") +Signed-off-by: Dave Chinner +Reviewed-by: "Darrick J. Wong" +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1323,7 +1323,7 @@ xfs_seek_iomap_begin( + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); +- xfs_trim_extent(&cmap, offset_fsb, end_fsb); ++ xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, + IOMAP_F_SHARED, seq); +@@ -1348,7 +1348,7 @@ xfs_seek_iomap_begin( + imap.br_state = XFS_EXT_NORM; + done: + seq = xfs_iomap_inode_sequence(ip, 0); +- xfs_trim_extent(&imap, offset_fsb, end_fsb); ++ xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); + error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + out_unlock: + xfs_iunlock(ip, lockmode); diff --git a/queue-6.6/xfs-shrink-failure-needs-to-hold-agi-buffer.patch b/queue-6.6/xfs-shrink-failure-needs-to-hold-agi-buffer.patch new file mode 100644 index 00000000000..327b5caaced --- /dev/null +++ b/queue-6.6/xfs-shrink-failure-needs-to-hold-agi-buffer.patch @@ -0,0 +1,87 @@ +From stable+bounces-52617-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:21 2024 +From: Catherine Hoang +Date: Mon, 17 Jun 2024 16:03:51 -0700 +Subject: xfs: shrink failure needs to hold AGI buffer +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org +Message-ID: <20240617230355.77091-5-catherine.hoang@oracle.com> + +From: Dave Chinner + +commit 75bcffbb9e7563259b7aed0fa77459d6a3a35627 upstream. + +Chandan reported a AGI/AGF lock order hang on xfs/168 during recent +testing. The cause of the problem was the task running xfs_growfs +to shrink the filesystem. A failure occurred trying to remove the +free space from the btrees that the shrink would make disappear, +and that meant it ran the error handling for a partial failure. 
+ +This error path involves restoring the per-ag block reservations, +and that requires calculating the amount of space needed to be +reserved for the free inode btree. The growfs operation hung here: + +[18679.536829] down+0x71/0xa0 +[18679.537657] xfs_buf_lock+0xa4/0x290 [xfs] +[18679.538731] xfs_buf_find_lock+0xf7/0x4d0 [xfs] +[18679.539920] xfs_buf_lookup.constprop.0+0x289/0x500 [xfs] +[18679.542628] xfs_buf_get_map+0x2b3/0xe40 [xfs] +[18679.547076] xfs_buf_read_map+0xbb/0x900 [xfs] +[18679.562616] xfs_trans_read_buf_map+0x449/0xb10 [xfs] +[18679.569778] xfs_read_agi+0x1cd/0x500 [xfs] +[18679.573126] xfs_ialloc_read_agi+0xc2/0x5b0 [xfs] +[18679.578708] xfs_finobt_calc_reserves+0xe7/0x4d0 [xfs] +[18679.582480] xfs_ag_resv_init+0x2c5/0x490 [xfs] +[18679.586023] xfs_ag_shrink_space+0x736/0xd30 [xfs] +[18679.590730] xfs_growfs_data_private.isra.0+0x55e/0x990 [xfs] +[18679.599764] xfs_growfs_data+0x2f1/0x410 [xfs] +[18679.602212] xfs_file_ioctl+0xd1e/0x1370 [xfs] + +trying to get the AGI lock. The AGI lock was held by an fsstress task +trying to do an inode allocation, and it was waiting on the AGF +lock to allocate a new inode chunk on disk. Hence deadlock. + +The fix for this is for the growfs code to hold the AGI over the +transaction roll it does in the error path. It already holds the AGF +locked across this, and that is what causes the lock order inversion +in the xfs_ag_resv_init() call. + +Reported-by: Chandan Babu R +Fixes: 46141dc891f7 ("xfs: introduce xfs_ag_shrink_space()") +Signed-off-by: Dave Chinner +Reviewed-by: Gao Xiang +Reviewed-by: Christoph Hellwig +Signed-off-by: Chandan Babu R +Signed-off-by: Catherine Hoang +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_ag.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_ag.c ++++ b/fs/xfs/libxfs/xfs_ag.c +@@ -979,14 +979,23 @@ xfs_ag_shrink_space( + + if (error) { + /* +- * if extent allocation fails, need to roll the transaction to ++ * If extent allocation fails, need to roll the transaction to + * ensure that the AGFL fixup has been committed anyway. ++ * ++ * We need to hold the AGF across the roll to ensure nothing can ++ * access the AG for allocation until the shrink is fully ++ * cleaned up. And due to the resetting of the AG block ++ * reservation space needing to lock the AGI, we also have to ++ * hold that so we don't get AGI/AGF lock order inversions in ++ * the error handling path. + */ + xfs_trans_bhold(*tpp, agfbp); ++ xfs_trans_bhold(*tpp, agibp); + err2 = xfs_trans_roll(tpp); + if (err2) + return err2; + xfs_trans_bjoin(*tpp, agfbp); ++ xfs_trans_bjoin(*tpp, agibp); + goto resv_init_out; + } +