]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 19 Jun 2024 08:43:59 +0000 (10:43 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 19 Jun 2024 08:43:59 +0000 (10:43 +0200)
added patches:
xfs-allow-cross-linking-special-files-without-project-quota.patch
xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch
xfs-don-t-use-current-journal_info.patch
xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch
xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch
xfs-fix-scrub-stats-file-permissions.patch
xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch
xfs-shrink-failure-needs-to-hold-agi-buffer.patch

queue-6.6/series
queue-6.6/xfs-allow-cross-linking-special-files-without-project-quota.patch [new file with mode: 0644]
queue-6.6/xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch [new file with mode: 0644]
queue-6.6/xfs-don-t-use-current-journal_info.patch [new file with mode: 0644]
queue-6.6/xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch [new file with mode: 0644]
queue-6.6/xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch [new file with mode: 0644]
queue-6.6/xfs-fix-scrub-stats-file-permissions.patch [new file with mode: 0644]
queue-6.6/xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch [new file with mode: 0644]
queue-6.6/xfs-shrink-failure-needs-to-hold-agi-buffer.patch [new file with mode: 0644]

index 95420a18c2d9d67bdd976b2a078560ab54e1be02..c98ef3d30ed74542e8034535a08c319b926cdf4c 100644 (file)
@@ -238,3 +238,11 @@ btrfs-zoned-factor-out-per-zone-logic-from-btrfs_load_block_group_zone_info.patc
 btrfs-zoned-factor-out-single-bg-handling-from-btrfs_load_block_group_zone_info.patch
 btrfs-zoned-factor-out-dup-bg-handling-from-btrfs_load_block_group_zone_info.patch
 btrfs-zoned-fix-use-after-free-due-to-race-with-dev-replace.patch
+xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch
+xfs-fix-scrub-stats-file-permissions.patch
+xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch
+xfs-shrink-failure-needs-to-hold-agi-buffer.patch
+xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch
+xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch
+xfs-don-t-use-current-journal_info.patch
+xfs-allow-cross-linking-special-files-without-project-quota.patch
diff --git a/queue-6.6/xfs-allow-cross-linking-special-files-without-project-quota.patch b/queue-6.6/xfs-allow-cross-linking-special-files-without-project-quota.patch
new file mode 100644 (file)
index 0000000..651029c
--- /dev/null
@@ -0,0 +1,73 @@
+From stable+bounces-52621-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:33 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:55 -0700
+Subject: xfs: allow cross-linking special files without project quota
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-9-catherine.hoang@oracle.com>
+
+From: Andrey Albershteyn <aalbersh@redhat.com>
+
+commit e23d7e82b707d1d0a627e334fb46370e4f772c11 upstream.
+
+There's an issue where, if a special file is created before the quota
+project is enabled, then it's not possible to link this file. This
+works fine for normal files. This happens because xfs_quota skips
+special files (no ioctls to set necessary flags). The check for
+having the same project ID for source and destination then fails as
+source file doesn't have any ID.
+
+mkfs.xfs -f /dev/sda
+mount -o prjquota /dev/sda /mnt/test
+
+mkdir /mnt/test/foo
+mkfifo /mnt/test/foo/fifo1
+
+xfs_quota -xc "project -sp /mnt/test/foo 9" /mnt/test
+> Setting up project 9 (path /mnt/test/foo)...
+> xfs_quota: skipping special file /mnt/test/foo/fifo1
+> Processed 1 (/etc/projects and cmdline) paths for project 9 with recursion depth infinite (-1).
+
+ln /mnt/test/foo/fifo1 /mnt/test/foo/fifo1_link
+> ln: failed to create hard link '/mnt/test/foo/fifo1_link' => '/mnt/test/foo/fifo1': Invalid cross-device link
+
+mkfifo /mnt/test/foo/fifo2
+ln /mnt/test/foo/fifo2 /mnt/test/foo/fifo2_link
+
+Fix this by allowing linking of special files to the project quota
+if the special file doesn't have any ID set (ID = 0).
+
+Signed-off-by: Andrey Albershteyn <aalbersh@redhat.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c |   15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1239,8 +1239,19 @@ xfs_link(
+        */
+       if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
+                    tdp->i_projid != sip->i_projid)) {
+-              error = -EXDEV;
+-              goto error_return;
++              /*
++               * Project quota setup skips special files which can
++               * leave inodes in a PROJINHERIT directory without a
++               * project ID set. We need to allow links to be made
++               * to these "project-less" inodes because userspace
++               * expects them to succeed after project ID setup,
++               * but everything else should be rejected.
++               */
++              if (!special_file(VFS_I(sip)->i_mode) ||
++                  sip->i_projid != 0) {
++                      error = -EXDEV;
++                      goto error_return;
++              }
+       }
+       if (!resblks) {
diff --git a/queue-6.6/xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch b/queue-6.6/xfs-allow-sunit-mount-option-to-repair-bad-primary-sb-stripe-values.patch
new file mode 100644 (file)
index 0000000..89baa7c
--- /dev/null
@@ -0,0 +1,170 @@
+From stable+bounces-52619-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:27 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:53 -0700
+Subject: xfs: allow sunit mount option to repair bad primary sb stripe values
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-7-catherine.hoang@oracle.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 15922f5dbf51dad334cde888ce6835d377678dc9 upstream.
+
+If a filesystem has a busted stripe alignment configuration on disk
+(e.g. because broken RAID firmware told mkfs that swidth was smaller
+than sunit), then the filesystem will refuse to mount due to the
+stripe validation failing. This failure is triggering during distro
+upgrades from old kernels lacking this check to newer kernels with
+this check, and currently the only way to fix it is with offline
+xfs_db surgery.
+
+This runtime validity checking occurs when we read the superblock
+for the first time and causes the mount to fail immediately. This
+prevents the rewrite of stripe unit/width via
+mount options that occurs later in the mount process. Hence there is
+no way to recover this situation without resorting to offline xfs_db
+rewrite of the values.
+
+However, we parse the mount options long before we read the
+superblock, and we know if the mount has been asked to re-write the
+stripe alignment configuration when we are reading the superblock
+and verifying it for the first time. Hence we can conditionally
+ignore stripe verification failures if the mount options specified
+will correct the issue.
+
+We validate that the new stripe unit/width are valid before we
+overwrite the superblock values, so we can ignore the invalid config
+at verification and fail the mount later if the new values are not
+valid. This, at least, gives users the chance of correcting the
+issue after a kernel upgrade without having to resort to xfs-db
+hacks.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_sb.c |   40 +++++++++++++++++++++++++++++++---------
+ fs/xfs/libxfs/xfs_sb.h |    5 +++--
+ 2 files changed, 34 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_sb.c
++++ b/fs/xfs/libxfs/xfs_sb.c
+@@ -530,7 +530,8 @@ xfs_validate_sb_common(
+       }
+       if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
+-                      XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
++                      XFS_FSB_TO_B(mp, sbp->sb_width), 0,
++                      xfs_buf_daddr(bp) == XFS_SB_DADDR, false))
+               return -EFSCORRUPTED;
+       /*
+@@ -1319,8 +1320,10 @@ xfs_sb_get_secondary(
+ }
+ /*
+- * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
+- * so users won't be confused by values in error messages.
++ * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users
++ * won't be confused by values in error messages.  This function returns false
++ * if the stripe geometry is invalid and the caller is unable to repair the
++ * stripe configuration later in the mount process.
+  */
+ bool
+ xfs_validate_stripe_geometry(
+@@ -1328,20 +1331,21 @@ xfs_validate_stripe_geometry(
+       __s64                   sunit,
+       __s64                   swidth,
+       int                     sectorsize,
++      bool                    may_repair,
+       bool                    silent)
+ {
+       if (swidth > INT_MAX) {
+               if (!silent)
+                       xfs_notice(mp,
+ "stripe width (%lld) is too large", swidth);
+-              return false;
++              goto check_override;
+       }
+       if (sunit > swidth) {
+               if (!silent)
+                       xfs_notice(mp,
+ "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
+-              return false;
++              goto check_override;
+       }
+       if (sectorsize && (int)sunit % sectorsize) {
+@@ -1349,21 +1353,21 @@ xfs_validate_stripe_geometry(
+                       xfs_notice(mp,
+ "stripe unit (%lld) must be a multiple of the sector size (%d)",
+                                  sunit, sectorsize);
+-              return false;
++              goto check_override;
+       }
+       if (sunit && !swidth) {
+               if (!silent)
+                       xfs_notice(mp,
+ "invalid stripe unit (%lld) and stripe width of 0", sunit);
+-              return false;
++              goto check_override;
+       }
+       if (!sunit && swidth) {
+               if (!silent)
+                       xfs_notice(mp,
+ "invalid stripe width (%lld) and stripe unit of 0", swidth);
+-              return false;
++              goto check_override;
+       }
+       if (sunit && (int)swidth % (int)sunit) {
+@@ -1371,9 +1375,27 @@ xfs_validate_stripe_geometry(
+                       xfs_notice(mp,
+ "stripe width (%lld) must be a multiple of the stripe unit (%lld)",
+                                  swidth, sunit);
+-              return false;
++              goto check_override;
+       }
+       return true;
++
++check_override:
++      if (!may_repair)
++              return false;
++      /*
++       * During mount, mp->m_dalign will not be set unless the sunit mount
++       * option was set. If it was set, ignore the bad stripe alignment values
++       * and allow the validation and overwrite later in the mount process to
++       * attempt to overwrite the bad stripe alignment values with the values
++       * supplied by mount options.
++       */
++      if (!mp->m_dalign)
++              return false;
++      if (!silent)
++              xfs_notice(mp,
++"Will try to correct with specified mount options sunit (%d) and swidth (%d)",
++                      BBTOB(mp->m_dalign), BBTOB(mp->m_swidth));
++      return true;
+ }
+ /*
+--- a/fs/xfs/libxfs/xfs_sb.h
++++ b/fs/xfs/libxfs/xfs_sb.h
+@@ -35,8 +35,9 @@ extern int   xfs_sb_get_secondary(struct x
+                               struct xfs_trans *tp, xfs_agnumber_t agno,
+                               struct xfs_buf **bpp);
+-extern bool   xfs_validate_stripe_geometry(struct xfs_mount *mp,
+-              __s64 sunit, __s64 swidth, int sectorsize, bool silent);
++bool  xfs_validate_stripe_geometry(struct xfs_mount *mp,
++              __s64 sunit, __s64 swidth, int sectorsize, bool may_repair,
++              bool silent);
+ uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
diff --git a/queue-6.6/xfs-don-t-use-current-journal_info.patch b/queue-6.6/xfs-don-t-use-current-journal_info.patch
new file mode 100644 (file)
index 0000000..afb3902
--- /dev/null
@@ -0,0 +1,168 @@
+From stable+bounces-52620-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:32 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:54 -0700
+Subject: xfs: don't use current->journal_info
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-8-catherine.hoang@oracle.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit f2e812c1522dab847912309b00abcc762dd696da upstream.
+
+syzbot reported an ext4 panic during a page fault where it found a
+journal handle when it didn't expect to find one. The structure
+it tripped over had a value of 'TRAN' in the first entry in the
+structure, and that indicates it tripped over a struct xfs_trans
+instead of a jbd2 handle.
+
+The reason for this is that the page fault was taken during a
+copy-out to a user buffer from an xfs bulkstat operation. XFS uses
+an "empty" transaction context for bulkstat to do automated metadata
+buffer cleanup, and so the transaction context is valid across the
+copyout of the bulkstat info into the user buffer.
+
+We are using empty transaction contexts like this in XFS to reduce
+the risk of failing to release objects we reference during the
+operation, especially during error handling. Hence we really need to
+ensure that we can take page faults from these contexts without
+leaving landmines for the code processing the page fault to trip
+over.
+
+However, this same behaviour could happen from any other filesystem
+that triggers a page fault or any other exception that is handled
+on-stack from within a task context that has current->journal_info
+set.  Having a page fault from some other filesystem bounce into XFS
+where we have to run a transaction isn't a bug at all, but the usage
+of current->journal_info means that this could result in corruption of
+the outer task's journal_info structure.
+
+The problem is purely that we now have two different contexts that
+now think they own current->journal_info. IOWs, no filesystem can
+allow page faults or on-stack exceptions while current->journal_info
+is set by the filesystem because the exception processing might use
+current->journal_info itself.
+
+If we end up with nested XFS transactions whilst holding an empty
+transaction, then it isn't an issue as the outer transaction does
+not hold a log reservation. If we ignore the current->journal_info
+usage, then the only problem that might occur is a deadlock if the
+exception tries to take the same locks the upper context holds.
+That, however, is not a problem that setting current->journal_info
+would solve, so it's largely an irrelevant concern here.
+
+IOWs, we really only use current->journal_info for a warning check
+in xfs_vm_writepages() to ensure we aren't doing writeback from a
+transaction context. Writeback might need to do allocation, so it
+can need to run transactions itself. Hence it's a debug check to
+warn us that we've done something silly, and largely it is not all
+that useful.
+
+So let's just remove all the use of current->journal_info in XFS and
+get rid of all the potential issues from nested contexts where
+current->journal_info might get misused by another filesystem
+context.
+
+Reported-by: syzbot+cdee56dbcdf0096ef605@syzkaller.appspotmail.com
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Reviewed-by: Mark Tinguely <mark.tinguely@oracle.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/scrub/common.c |    4 +---
+ fs/xfs/xfs_aops.c     |    7 -------
+ fs/xfs/xfs_icache.c   |    8 +++++---
+ fs/xfs/xfs_trans.h    |    9 +--------
+ 4 files changed, 7 insertions(+), 21 deletions(-)
+
+--- a/fs/xfs/scrub/common.c
++++ b/fs/xfs/scrub/common.c
+@@ -978,9 +978,7 @@ xchk_irele(
+       struct xfs_scrub        *sc,
+       struct xfs_inode        *ip)
+ {
+-      if (current->journal_info != NULL) {
+-              ASSERT(current->journal_info == sc->tp);
+-
++      if (sc->tp) {
+               /*
+                * If we are in a transaction, we /cannot/ drop the inode
+                * ourselves, because the VFS will trigger writeback, which
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -502,13 +502,6 @@ xfs_vm_writepages(
+ {
+       struct xfs_writepage_ctx wpc = { };
+-      /*
+-       * Writing back data in a transaction context can result in recursive
+-       * transactions. This is bad, so issue a warning and get out of here.
+-       */
+-      if (WARN_ON_ONCE(current->journal_info))
+-              return 0;
+-
+       xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+       return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
+ }
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -2031,8 +2031,10 @@ xfs_inodegc_want_queue_work(
+  *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
+  *  - The queue depth exceeds the maximum allowable percpu backlog.
+  *
+- * Note: If the current thread is running a transaction, we don't ever want to
+- * wait for other transactions because that could introduce a deadlock.
++ * Note: If we are in a NOFS context here (e.g. current thread is running a
++ * transaction) the we don't want to block here as inodegc progress may require
++ * filesystem resources we hold to make progress and that could result in a
++ * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
+  */
+ static inline bool
+ xfs_inodegc_want_flush_work(
+@@ -2040,7 +2042,7 @@ xfs_inodegc_want_flush_work(
+       unsigned int            items,
+       unsigned int            shrinker_hits)
+ {
+-      if (current->journal_info)
++      if (current->flags & PF_MEMALLOC_NOFS)
+               return false;
+       if (shrinker_hits > 0)
+--- a/fs/xfs/xfs_trans.h
++++ b/fs/xfs/xfs_trans.h
+@@ -277,19 +277,14 @@ static inline void
+ xfs_trans_set_context(
+       struct xfs_trans        *tp)
+ {
+-      ASSERT(current->journal_info == NULL);
+       tp->t_pflags = memalloc_nofs_save();
+-      current->journal_info = tp;
+ }
+ static inline void
+ xfs_trans_clear_context(
+       struct xfs_trans        *tp)
+ {
+-      if (current->journal_info == tp) {
+-              memalloc_nofs_restore(tp->t_pflags);
+-              current->journal_info = NULL;
+-      }
++      memalloc_nofs_restore(tp->t_pflags);
+ }
+ static inline void
+@@ -297,10 +292,8 @@ xfs_trans_switch_context(
+       struct xfs_trans        *old_tp,
+       struct xfs_trans        *new_tp)
+ {
+-      ASSERT(current->journal_info == old_tp);
+       new_tp->t_pflags = old_tp->t_pflags;
+       old_tp->t_pflags = 0;
+-      current->journal_info = new_tp;
+ }
+ #endif        /* __XFS_TRANS_H__ */
diff --git a/queue-6.6/xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch b/queue-6.6/xfs-ensure-submit-buffers-on-lsn-boundaries-in-error-handlers.patch
new file mode 100644 (file)
index 0000000..25c1172
--- /dev/null
@@ -0,0 +1,175 @@
+From stable+bounces-52618-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:26 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:52 -0700
+Subject: xfs: ensure submit buffers on LSN boundaries in error handlers
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-6-catherine.hoang@oracle.com>
+
+From: Long Li <leo.lilong@huawei.com>
+
+commit e4c3b72a6ea93ed9c1815c74312eee9305638852 upstream.
+
+While performing the IO fault injection test, I caught the following data
+corruption report:
+
+ XFS (dm-0): Internal error ltbno + ltlen > bno at line 1957 of file fs/xfs/libxfs/xfs_alloc.c.  Caller xfs_free_ag_extent+0x79c/0x1130
+ CPU: 3 PID: 33 Comm: kworker/3:0 Not tainted 6.5.0-rc7-next-20230825-00001-g7f8666926889 #214
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014
+ Workqueue: xfs-inodegc/dm-0 xfs_inodegc_worker
+ Call Trace:
+  <TASK>
+  dump_stack_lvl+0x50/0x70
+  xfs_corruption_error+0x134/0x150
+  xfs_free_ag_extent+0x7d3/0x1130
+  __xfs_free_extent+0x201/0x3c0
+  xfs_trans_free_extent+0x29b/0xa10
+  xfs_extent_free_finish_item+0x2a/0xb0
+  xfs_defer_finish_noroll+0x8d1/0x1b40
+  xfs_defer_finish+0x21/0x200
+  xfs_itruncate_extents_flags+0x1cb/0x650
+  xfs_free_eofblocks+0x18f/0x250
+  xfs_inactive+0x485/0x570
+  xfs_inodegc_worker+0x207/0x530
+  process_scheduled_works+0x24a/0xe10
+  worker_thread+0x5ac/0xc60
+  kthread+0x2cd/0x3c0
+  ret_from_fork+0x4a/0x80
+  ret_from_fork_asm+0x11/0x20
+  </TASK>
+ XFS (dm-0): Corruption detected. Unmount and run xfs_repair
+
+After analyzing the disk image, it was found that the corruption was
+triggered by the fact that extent was recorded in both inode datafork
+and AGF btree blocks. After a long time of reproduction and analysis,
+we found that the reason for the free space btree corruption was that the
+AGF btree was not recovered correctly.
+
+Consider the following situation, Checkpoint A and Checkpoint B are in
+the same record and share the same start LSN1, buf items of same object
+(AGF btree block) is included in both Checkpoint A and Checkpoint B. If
+the buf item in Checkpoint A has been recovered and updates metadata LSN
+permanently, then the buf item in Checkpoint B cannot be recovered,
+because log recovery skips items with a metadata LSN >= the current LSN
+of the recovery item. If there is still an inode item in Checkpoint B
+that records the Extent X, the Extent X will be recorded in both inode
+datafork and AGF btree block after Checkpoint B is recovered. Such a
+transaction can be seen when allocating an extent for the inode bmap: it
+records both the addition of the extent to the inode extent list and the
+removal of the extent from the AGF.
+
+  |------------Record (LSN1)------------------|---Record (LSN2)---|
+  |-------Checkpoint A----------|----------Checkpoint B-----------|
+  |     Buf Item(Extent X)      | Buf Item / Inode item(Extent X) |
+  |     Extent X is freed       |     Extent X is allocated       |
+
+After commit 12818d24db8a ("xfs: rework log recovery to submit buffers
+on LSN boundaries") was introduced, we submit buffers on lsn boundaries
+during log recovery. The above problem can be avoided under normal paths,
+but it's not guaranteed under abnormal paths. Consider the following
+process, if an error was encountered after recover buf item in Checkpoint
+A and before recover buf item in Checkpoint B, buffers that have been
+added to the buffer_list will still be submitted, this violates the
+submits rule on lsn boundaries. So buf item in Checkpoint B cannot be
+recovered on the next mount due to current lsn of transaction equal to
+metadata lsn on disk. The detailed process of the problem is as follows.
+
+First Mount:
+
+  xlog_do_recovery_pass
+    error = xlog_recover_process
+      xlog_recover_process_data
+        xlog_recover_process_ophdr
+          xlog_recovery_process_trans
+            ...
+              /* recover buf item in Checkpoint A */
+              xlog_recover_buf_commit_pass2
+                xlog_recover_do_reg_buffer
+                /* add buffer of agf btree block to buffer_list */
+                xfs_buf_delwri_queue(bp, buffer_list)
+            ...
+            ==> Encounter read IO error and return
+    /* submit buffers regardless of error */
+    if (!list_empty(&buffer_list))
+      xfs_buf_delwri_submit(&buffer_list);
+
+    <buf items of agf btree block in Checkpoint A recovery success>
+
+Second Mount:
+
+  xlog_do_recovery_pass
+    error = xlog_recover_process
+      xlog_recover_process_data
+        xlog_recover_process_ophdr
+          xlog_recovery_process_trans
+            ...
+              /* recover buf item in Checkpoint B */
+              xlog_recover_buf_commit_pass2
+                /* buffer of agf btree block wouldn't added to
+                   buffer_list due to lsn equal to current_lsn */
+                if (XFS_LSN_CMP(lsn, current_lsn) >= 0)
+                  goto out_release
+
+    <buf items of agf btree block in Checkpoint B wouldn't recovery>
+
+In order to make sure that buffers are submitted on lsn boundaries in the
+abnormal paths, we need to check the error status before submitting buffers
+that have been added from the last record processed. If an error status
+exists, buffers in the buffer_list should not be written to disk.
+
+Canceling the buffers in the buffer_list directly isn't correct, unlike
+any other place where write list was canceled, these buffers has been
+initialized by xfs_buf_item_init() during recovery and held by buf item,
+buf items will not be released in xfs_buf_delwri_cancel(), it's not easy
+to solve.
+
+If the filesystem has been shut down, then delwri list submission will
+error out all buffers on the list via IO submission/completion and do
+all the correct cleanup automatically. So shutting down the filesystem
+could prevent buffers in the buffer_list from being written to disk.
+
+Fixes: 50d5c8d8e938 ("xfs: check LSN ordering for v5 superblocks during recovery")
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log_recover.c |   23 ++++++++++++++++++++---
+ 1 file changed, 20 insertions(+), 3 deletions(-)
+
+--- a/fs/xfs/xfs_log_recover.c
++++ b/fs/xfs/xfs_log_recover.c
+@@ -3203,11 +3203,28 @@ xlog_do_recovery_pass(
+       kmem_free(hbp);
+       /*
+-       * Submit buffers that have been added from the last record processed,
+-       * regardless of error status.
++       * Submit buffers that have been dirtied by the last record recovered.
+        */
+-      if (!list_empty(&buffer_list))
++      if (!list_empty(&buffer_list)) {
++              if (error) {
++                      /*
++                       * If there has been an item recovery error then we
++                       * cannot allow partial checkpoint writeback to
++                       * occur.  We might have multiple checkpoints with the
++                       * same start LSN in this buffer list, and partial
++                       * writeback of a checkpoint in this situation can
++                       * prevent future recovery of all the changes in the
++                       * checkpoints at this start LSN.
++                       *
++                       * Note: Shutting down the filesystem will result in the
++                       * delwri submission marking all the buffers stale,
++                       * completing them and cleaning up _XBF_LOGRECOVERY
++                       * state without doing any IO.
++                       */
++                      xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
++              }
+               error2 = xfs_buf_delwri_submit(&buffer_list);
++      }
+       if (error && first_bad)
+               *first_bad = rhead_blk;
diff --git a/queue-6.6/xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch b/queue-6.6/xfs-fix-imprecise-logic-in-xchk_btree_check_block_owner.patch
new file mode 100644 (file)
index 0000000..4162ff5
--- /dev/null
@@ -0,0 +1,54 @@
+From stable+bounces-52614-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:15 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:48 -0700
+Subject: xfs: fix imprecise logic in xchk_btree_check_block_owner
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-2-catherine.hoang@oracle.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit c0afba9a8363f17d4efed22a8764df33389aebe8 upstream.
+
+A reviewer was confused by the init_sa logic in this function.  Upon
+checking the logic, I discovered that the code is imprecise.  What we
+want to do here is check that there is an ownership record in the rmap
+btree for the AG that contains a btree block.
+
+For an inode-rooted btree (e.g. the bmbt) the per-AG btree cursors have
+not been initialized because inode btrees can span multiple AGs.
+Therefore, we must initialize the per-AG btree cursors in sc->sa before
+proceeding.  That is what init_sa controls, and hence the logic should
+be gated on XFS_BTREE_ROOT_IN_INODE, not XFS_BTREE_LONG_PTRS.
+
+In practice, ROOT_IN_INODE and LONG_PTRS are coincident so this hasn't
+mattered.  However, we're about to refactor both of those flags into
+separate btree_ops fields so we want the logic to make sense
+afterwards.
+
+Fixes: 858333dcf021a ("xfs: check btree block ownership with bnobt/rmapbt when scrubbing btree")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/scrub/btree.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/scrub/btree.c
++++ b/fs/xfs/scrub/btree.c
+@@ -385,7 +385,12 @@ xchk_btree_check_block_owner(
+       agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
+       agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
+-      init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
++      /*
++       * If the btree being examined is not itself a per-AG btree, initialize
++       * sc->sa so that we can check for the presence of an ownership record
++       * in the rmap btree for the AG containing the block.
++       */
++      init_sa = bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE;
+       if (init_sa) {
+               error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
+               if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
diff --git a/queue-6.6/xfs-fix-scrub-stats-file-permissions.patch b/queue-6.6/xfs-fix-scrub-stats-file-permissions.patch
new file mode 100644 (file)
index 0000000..8951db7
--- /dev/null
@@ -0,0 +1,48 @@
+From stable+bounces-52615-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:17 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:49 -0700
+Subject: xfs: fix scrub stats file permissions
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-3-catherine.hoang@oracle.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+commit e610e856b938a1fc86e7ee83ad2f39716082bca7 upstream.
+
+When the kernel is in lockdown mode, debugfs will only show files that
+are world-readable and cannot be written, mmaped, or used with ioctl.
+That more or less describes the scrub stats file, except that the
+permissions are wrong -- they should be 0444, not 0644.  You can't write
+the stats file, so the 0200 makes no sense.
+
+Meanwhile, the clear_stats file is only writable, but it got mode 0400
+instead of 0200, which would make more sense.
+
+Fix both files so that they make sense.
+
+Fixes: d7a74cad8f451 ("xfs: track usage statistics of online fsck")
+Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/scrub/stats.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/scrub/stats.c
++++ b/fs/xfs/scrub/stats.c
+@@ -329,9 +329,9 @@ xchk_stats_register(
+       if (!cs->cs_debugfs)
+               return;
+-      debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
++      debugfs_create_file("stats", 0444, cs->cs_debugfs, cs,
+                       &scrub_stats_fops);
+-      debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
++      debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs,
+                       &clear_scrub_stats_fops);
+ }
diff --git a/queue-6.6/xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch b/queue-6.6/xfs-fix-seek_hole-data-for-regions-with-active-cow-extents.patch
new file mode 100644 (file)
index 0000000..c13317c
--- /dev/null
@@ -0,0 +1,73 @@
+From stable+bounces-52616-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:18 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:50 -0700
+Subject: xfs: fix SEEK_HOLE/DATA for regions with active COW extents
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-4-catherine.hoang@oracle.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 4b2f459d86252619448455013f581836c8b1b7da upstream.
+
+A data corruption problem was reported by CoreOS image builders
+when using reflink based disk image copies and then converting
+them to qcow2 images. The converted images failed the conversion
+verification step, and it was isolated down to the fact that
+qemu-img uses SEEK_HOLE/SEEK_DATA to find the data it is supposed to
+copy.
+
+The reproducer allowed me to isolate the issue down to a region of
+the file that had overlapping data and COW fork extents, and the
+problem was that the COW fork extent was being reported in its
+entirety by xfs_seek_iomap_begin() and so skipping over the real
+data fork extents in that range.
+
+This was somewhat hidden by the fact that 'xfs_bmap -vvp' reported
+all the extents correctly, and reading the file completely (i.e. not
+using seek to skip holes) would map the file correctly and all the
+correct data extents are read. Hence the problem is isolated to just
+the xfs_seek_iomap_begin() implementation.
+
+Instrumentation with trace_printk made the problem obvious: we are
+passing the wrong length to xfs_trim_extent() in
+xfs_seek_iomap_begin(). We are passing the end_fsb, not the
+maximum length of the extent we want to trim the map to. Hence the
+COW extent map never gets trimmed to the start of the next data fork
+extent, and so the seek code treats the entire COW fork extent as
+unwritten and skips entirely over the data fork extents in that
+range.
+
+Link: https://github.com/coreos/coreos-assembler/issues/3728
+Fixes: 60271ab79d40 ("xfs: fix SEEK_DATA for speculative COW fork preallocation")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1323,7 +1323,7 @@ xfs_seek_iomap_begin(
+       if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
+               if (data_fsb < cow_fsb + cmap.br_blockcount)
+                       end_fsb = min(end_fsb, data_fsb);
+-              xfs_trim_extent(&cmap, offset_fsb, end_fsb);
++              xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
+               seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+               error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+                               IOMAP_F_SHARED, seq);
+@@ -1348,7 +1348,7 @@ xfs_seek_iomap_begin(
+       imap.br_state = XFS_EXT_NORM;
+ done:
+       seq = xfs_iomap_inode_sequence(ip, 0);
+-      xfs_trim_extent(&imap, offset_fsb, end_fsb);
++      xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
+       error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+ out_unlock:
+       xfs_iunlock(ip, lockmode);
diff --git a/queue-6.6/xfs-shrink-failure-needs-to-hold-agi-buffer.patch b/queue-6.6/xfs-shrink-failure-needs-to-hold-agi-buffer.patch
new file mode 100644 (file)
index 0000000..327b5ca
--- /dev/null
@@ -0,0 +1,87 @@
+From stable+bounces-52617-greg=kroah.com@vger.kernel.org Tue Jun 18 01:04:21 2024
+From: Catherine Hoang <catherine.hoang@oracle.com>
+Date: Mon, 17 Jun 2024 16:03:51 -0700
+Subject: xfs: shrink failure needs to hold AGI buffer
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org
+Message-ID: <20240617230355.77091-5-catherine.hoang@oracle.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+commit 75bcffbb9e7563259b7aed0fa77459d6a3a35627 upstream.
+
+Chandan reported an AGI/AGF lock order hang on xfs/168 during recent
+testing. The cause of the problem was the task running xfs_growfs
+to shrink the filesystem. A failure occurred trying to remove the
+free space from the btrees that the shrink would make disappear,
+and that meant it ran the error handling for a partial failure.
+
+This error path involves restoring the per-ag block reservations,
+and that requires calculating the amount of space needed to be
+reserved for the free inode btree. The growfs operation hung here:
+
+[18679.536829]  down+0x71/0xa0
+[18679.537657]  xfs_buf_lock+0xa4/0x290 [xfs]
+[18679.538731]  xfs_buf_find_lock+0xf7/0x4d0 [xfs]
+[18679.539920]  xfs_buf_lookup.constprop.0+0x289/0x500 [xfs]
+[18679.542628]  xfs_buf_get_map+0x2b3/0xe40 [xfs]
+[18679.547076]  xfs_buf_read_map+0xbb/0x900 [xfs]
+[18679.562616]  xfs_trans_read_buf_map+0x449/0xb10 [xfs]
+[18679.569778]  xfs_read_agi+0x1cd/0x500 [xfs]
+[18679.573126]  xfs_ialloc_read_agi+0xc2/0x5b0 [xfs]
+[18679.578708]  xfs_finobt_calc_reserves+0xe7/0x4d0 [xfs]
+[18679.582480]  xfs_ag_resv_init+0x2c5/0x490 [xfs]
+[18679.586023]  xfs_ag_shrink_space+0x736/0xd30 [xfs]
+[18679.590730]  xfs_growfs_data_private.isra.0+0x55e/0x990 [xfs]
+[18679.599764]  xfs_growfs_data+0x2f1/0x410 [xfs]
+[18679.602212]  xfs_file_ioctl+0xd1e/0x1370 [xfs]
+
+trying to get the AGI lock. The AGI lock was held by an fsstress task
+trying to do an inode allocation, and it was waiting on the AGF
+lock to allocate a new inode chunk on disk. Hence deadlock.
+
+The fix for this is for the growfs code to hold the AGI over the
+transaction roll it does in the error path. It already holds the AGF
+locked across this, and that is what causes the lock order inversion
+in the xfs_ag_resv_init() call.
+
+Reported-by: Chandan Babu R <chandanbabu@kernel.org>
+Fixes: 46141dc891f7 ("xfs: introduce xfs_ag_shrink_space()")
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Catherine Hoang <catherine.hoang@oracle.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_ag.c |   11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_ag.c
++++ b/fs/xfs/libxfs/xfs_ag.c
+@@ -979,14 +979,23 @@ xfs_ag_shrink_space(
+       if (error) {
+               /*
+-               * if extent allocation fails, need to roll the transaction to
++               * If extent allocation fails, need to roll the transaction to
+                * ensure that the AGFL fixup has been committed anyway.
++               *
++               * We need to hold the AGF across the roll to ensure nothing can
++               * access the AG for allocation until the shrink is fully
++               * cleaned up. And due to the resetting of the AG block
++               * reservation space needing to lock the AGI, we also have to
++               * hold that so we don't get AGI/AGF lock order inversions in
++               * the error handling path.
+                */
+               xfs_trans_bhold(*tpp, agfbp);
++              xfs_trans_bhold(*tpp, agibp);
+               err2 = xfs_trans_roll(tpp);
+               if (err2)
+                       return err2;
+               xfs_trans_bjoin(*tpp, agfbp);
++              xfs_trans_bjoin(*tpp, agibp);
+               goto resv_init_out;
+       }