git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.1-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 23 May 2024 11:19:00 +0000 (13:19 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 23 May 2024 11:19:00 +0000 (13:19 +0200)
added patches:
iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch
iomap-write-iomap-validity-checks.patch
keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch
mmc-core-add-hs400-tuning-in-hs400es-initialization.patch
xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch
xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch
xfs-drop-write-error-injection-is-unfixable-remove-it.patch
xfs-estimate-post-merge-refcounts-correctly.patch
xfs-fix-incorrect-error-out-in-xfs_remove.patch
xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch
xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch
xfs-fix-off-by-one-block-in-xfs_discard_folio.patch
xfs-fix-sb-write-verify-for-lazysbcount.patch
xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch
xfs-get-root-inode-correctly-at-bulkstat.patch
xfs-hoist-refcount-record-merge-predicates.patch
xfs-invalidate-block-device-page-cache-during-unmount.patch
xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch
xfs-iomap-move-delalloc-punching-to-iomap.patch
xfs-punching-delalloc-extents-on-write-failure-is-racy.patch
xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch
xfs-use-byte-ranges-for-write-cleanup-ranges.patch
xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch
xfs-wait-iclog-complete-before-tearing-down-ail.patch
xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch
xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch

27 files changed:
queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch [new file with mode: 0644]
queue-6.1/iomap-write-iomap-validity-checks.patch [new file with mode: 0644]
queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch [new file with mode: 0644]
queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch [new file with mode: 0644]
queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch [new file with mode: 0644]
queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch [new file with mode: 0644]
queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch [new file with mode: 0644]
queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch [new file with mode: 0644]
queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch [new file with mode: 0644]
queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch [new file with mode: 0644]
queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch [new file with mode: 0644]
queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch [new file with mode: 0644]
queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch [new file with mode: 0644]
queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch [new file with mode: 0644]
queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch [new file with mode: 0644]
queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch [new file with mode: 0644]
queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch [new file with mode: 0644]
queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch [new file with mode: 0644]
queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch [new file with mode: 0644]
queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch [new file with mode: 0644]
queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch [new file with mode: 0644]
queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch [new file with mode: 0644]
queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch [new file with mode: 0644]
queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch [new file with mode: 0644]
queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch [new file with mode: 0644]

diff --git a/queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch b/queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch
new file mode 100644 (file)
index 0000000..6a98817
--- /dev/null
@@ -0,0 +1,295 @@
+From stable+bounces-42894-greg=kroah.com@vger.kernel.org Wed May  1 20:41:36 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:53 -0700
+Subject: iomap: buffered write failure should not truncate the page cache
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-5-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit f43dc4dc3eff028b5ddddd99f3a66c5a6bdd4e78 ]
+
+iomap_file_buffered_write_punch_delalloc() currently invalidates the
+page cache over the unused range of the delalloc extent that was
+allocated. While the write allocated the delalloc extent, it does
+not own it exclusively as the write does not hold any locks that
+prevent either writeback or mmap page faults from changing the state
+of either the page cache or the extent state backing this range.
+
+Whilst xfs_bmap_punch_delalloc_range() already handles races in
+extent conversion - it will only punch out delalloc extents and it
+ignores any other type of extent - the page cache truncate does not
+discriminate between data written by this write or some other task.
+As a result, truncating the page cache can result in data corruption
+if the write races with mmap modifications to the file over the same
+range.
+
+generic/346 exercises this workload, and if we randomly fail writes
+(as will happen when iomap gets stale iomap detection later in the
+patchset), it will randomly corrupt the file data because it removes
+data written by mmap() in the same page as the write() that failed.
+
+Hence we do not want to punch out the page cache over the range of
+the extent we failed to write to - what we actually need to do is
+detect the ranges that have dirty data in cache over them and *not
+punch them out*.
+
+To do this, we have to walk the page cache over the range of the
+delalloc extent we want to remove. This is made complex by the fact
+we have to handle partially up-to-date folios correctly and this can
+happen even when the FSB size == PAGE_SIZE because we now support
+multi-page folios in the page cache.
+
+Because we are only interested in discovering the edges of data
+ranges in the page cache (i.e. hole-data boundaries) we can make use
+of mapping_seek_hole_data() to find those transitions in the page
+cache. As we hold the invalidate_lock, we know that the boundaries
+are not going to change while we walk the range. This interface is
+also byte-based and is sub-page block aware, so we can find the data
+ranges in the cache based on byte offsets rather than page, folio or
+fs block sized chunks. This greatly simplifies the logic of finding
+dirty cached ranges in the page cache.
+
+Once we've identified a range that contains cached data, we can then
+iterate the range folio by folio. This allows us to determine if the
+data is dirty and hence perform the correct delalloc extent punching
+operations. The seek interface we use to iterate data ranges will
+give us sub-folio start/end granularity, so we may end up looking up
+the same folio multiple times as the seek interface iterates across
+each discontiguous data region in the folio.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/buffered-io.c |  195 +++++++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 180 insertions(+), 15 deletions(-)
+
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -828,6 +828,165 @@ iomap_file_buffered_write(struct kiocb *
+ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+ /*
++ * Scan the data range passed to us for dirty page cache folios. If we find a
++ * dirty folio, punch out the preceeding range and update the offset from which
++ * the next punch will start from.
++ *
++ * We can punch out storage reservations under clean pages because they either
++ * contain data that has been written back - in which case the delalloc punch
++ * over that range is a no-op - or they have been read faults in which case they
++ * contain zeroes and we can remove the delalloc backing range and any new
++ * writes to those pages will do the normal hole filling operation...
++ *
++ * This makes the logic simple: we only need to keep the delalloc extents
++ * over the dirty ranges of the page cache.
++ *
++ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
++ * simplify range iterations.
++ */
++static int iomap_write_delalloc_scan(struct inode *inode,
++              loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
++              int (*punch)(struct inode *inode, loff_t offset, loff_t length))
++{
++      while (start_byte < end_byte) {
++              struct folio    *folio;
++
++              /* grab locked page */
++              folio = filemap_lock_folio(inode->i_mapping,
++                              start_byte >> PAGE_SHIFT);
++              if (!folio) {
++                      start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
++                                      PAGE_SIZE;
++                      continue;
++              }
++
++              /* if dirty, punch up to offset */
++              if (folio_test_dirty(folio)) {
++                      if (start_byte > *punch_start_byte) {
++                              int     error;
++
++                              error = punch(inode, *punch_start_byte,
++                                              start_byte - *punch_start_byte);
++                              if (error) {
++                                      folio_unlock(folio);
++                                      folio_put(folio);
++                                      return error;
++                              }
++                      }
++
++                      /*
++                       * Make sure the next punch start is correctly bound to
++                       * the end of this data range, not the end of the folio.
++                       */
++                      *punch_start_byte = min_t(loff_t, end_byte,
++                                      folio_next_index(folio) << PAGE_SHIFT);
++              }
++
++              /* move offset to start of next folio in range */
++              start_byte = folio_next_index(folio) << PAGE_SHIFT;
++              folio_unlock(folio);
++              folio_put(folio);
++      }
++      return 0;
++}
++
++/*
++ * Punch out all the delalloc blocks in the range given except for those that
++ * have dirty data still pending in the page cache - those are going to be
++ * written and so must still retain the delalloc backing for writeback.
++ *
++ * As we are scanning the page cache for data, we don't need to reimplement the
++ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
++ * start and end of data ranges correctly even for sub-folio block sizes. This
++ * byte range based iteration is especially convenient because it means we
++ * don't have to care about variable size folios, nor where the start or end of
++ * the data range lies within a folio, if they lie within the same folio or even
++ * if there are multiple discontiguous data ranges within the folio.
++ *
++ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
++ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
++ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
++ * date. A write page fault can then mark it dirty. If we then fail a write()
++ * beyond EOF into that up to date cached range, we allocate a delalloc block
++ * beyond EOF and then have to punch it out. Because the range is up to date,
++ * mapping_seek_hole_data() will return it, and we will skip the punch because
++ * the folio is dirty. This is incorrect - we always need to punch out delalloc
++ * beyond EOF in this case as writeback will never write back and convert that
++ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
++ * resulting in always punching out the range from the EOF to the end of the
++ * range the iomap spans.
++ *
++ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
++ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
++ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
++ * returns the end of the data range (data_end). Using closed intervals would
++ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
++ * the code to subtle off-by-one bugs....
++ */
++static int iomap_write_delalloc_release(struct inode *inode,
++              loff_t start_byte, loff_t end_byte,
++              int (*punch)(struct inode *inode, loff_t pos, loff_t length))
++{
++      loff_t punch_start_byte = start_byte;
++      loff_t scan_end_byte = min(i_size_read(inode), end_byte);
++      int error = 0;
++
++      /*
++       * Lock the mapping to avoid races with page faults re-instantiating
++       * folios and dirtying them via ->page_mkwrite whilst we walk the
++       * cache and perform delalloc extent removal. Failing to do this can
++       * leave dirty pages with no space reservation in the cache.
++       */
++      filemap_invalidate_lock(inode->i_mapping);
++      while (start_byte < scan_end_byte) {
++              loff_t          data_end;
++
++              start_byte = mapping_seek_hole_data(inode->i_mapping,
++                              start_byte, scan_end_byte, SEEK_DATA);
++              /*
++               * If there is no more data to scan, all that is left is to
++               * punch out the remaining range.
++               */
++              if (start_byte == -ENXIO || start_byte == scan_end_byte)
++                      break;
++              if (start_byte < 0) {
++                      error = start_byte;
++                      goto out_unlock;
++              }
++              WARN_ON_ONCE(start_byte < punch_start_byte);
++              WARN_ON_ONCE(start_byte > scan_end_byte);
++
++              /*
++               * We find the end of this contiguous cached data range by
++               * seeking from start_byte to the beginning of the next hole.
++               */
++              data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
++                              scan_end_byte, SEEK_HOLE);
++              if (data_end < 0) {
++                      error = data_end;
++                      goto out_unlock;
++              }
++              WARN_ON_ONCE(data_end <= start_byte);
++              WARN_ON_ONCE(data_end > scan_end_byte);
++
++              error = iomap_write_delalloc_scan(inode, &punch_start_byte,
++                              start_byte, data_end, punch);
++              if (error)
++                      goto out_unlock;
++
++              /* The next data search starts at the end of this one. */
++              start_byte = data_end;
++      }
++
++      if (punch_start_byte < end_byte)
++              error = punch(inode, punch_start_byte,
++                              end_byte - punch_start_byte);
++out_unlock:
++      filemap_invalidate_unlock(inode->i_mapping);
++      return error;
++}
++
++/*
+  * When a short write occurs, the filesystem may need to remove reserved space
+  * that was allocated in ->iomap_begin from it's ->iomap_end method. For
+  * filesystems that use delayed allocation, we need to punch out delalloc
+@@ -837,8 +996,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_wr
+  * allocated for this iomap.
+  *
+  * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+- * simplify range iterations, but converts them back to {offset,len} tuples for
+- * the punch callback.
++ * simplify range iterations.
++ *
++ * The punch() callback *must* only punch delalloc extents in the range passed
++ * to it. It must skip over all other types of extents in the range and leave
++ * them completely unchanged. It must do this punch atomically with respect to
++ * other extent modifications.
++ *
++ * The punch() callback may be called with a folio locked to prevent writeback
++ * extent allocation racing at the edge of the range we are currently punching.
++ * The locked folio may or may not cover the range being punched, so it is not
++ * safe for the punch() callback to lock folios itself.
++ *
++ * Lock order is:
++ *
++ * inode->i_rwsem (shared or exclusive)
++ *   inode->i_mapping->invalidate_lock (exclusive)
++ *     folio_lock()
++ *       ->punch
++ *         internal filesystem allocation lock
+  */
+ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+               struct iomap *iomap, loff_t pos, loff_t length,
+@@ -848,7 +1024,6 @@ int iomap_file_buffered_write_punch_dela
+       loff_t                  start_byte;
+       loff_t                  end_byte;
+       int                     blocksize = i_blocksize(inode);
+-      int                     error = 0;
+       if (iomap->type != IOMAP_DELALLOC)
+               return 0;
+@@ -872,18 +1047,8 @@ int iomap_file_buffered_write_punch_dela
+       if (start_byte >= end_byte)
+               return 0;
+-      /*
+-       * Lock the mapping to avoid races with page faults re-instantiating
+-       * folios and dirtying them via ->page_mkwrite between the page cache
+-       * truncation and the delalloc extent removal. Failing to do this can
+-       * leave dirty pages with no space reservation in the cache.
+-       */
+-      filemap_invalidate_lock(inode->i_mapping);
+-      truncate_pagecache_range(inode, start_byte, end_byte - 1);
+-      error = punch(inode, start_byte, end_byte - start_byte);
+-      filemap_invalidate_unlock(inode->i_mapping);
+-
+-      return error;
++      return iomap_write_delalloc_release(inode, start_byte, end_byte,
++                                      punch);
+ }
+ EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
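For reference, the punch() callback this interface expects is a thin filesystem-side wrapper around the filesystem's delalloc-punching primitive. The sketch below is illustrative only: the real XFS callback (xfs_buffered_write_delalloc_punch) is added by xfs-iomap-move-delalloc-punching-to-iomap.patch elsewhere in this series, and example_punch_delalloc_bytes() is a hypothetical stand-in for that primitive, not an existing function.

	/*
	 * Illustrative punch callback. It must punch only delayed-allocation
	 * extents over the byte range, leave every other extent type
	 * untouched, and must not take folio locks itself (a folio may
	 * already be held locked, per the lock ordering documented above).
	 */
	static int example_buffered_write_delalloc_punch(struct inode *inode,
			loff_t offset, loff_t length)
	{
		return example_punch_delalloc_bytes(inode, offset,
				offset + length);
	}

	/* Called from the filesystem's ->iomap_end handler on a short write: */
	error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
			length, written, &example_buffered_write_delalloc_punch);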
diff --git a/queue-6.1/iomap-write-iomap-validity-checks.patch b/queue-6.1/iomap-write-iomap-validity-checks.patch
new file mode 100644 (file)
index 0000000..63324d6
--- /dev/null
@@ -0,0 +1,263 @@
+From stable+bounces-42896-greg=kroah.com@vger.kernel.org Wed May  1 20:41:39 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:55 -0700
+Subject: iomap: write iomap validity checks
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-7-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit d7b64041164ca177170191d2ad775da074ab2926 ]
+
+A recent multithreaded write data corruption has been uncovered in
+the iomap write code. The core of the problem is that partial folio
+writes can be flushed to disk while a new racing write can map it
+and fill the rest of the page:
+
+writeback                      new write
+
+allocate blocks
+  blocks are unwritten
+submit IO
+.....
+                               map blocks
+                               iomap indicates UNWRITTEN range
+                               loop {
+                                 lock folio
+                                 copyin data
+.....
+IO completes
+  runs unwritten extent conv
+    blocks are marked written
+                                 <iomap now stale>
+                                 get next folio
+                               }
+
+Now add memory pressure such that memory reclaim evicts the
+partially written folio that has already been written to disk.
+
+When the new write finally gets to the last partial page of the new
+write, it does not find it in cache, so it instantiates a new page,
+sees the iomap is unwritten, and zeros the part of the page that
+it does not have data from. This overwrites the data on disk that
+was originally written.
+
+The full description of the corruption mechanism can be found here:
+
+https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
+
+To solve this problem, we need to check whether the iomap is still
+valid after we lock each folio during the write. We have to do it
+after we lock the page so that we don't end up with state changes
+occurring while we wait for the folio to be locked.
+
+Hence we need a mechanism to be able to check that the cached iomap
+is still valid (similar to what we already do in buffered
+writeback), and we need a way for ->begin_write to back out and
+tell the high level iomap iterator that we need to remap the
+remaining write range.
+
+The iomap needs to grow some storage for the validity cookie that
+the filesystem provides to travel with the iomap. XFS, in
+particular, also needs to know some more information about what the
+iomap maps (attribute extents rather than file data extents) to for
+the validity cookie to cover all the types of iomaps we might need
+to validate.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/buffered-io.c |   29 ++++++++++++++++++++++++++++-
+ fs/iomap/iter.c        |   19 ++++++++++++++++++-
+ include/linux/iomap.h  |   43 +++++++++++++++++++++++++++++++++++--------
+ 3 files changed, 81 insertions(+), 10 deletions(-)
+
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -579,7 +579,7 @@ static int iomap_write_begin_inline(cons
+       return iomap_read_inline_data(iter, folio);
+ }
+-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
++static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
+               size_t len, struct folio **foliop)
+ {
+       const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
+@@ -613,6 +613,27 @@ static int iomap_write_begin(const struc
+               status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
+               goto out_no_page;
+       }
++
++      /*
++       * Now we have a locked folio, before we do anything with it we need to
++       * check that the iomap we have cached is not stale. The inode extent
++       * mapping can change due to concurrent IO in flight (e.g.
++       * IOMAP_UNWRITTEN state can change and memory reclaim could have
++       * reclaimed a previously partially written page at this index after IO
++       * completion before this write reaches this file offset) and hence we
++       * could do the wrong thing here (zero a page range incorrectly or fail
++       * to zero) and corrupt data.
++       */
++      if (page_ops && page_ops->iomap_valid) {
++              bool iomap_valid = page_ops->iomap_valid(iter->inode,
++                                                      &iter->iomap);
++              if (!iomap_valid) {
++                      iter->iomap.flags |= IOMAP_F_STALE;
++                      status = 0;
++                      goto out_unlock;
++              }
++      }
++
+       if (pos + len > folio_pos(folio) + folio_size(folio))
+               len = folio_pos(folio) + folio_size(folio) - pos;
+@@ -768,6 +789,8 @@ again:
+               status = iomap_write_begin(iter, pos, bytes, &folio);
+               if (unlikely(status))
+                       break;
++              if (iter->iomap.flags & IOMAP_F_STALE)
++                      break;
+               page = folio_file_page(folio, pos >> PAGE_SHIFT);
+               if (mapping_writably_mapped(mapping))
+@@ -1076,6 +1099,8 @@ static loff_t iomap_unshare_iter(struct
+               status = iomap_write_begin(iter, pos, bytes, &folio);
+               if (unlikely(status))
+                       return status;
++              if (iter->iomap.flags & IOMAP_F_STALE)
++                      break;
+               status = iomap_write_end(iter, pos, bytes, bytes, folio);
+               if (WARN_ON_ONCE(status == 0))
+@@ -1131,6 +1156,8 @@ static loff_t iomap_zero_iter(struct iom
+               status = iomap_write_begin(iter, pos, bytes, &folio);
+               if (status)
+                       return status;
++              if (iter->iomap.flags & IOMAP_F_STALE)
++                      break;
+               offset = offset_in_folio(folio, pos);
+               if (bytes > folio_size(folio) - offset)
+--- a/fs/iomap/iter.c
++++ b/fs/iomap/iter.c
+@@ -7,12 +7,28 @@
+ #include <linux/iomap.h>
+ #include "trace.h"
++/*
++ * Advance to the next range we need to map.
++ *
++ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
++ * processed - it was aborted because the extent the iomap spanned may have been
++ * changed during the operation. In this case, the iteration behaviour is to
++ * remap the unprocessed range of the iter, and that means we may need to remap
++ * even when we've made no progress (i.e. iter->processed = 0). Hence the
++ * "finished iterating" case needs to distinguish between
++ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
++ * need to remap the entire remaining range.
++ */
+ static inline int iomap_iter_advance(struct iomap_iter *iter)
+ {
++      bool stale = iter->iomap.flags & IOMAP_F_STALE;
++
+       /* handle the previous iteration (if any) */
+       if (iter->iomap.length) {
+-              if (iter->processed <= 0)
++              if (iter->processed < 0)
+                       return iter->processed;
++              if (!iter->processed && !stale)
++                      return 0;
+               if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
+                       return -EIO;
+               iter->pos += iter->processed;
+@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struc
+       WARN_ON_ONCE(iter->iomap.offset > iter->pos);
+       WARN_ON_ONCE(iter->iomap.length == 0);
+       WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
++      WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
+       trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
+       if (iter->srcmap.type != IOMAP_HOLE)
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -49,26 +49,35 @@ struct vm_fault;
+  *
+  * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
+  * buffer heads for this mapping.
++ *
++ * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
++ * rather than a file data extent.
+  */
+-#define IOMAP_F_NEW           0x01
+-#define IOMAP_F_DIRTY         0x02
+-#define IOMAP_F_SHARED                0x04
+-#define IOMAP_F_MERGED                0x08
+-#define IOMAP_F_BUFFER_HEAD   0x10
+-#define IOMAP_F_ZONE_APPEND   0x20
++#define IOMAP_F_NEW           (1U << 0)
++#define IOMAP_F_DIRTY         (1U << 1)
++#define IOMAP_F_SHARED                (1U << 2)
++#define IOMAP_F_MERGED                (1U << 3)
++#define IOMAP_F_BUFFER_HEAD   (1U << 4)
++#define IOMAP_F_ZONE_APPEND   (1U << 5)
++#define IOMAP_F_XATTR         (1U << 6)
+ /*
+  * Flags set by the core iomap code during operations:
+  *
+  * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
+  * has changed as the result of this write operation.
++ *
++ * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file
++ * range it covers needs to be remapped by the high level before the operation
++ * can proceed.
+  */
+-#define IOMAP_F_SIZE_CHANGED  0x100
++#define IOMAP_F_SIZE_CHANGED  (1U << 8)
++#define IOMAP_F_STALE         (1U << 9)
+ /*
+  * Flags from 0x1000 up are for file system specific usage:
+  */
+-#define IOMAP_F_PRIVATE               0x1000
++#define IOMAP_F_PRIVATE               (1U << 12)
+ /*
+@@ -89,6 +98,7 @@ struct iomap {
+       void                    *inline_data;
+       void                    *private; /* filesystem private */
+       const struct iomap_page_ops *page_ops;
++      u64                     validity_cookie; /* used with .iomap_valid() */
+ };
+ static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
+@@ -128,6 +138,23 @@ struct iomap_page_ops {
+       int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len);
+       void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
+                       struct page *page);
++
++      /*
++       * Check that the cached iomap still maps correctly to the filesystem's
++       * internal extent map. FS internal extent maps can change while iomap
++       * is iterating a cached iomap, so this hook allows iomap to detect that
++       * the iomap needs to be refreshed during a long running write
++       * operation.
++       *
++       * The filesystem can store internal state (e.g. a sequence number) in
++       * iomap->validity_cookie when the iomap is first mapped to be able to
++       * detect changes between mapping time and whenever .iomap_valid() is
++       * called.
++       *
++       * This is called with the folio over the specified file position held
++       * locked by the iomap code.
++       */
++      bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
+ };
+ /*
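As a concrete illustration of the new hook, a filesystem could back ->iomap_valid with a per-inode extent-map sequence counter: sample it into iomap->validity_cookie when the mapping is built in ->iomap_begin, then compare it again under the locked folio. The sketch below is an assumption-laden example; example_inode, EXAMPLE_I() and i_map_seq are invented names, not code from this series.

	/* Assumed per-inode state: i_map_seq is bumped on every extent map change. */
	static bool example_iomap_valid(struct inode *inode,
			const struct iomap *iomap)
	{
		struct example_inode *ei = EXAMPLE_I(inode);	/* hypothetical */

		return iomap->validity_cookie == READ_ONCE(ei->i_map_seq);
	}

	static const struct iomap_page_ops example_page_ops = {
		.iomap_valid	= example_iomap_valid,
	};

	/* In ->iomap_begin, while the mapping is built under the extent lock: */
	iomap->validity_cookie = READ_ONCE(ei->i_map_seq);
	iomap->page_ops = &example_page_ops;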
diff --git a/queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch b/queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch
new file mode 100644 (file)
index 0000000..ddd0917
--- /dev/null
@@ -0,0 +1,76 @@
+From ffcaa2172cc1a85ddb8b783de96d38ca8855e248 Mon Sep 17 00:00:00 2001
+From: Jarkko Sakkinen <jarkko@kernel.org>
+Date: Mon, 20 May 2024 02:31:53 +0300
+Subject: KEYS: trusted: Fix memory leak in tpm2_key_encode()
+
+From: Jarkko Sakkinen <jarkko@kernel.org>
+
+commit ffcaa2172cc1a85ddb8b783de96d38ca8855e248 upstream.
+
+'scratch' is never freed. Fix this by calling kfree() in both the success
+and the error case.
+
+Cc: stable@vger.kernel.org # +v5.13
+Fixes: f2219745250f ("security: keys: trusted: use ASN.1 TPM2 key format for the blobs")
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/keys/trusted-keys/trusted_tpm2.c |   24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+--- a/security/keys/trusted-keys/trusted_tpm2.c
++++ b/security/keys/trusted-keys/trusted_tpm2.c
+@@ -38,6 +38,7 @@ static int tpm2_key_encode(struct truste
+       u8 *end_work = scratch + SCRATCH_SIZE;
+       u8 *priv, *pub;
+       u16 priv_len, pub_len;
++      int ret;
+       priv_len = get_unaligned_be16(src) + 2;
+       priv = src;
+@@ -57,8 +58,10 @@ static int tpm2_key_encode(struct truste
+               unsigned char bool[3], *w = bool;
+               /* tag 0 is emptyAuth */
+               w = asn1_encode_boolean(w, w + sizeof(bool), true);
+-              if (WARN(IS_ERR(w), "BUG: Boolean failed to encode"))
+-                      return PTR_ERR(w);
++              if (WARN(IS_ERR(w), "BUG: Boolean failed to encode")) {
++                      ret = PTR_ERR(w);
++                      goto err;
++              }
+               work = asn1_encode_tag(work, end_work, 0, bool, w - bool);
+       }
+@@ -69,8 +72,10 @@ static int tpm2_key_encode(struct truste
+        * trigger, so if it does there's something nefarious going on
+        */
+       if (WARN(work - scratch + pub_len + priv_len + 14 > SCRATCH_SIZE,
+-               "BUG: scratch buffer is too small"))
+-              return -EINVAL;
++               "BUG: scratch buffer is too small")) {
++              ret = -EINVAL;
++              goto err;
++      }
+       work = asn1_encode_integer(work, end_work, options->keyhandle);
+       work = asn1_encode_octet_string(work, end_work, pub, pub_len);
+@@ -79,10 +84,17 @@ static int tpm2_key_encode(struct truste
+       work1 = payload->blob;
+       work1 = asn1_encode_sequence(work1, work1 + sizeof(payload->blob),
+                                    scratch, work - scratch);
+-      if (WARN(IS_ERR(work1), "BUG: ASN.1 encoder failed"))
+-              return PTR_ERR(work1);
++      if (WARN(IS_ERR(work1), "BUG: ASN.1 encoder failed")) {
++              ret = PTR_ERR(work1);
++              goto err;
++      }
++      kfree(scratch);
+       return work1 - payload->blob;
++
++err:
++      kfree(scratch);
++      return ret;
+ }
+ struct tpm2_key_context {
diff --git a/queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch b/queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch
new file mode 100644 (file)
index 0000000..7f75251
--- /dev/null
@@ -0,0 +1,40 @@
+From 77e01b49e35f24ebd1659096d5fc5c3b75975545 Mon Sep 17 00:00:00 2001
+From: Mengqi Zhang <mengqi.zhang@mediatek.com>
+Date: Mon, 25 Dec 2023 17:38:40 +0800
+Subject: mmc: core: Add HS400 tuning in HS400es initialization
+
+From: Mengqi Zhang <mengqi.zhang@mediatek.com>
+
+commit 77e01b49e35f24ebd1659096d5fc5c3b75975545 upstream.
+
+During initialization to the HS400es stage, add an HS400 tuning flow as an
+optional step. For MediaTek IP, HS400es mode requires a specific tuning
+sequence to ensure the correct HS400 timing setting.
+
+Signed-off-by: Mengqi Zhang <mengqi.zhang@mediatek.com>
+Link: https://lore.kernel.org/r/20231225093839.22931-2-mengqi.zhang@mediatek.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Cc: "Lin Gui (æ¡‚æž—)" <Lin.Gui@mediatek.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/core/mmc.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/drivers/mmc/core/mmc.c
++++ b/drivers/mmc/core/mmc.c
+@@ -1819,8 +1819,13 @@ static int mmc_init_card(struct mmc_host
+               if (err)
+                       goto free_card;
+-
+-      } else if (!mmc_card_hs400es(card)) {
++      } else if (mmc_card_hs400es(card)) {
++              if (host->ops->execute_hs400_tuning) {
++                      err = host->ops->execute_hs400_tuning(host, card);
++                      if (err)
++                              goto free_card;
++              }
++      } else {
+               /* Select the desired bus width optionally */
+               err = mmc_select_bus_width(card);
+               if (err > 0 && mmc_card_hs(card)) {
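On the host driver side, this optional step only runs when the controller provides an execute_hs400_tuning() hook in its mmc_host_ops. A minimal, hypothetical hookup (illustrative names, not the MediaTek driver) could look like:

	/* Illustrative only: the controller-specific tuning sequence goes here. */
	static int example_execute_hs400_tuning(struct mmc_host *host,
						struct mmc_card *card)
	{
		/* issue the vendor-specific HS400 tuning commands */
		return 0;
	}

	static const struct mmc_host_ops example_mmc_ops = {
		.execute_hs400_tuning	= example_execute_hs400_tuning,
		/* ...other host ops... */
	};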
diff --git a/queue-6.1/series b/queue-6.1/series
index 18d55dd23b0a4ff8997493a0dfdf888c26dc1942..1538bc3e5dd6a3ee98ffe44d93cdf7cae292c676 100644 (file)
@@ -5,3 +5,29 @@ ice-remove-unnecessary-duplicate-checks-for-vf-vsi-id.patch
 pinctrl-core-handle-radix_tree_insert-errors-in-pinctrl_register_one_pin.patch
 mfd-stpmic1-fix-swapped-mask-unmask-in-irq-chip.patch
 nfsd-don-t-allow-nfsd-threads-to-be-signalled.patch
+keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch
+mmc-core-add-hs400-tuning-in-hs400es-initialization.patch
+xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch
+xfs-punching-delalloc-extents-on-write-failure-is-racy.patch
+xfs-use-byte-ranges-for-write-cleanup-ranges.patch
+xfs-iomap-move-delalloc-punching-to-iomap.patch
+iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch
+xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch
+iomap-write-iomap-validity-checks.patch
+xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch
+xfs-drop-write-error-injection-is-unfixable-remove-it.patch
+xfs-fix-off-by-one-block-in-xfs_discard_folio.patch
+xfs-fix-incorrect-error-out-in-xfs_remove.patch
+xfs-fix-sb-write-verify-for-lazysbcount.patch
+xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch
+xfs-invalidate-block-device-page-cache-during-unmount.patch
+xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch
+xfs-wait-iclog-complete-before-tearing-down-ail.patch
+xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch
+xfs-hoist-refcount-record-merge-predicates.patch
+xfs-estimate-post-merge-refcounts-correctly.patch
+xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch
+xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch
+xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch
+xfs-get-root-inode-correctly-at-bulkstat.patch
+xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch
diff --git a/queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch b/queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch
new file mode 100644 (file)
index 0000000..5375a0b
--- /dev/null
@@ -0,0 +1,64 @@
+From stable+bounces-42910-greg=kroah.com@vger.kernel.org Wed May  1 20:42:14 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:09 -0700
+Subject: xfs: allow inode inactivation during a ro mount log recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-21-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 76e589013fec672c3587d6314f2d1f0aeddc26d9 ]
+
+In the next patch, we're going to prohibit log recovery if the primary
+superblock contains an unrecognized rocompat feature bit even on
+readonly mounts.  This requires removing all the code in the log
+mounting process that temporarily disables the readonly state.
+
+Unfortunately, inode inactivation disables itself on readonly mounts.
+Clearing the iunlinked lists after log recovery needs inactivation to
+run to free the unreferenced inodes, which (AFAICT) is the only reason
+why log mounting plays games with the readonly state in the first place.
+
+Therefore, change the inactivation predicates to allow inactivation
+during log recovery of a readonly mount.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c |   14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1652,8 +1652,11 @@ xfs_inode_needs_inactive(
+       if (VFS_I(ip)->i_mode == 0)
+               return false;
+-      /* If this is a read-only mount, don't do this (would generate I/O) */
+-      if (xfs_is_readonly(mp))
++      /*
++       * If this is a read-only mount, don't do this (would generate I/O)
++       * unless we're in log recovery and cleaning the iunlinked list.
++       */
++      if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
+               return false;
+       /* If the log isn't running, push inodes straight to reclaim. */
+@@ -1713,8 +1716,11 @@ xfs_inactive(
+       mp = ip->i_mount;
+       ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
+-      /* If this is a read-only mount, don't do this (would generate I/O) */
+-      if (xfs_is_readonly(mp))
++      /*
++       * If this is a read-only mount, don't do this (would generate I/O)
++       * unless we're in log recovery and cleaning the iunlinked list.
++       */
++      if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
+               goto out;
+       /* Metadata inodes require explicit resource cleanup. */
diff --git a/queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch b/queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch
new file mode 100644 (file)
index 0000000..da77037
--- /dev/null
@@ -0,0 +1,158 @@
+From stable+bounces-42904-greg=kroah.com@vger.kernel.org Wed May  1 20:42:01 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:03 -0700
+Subject: xfs: attach dquots to inode before reading data/cow fork mappings
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-15-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 ]
+
+I've been running near-continuous integration testing of online fsck,
+and I've noticed that once a day, one of the ARM VMs will fail the test
+with out of order records in the data fork.
+
+xfs/804 races fsstress with online scrub (aka scan but do not change
+anything), so I think this might be a bug in the core xfs code.  This
+also only seems to trigger if one runs the test for more than ~6 minutes
+via TIME_FACTOR=13 or something.
+https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree/tests/xfs/804?h=djwong-wtf
+
+I added a debugging patch to the kernel to check the data fork extents
+after taking the ILOCK, before dropping ILOCK, and before and after each
+bmapping operation.  So far I've narrowed it down to the delalloc code
+inserting a record in the wrong place in the iext tree:
+
+xfs_bmap_add_extent_hole_delay, near line 2691:
+
+       case 0:
+               /*
+                * New allocation is not contiguous with another
+                * delayed allocation.
+                * Insert a new entry.
+                */
+               oldlen = newlen = 0;
+               xfs_iunlock_check_datafork(ip);         <-- ok here
+               xfs_iext_insert(ip, icur, new, state);
+               xfs_iunlock_check_datafork(ip);         <-- bad here
+               break;
+       }
+
+I recorded the state of the data fork mappings and iext cursor state
+when a corrupt data fork is detected immediately after the
+xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc:
+
+ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork:
+    ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1
+    ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0
+    ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0
+
+Here we see that a delalloc extent was inserted into the wrong position
+in the iext leaf, same as all the other times.  The extra trace data I
+collected are as follows:
+
+ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000
+    ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0
+    ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0
+    ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0
+    ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0
+    ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00
+    ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00
+    ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00
+    ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00
+
+The first line shows that xfs_bmapi_reserve_delalloc was called with
+whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6.
+
+The second line ("oldgot") shows the contents of @got at the beginning
+of the call, which are the results of the first iext lookup in
+xfs_buffered_write_iomap_begin.
+
+Line 3 ("crapgot") is the result of duplicating the cursor at the start
+of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup
+at @off.
+
+Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right
+before the call to xfs_bmap_add_extent_hole_delay.  Totally garbage.
+
+Line 5 ("nowgot") is contents of @got after the
+xfs_bmap_add_extent_hole_delay call.
+
+Line 6 is the contents of @icur at the beginning of the call.  Lines 7-9
+are the contents of the iext cursors at the point where the block
+mappings were sampled.
+
+I think @oldgot is a HOLESTARTBLOCK extent because the first lookup
+didn't find anything, so we filled in imap with "fake hole until the
+end".  At the time of the first lookup, I suspect that there's only one
+32-block unwritten extent in the mapping (hence oldicurpos==1) but by
+the time we get to recording crapgot, crapicurpos==2.
+
+Dave then added:
+
+Ok, that's much simpler to reason about, and implies the smoke is
+coming from xfs_buffered_write_iomap_begin() or
+xfs_bmapi_reserve_delalloc(). I suspect the former - it does a lot
+of stuff with the ILOCK_EXCL held.....
+
+.... including calling xfs_qm_dqattach_locked().
+
+xfs_buffered_write_iomap_begin
+  ILOCK_EXCL
+  look up icur
+  xfs_qm_dqattach_locked
+    xfs_qm_dqattach_one
+      xfs_qm_dqget_inode
+        dquot cache miss
+        xfs_iunlock(ip, XFS_ILOCK_EXCL);
+        error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+        xfs_ilock(ip, XFS_ILOCK_EXCL);
+  ....
+  xfs_bmapi_reserve_delalloc(icur)
+
+Yup, that's what is letting the magic smoke out -
+xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we
+can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all
+goes downhill from there.
+
+Back to Darrick now:
+
+So.  Fix this by moving the dqattach_locked call up before we take the
+ILOCK, like all the other callers in that file.
+
+Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -968,6 +968,10 @@ xfs_buffered_write_iomap_begin(
+       ASSERT(!XFS_IS_REALTIME_INODE(ip));
++      error = xfs_qm_dqattach(ip);
++      if (error)
++              return error;
++
+       error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+       if (error)
+               return error;
+@@ -1071,10 +1075,6 @@ xfs_buffered_write_iomap_begin(
+                       allocfork = XFS_COW_FORK;
+       }
+-      error = xfs_qm_dqattach_locked(ip, false);
+-      if (error)
+-              goto out_unlock;
+-
+       if (eof && offset + count > XFS_ISIZE(ip)) {
+               /*
+                * Determine the initial size of the preallocation.
diff --git a/queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch b/queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch
new file mode 100644 (file)
index 0000000..9c6ed27
--- /dev/null
@@ -0,0 +1,212 @@
+From stable+bounces-42897-greg=kroah.com@vger.kernel.org Wed May  1 20:41:44 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:57 -0700
+Subject: xfs: drop write error injection is unfixable, remove it
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-9-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 6e8af15ccdc4e138a5b529c1901a0013e1dcaa09 ]
+
+With the changes to scan the page cache for dirty data to avoid data
+corruptions from partial write cleanup racing with other page cache
+operations, the drop writes error injection no longer works the same
+way it used to and causes xfs/196 to fail. This is because xfs/196
+writes to the file and populates the page cache before it turns on
+the error injection and starts failing -overwrites-.
+
+The result is that the original drop-writes code failed writes only
+-after- overwriting the data in the cache, followed by invalidating
+the cached data, then punching out the delalloc extent from under
+that data.
+
+On the surface, this looks fine. The problem is that page cache
+invalidation *doesn't guarantee that it removes anything from the
+page cache* and it doesn't change the dirty state of the folio. When
+block size == page size and we do page aligned IO (as xfs/196 does)
+everything happens to align perfectly and page cache invalidation
+removes the single page folios that span the written data. Hence the
+followup delalloc punch pass does not find cached data over that
+range and it can punch the extent out.
+
+IOWs, xfs/196 "works" for block size == page size with the new
+code. I say "works", because it actually only works for the case
+where IO is page aligned, and no data was read from disk before
+writes occur. Because the moment we actually read data first, the
+readahead code allocates multipage folios and suddenly the
+invalidate code goes back to zeroing subfolio ranges without
+changing dirty state.
+
+Hence, with multipage folios in play, block size == page size is
+functionally identical to block size < page size behaviour, and
+drop-writes is manifestly broken w.r.t to this case. Invalidation of
+a subfolio range doesn't result in the folio being removed from the
+cache, just the range gets zeroed. Hence after we've sequentially
+walked over a folio that we've dirtied (via write data) and then
+invalidated, we end up with a dirty folio full of zeroed data.
+
+And because the new code skips punching ranges that have dirty
+folios covering them, we end up leaving the delalloc range intact
+after failing all the writes. Hence failed writes now end up
+writing zeroes to disk in the cases where invalidation zeroes folios
+rather than removing them from cache.
+
+This is a fundamental change of behaviour that is needed to avoid
+the data corruption vectors that exist in the old write fail path,
+and it renders the drop-writes injection non-functional and
+unworkable as it stands.
+
+As it is, I think the error injection is also now unnecessary, as
+partial writes that need delalloc extent are going to be a lot more
+common with stale iomap detection in place. Hence this patch removes
+the drop-writes error injection completely. xfs/196 can remain for
+testing kernels that don't have this data corruption fix, but those
+that do will report:
+
+xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_errortag.h |   12 +++++-------
+ fs/xfs/xfs_error.c           |   27 ++++++++++++++++++++-------
+ fs/xfs/xfs_iomap.c           |    9 ---------
+ 3 files changed, 25 insertions(+), 23 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_errortag.h
++++ b/fs/xfs/libxfs/xfs_errortag.h
+@@ -40,13 +40,12 @@
+ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE                        25
+ #define XFS_ERRTAG_BMAP_FINISH_ONE                    26
+ #define XFS_ERRTAG_AG_RESV_CRITICAL                   27
++
+ /*
+- * DEBUG mode instrumentation to test and/or trigger delayed allocation
+- * block killing in the event of failed writes. When enabled, all
+- * buffered writes are silenty dropped and handled as if they failed.
+- * All delalloc blocks in the range of the write (including pre-existing
+- * delalloc blocks!) are tossed as part of the write failure error
+- * handling sequence.
++ * Drop-writes support removed because write error handling cannot trash
++ * pre-existing delalloc extents in any useful way anymore. We retain the
++ * definition so that we can reject it as an invalid value in
++ * xfs_errortag_valid().
+  */
+ #define XFS_ERRTAG_DROP_WRITES                                28
+ #define XFS_ERRTAG_LOG_BAD_CRC                                29
+@@ -95,7 +94,6 @@
+ #define XFS_RANDOM_REFCOUNT_FINISH_ONE                        1
+ #define XFS_RANDOM_BMAP_FINISH_ONE                    1
+ #define XFS_RANDOM_AG_RESV_CRITICAL                   4
+-#define XFS_RANDOM_DROP_WRITES                                1
+ #define XFS_RANDOM_LOG_BAD_CRC                                1
+ #define XFS_RANDOM_LOG_ITEM_PIN                               1
+ #define XFS_RANDOM_BUF_LRU_REF                                2
+--- a/fs/xfs/xfs_error.c
++++ b/fs/xfs/xfs_error.c
+@@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_
+       XFS_RANDOM_REFCOUNT_FINISH_ONE,
+       XFS_RANDOM_BMAP_FINISH_ONE,
+       XFS_RANDOM_AG_RESV_CRITICAL,
+-      XFS_RANDOM_DROP_WRITES,
++      0, /* XFS_RANDOM_DROP_WRITES has been removed */
+       XFS_RANDOM_LOG_BAD_CRC,
+       XFS_RANDOM_LOG_ITEM_PIN,
+       XFS_RANDOM_BUF_LRU_REF,
+@@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_u
+ XFS_ERRORTAG_ATTR_RW(refcount_finish_one,     XFS_ERRTAG_REFCOUNT_FINISH_ONE);
+ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
+ XFS_ERRORTAG_ATTR_RW(ag_resv_critical,        XFS_ERRTAG_AG_RESV_CRITICAL);
+-XFS_ERRORTAG_ATTR_RW(drop_writes,     XFS_ERRTAG_DROP_WRITES);
+ XFS_ERRORTAG_ATTR_RW(log_bad_crc,     XFS_ERRTAG_LOG_BAD_CRC);
+ XFS_ERRORTAG_ATTR_RW(log_item_pin,    XFS_ERRTAG_LOG_ITEM_PIN);
+ XFS_ERRORTAG_ATTR_RW(buf_lru_ref,     XFS_ERRTAG_BUF_LRU_REF);
+@@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_at
+       XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
+       XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
+       XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
+-      XFS_ERRORTAG_ATTR_LIST(drop_writes),
+       XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
+       XFS_ERRORTAG_ATTR_LIST(log_item_pin),
+       XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
+@@ -256,6 +254,19 @@ xfs_errortag_del(
+       kmem_free(mp->m_errortag);
+ }
++static bool
++xfs_errortag_valid(
++      unsigned int            error_tag)
++{
++      if (error_tag >= XFS_ERRTAG_MAX)
++              return false;
++
++      /* Error out removed injection types */
++      if (error_tag == XFS_ERRTAG_DROP_WRITES)
++              return false;
++      return true;
++}
++
+ bool
+ xfs_errortag_test(
+       struct xfs_mount        *mp,
+@@ -277,7 +288,9 @@ xfs_errortag_test(
+       if (!mp->m_errortag)
+               return false;
+-      ASSERT(error_tag < XFS_ERRTAG_MAX);
++      if (!xfs_errortag_valid(error_tag))
++              return false;
++
+       randfactor = mp->m_errortag[error_tag];
+       if (!randfactor || prandom_u32_max(randfactor))
+               return false;
+@@ -293,7 +306,7 @@ xfs_errortag_get(
+       struct xfs_mount        *mp,
+       unsigned int            error_tag)
+ {
+-      if (error_tag >= XFS_ERRTAG_MAX)
++      if (!xfs_errortag_valid(error_tag))
+               return -EINVAL;
+       return mp->m_errortag[error_tag];
+@@ -305,7 +318,7 @@ xfs_errortag_set(
+       unsigned int            error_tag,
+       unsigned int            tag_value)
+ {
+-      if (error_tag >= XFS_ERRTAG_MAX)
++      if (!xfs_errortag_valid(error_tag))
+               return -EINVAL;
+       mp->m_errortag[error_tag] = tag_value;
+@@ -319,7 +332,7 @@ xfs_errortag_add(
+ {
+       BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
+-      if (error_tag >= XFS_ERRTAG_MAX)
++      if (!xfs_errortag_valid(error_tag))
+               return -EINVAL;
+       return xfs_errortag_set(mp, error_tag,
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1190,15 +1190,6 @@ xfs_buffered_write_iomap_end(
+       struct xfs_mount        *mp = XFS_M(inode->i_sb);
+       int                     error;
+-      /*
+-       * Behave as if the write failed if drop writes is enabled. Set the NEW
+-       * flag to force delalloc cleanup.
+-       */
+-      if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
+-              iomap->flags |= IOMAP_F_NEW;
+-              written = 0;
+-      }
+-
+       error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
+                       length, written, &xfs_buffered_write_delalloc_punch);
+       if (error && !xfs_is_shutdown(mp)) {
diff --git a/queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch b/queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch
new file mode 100644 (file)
index 0000000..8562e2f
--- /dev/null
@@ -0,0 +1,114 @@
+From stable+bounces-42908-greg=kroah.com@vger.kernel.org Wed May  1 20:42:09 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:07 -0700
+Subject: xfs: estimate post-merge refcounts correctly
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Xiao Yang <yangx.jy@fujitsu.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-19-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit b25d1984aa884fc91a73a5a407b9ac976d441e9b ]
+
+Upon enabling fsdax + reflink for XFS, xfs/179 began to report refcount
+metadata corruptions after being run.  Specifically, xfs_repair noticed
+single-block refcount records that could be combined but had not been.
+
+The root cause of this is improper MAXREFCOUNT edge case handling in
+xfs_refcount_merge_extents.  When we're trying to find candidates for a
+refcount btree record merge, we compute the refcount attribute of the
+merged record, but we fail to account for the fact that once a record
+hits rc_refcount == MAXREFCOUNT, it is pinned that way forever.  Hence
+the computed refcount is wrong, and we fail to merge the extents.
+
+Fix this by adjusting the merge predicates to compute the adjusted
+refcount correctly.
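+
+As an illustration only (hypothetical values, not taken from the patch),
+consider an increment adjustment (adjust == 1) where the center record is
+already pinned at MAXREFCOUNT:
+
+ left.rc_refcount  == MAXREFCOUNT   /* left shoulder record */
+ cleft.rc_refcount == MAXREFCOUNT   /* pinned, stays at MAXREFCOUNT */
+
+ old predicate: MAXREFCOUNT != MAXREFCOUNT + 1, so the merge is skipped
+ new predicate: xfs_refc_merge_refcount(&cleft, 1) == MAXREFCOUNT, which
+                matches left.rc_refcount, so the records merge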
+
+Fixes: 3172725814f9 ("xfs: adjust refcount of an extent of blocks in refcount btree")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Xiao Yang <yangx.jy@fujitsu.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_refcount.c |   25 +++++++++++++++++++++----
+ 1 file changed, 21 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -820,6 +820,17 @@ xfs_refc_valid(
+       return rc->rc_startblock != NULLAGBLOCK;
+ }
++static inline xfs_nlink_t
++xfs_refc_merge_refcount(
++      const struct xfs_refcount_irec  *irec,
++      enum xfs_refc_adjust_op         adjust)
++{
++      /* Once a record hits MAXREFCOUNT, it is pinned there forever */
++      if (irec->rc_refcount == MAXREFCOUNT)
++              return MAXREFCOUNT;
++      return irec->rc_refcount + adjust;
++}
++
+ static inline bool
+ xfs_refc_want_merge_center(
+       const struct xfs_refcount_irec  *left,
+@@ -831,6 +842,7 @@ xfs_refc_want_merge_center(
+       unsigned long long              *ulenp)
+ {
+       unsigned long long              ulen = left->rc_blockcount;
++      xfs_nlink_t                     new_refcount;
+       /*
+        * To merge with a center record, both shoulder records must be
+@@ -846,9 +858,10 @@ xfs_refc_want_merge_center(
+               return false;
+       /* The shoulder record refcounts must match the new refcount. */
+-      if (left->rc_refcount != cleft->rc_refcount + adjust)
++      new_refcount = xfs_refc_merge_refcount(cleft, adjust);
++      if (left->rc_refcount != new_refcount)
+               return false;
+-      if (right->rc_refcount != cleft->rc_refcount + adjust)
++      if (right->rc_refcount != new_refcount)
+               return false;
+       /*
+@@ -871,6 +884,7 @@ xfs_refc_want_merge_left(
+       enum xfs_refc_adjust_op         adjust)
+ {
+       unsigned long long              ulen = left->rc_blockcount;
++      xfs_nlink_t                     new_refcount;
+       /*
+        * For a left merge, the left shoulder record must be adjacent to the
+@@ -881,7 +895,8 @@ xfs_refc_want_merge_left(
+               return false;
+       /* Left shoulder record refcount must match the new refcount. */
+-      if (left->rc_refcount != cleft->rc_refcount + adjust)
++      new_refcount = xfs_refc_merge_refcount(cleft, adjust);
++      if (left->rc_refcount != new_refcount)
+               return false;
+       /*
+@@ -903,6 +918,7 @@ xfs_refc_want_merge_right(
+       enum xfs_refc_adjust_op         adjust)
+ {
+       unsigned long long              ulen = right->rc_blockcount;
++      xfs_nlink_t                     new_refcount;
+       /*
+        * For a right merge, the right shoulder record must be adjacent to the
+@@ -913,7 +929,8 @@ xfs_refc_want_merge_right(
+               return false;
+       /* Right shoulder record refcount must match the new refcount. */
+-      if (right->rc_refcount != cright->rc_refcount + adjust)
++      new_refcount = xfs_refc_merge_refcount(cright, adjust);
++      if (right->rc_refcount != new_refcount)
+               return false;
+       /*
diff --git a/queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch b/queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch
new file mode 100644 (file)
index 0000000..ab484f5
--- /dev/null
@@ -0,0 +1,36 @@
+From stable+bounces-42900-greg=kroah.com@vger.kernel.org Wed May  1 20:41:50 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:59 -0700
+Subject: xfs: fix incorrect error-out in xfs_remove
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Andrey Albershteyn <aalbersh@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-11-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 2653d53345bda90604f673bb211dd060a5a5c232 ]
+
+Clean up resources if resetting the dotdot entry doesn't succeed.
+Observed through code inspection.
+
+Fixes: 5838d0356bb3 ("xfs: reset child dir '..' entry when unlinking child")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Andrey Albershteyn <aalbersh@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2479,7 +2479,7 @@ xfs_remove(
+                       error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+                                       tp->t_mountp->m_sb.sb_rootino, 0);
+                       if (error)
+-                              return error;
++                              goto out_trans_cancel;
+               }
+       } else {
+               /*
diff --git a/queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch b/queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch
new file mode 100644 (file)
index 0000000..db2681d
--- /dev/null
@@ -0,0 +1,81 @@
+From stable+bounces-42902-greg=kroah.com@vger.kernel.org Wed May  1 20:41:56 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:01 -0700
+Subject: xfs: fix incorrect i_nlink caused by inode racing
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Long Li <leo.lilong@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-13-leah.rumancik@gmail.com>
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit 28b4b0596343d19d140da059eee0e5c2b5328731 ]
+
+The following error occurred during the fsstress test:
+
+XFS: Assertion failed: VFS_I(ip)->i_nlink >= 2, file: fs/xfs/xfs_inode.c, line: 2452
+
+The problem was that an inode race condition caused an incorrect i_nlink to
+be written to disk, and then read back into memory. Consider the following
+call graph: for inodes that are marked as both XFS_IFLUSHING and
+XFS_IRECLAIMABLE, i_nlink will be reset to 1 and then restored to its
+original value in xfs_reinit_inode(). Therefore, the i_nlink of a directory
+on disk may be set to 1.
+
+  xfsaild
+      xfs_inode_item_push
+          xfs_iflush_cluster
+              xfs_iflush
+                  xfs_inode_to_disk
+
+  xfs_iget
+      xfs_iget_cache_hit
+          xfs_iget_recycle
+              xfs_reinit_inode
+                  inode_init_always
+
+xfs_reinit_inode() needs to hold the ILOCK_EXCL as it is changing internal
+inode state and can race with other RCU protected inode lookups. On the
+read side, xfs_iflush_cluster() grabs the ILOCK_SHARED while under rcu +
+ip->i_flags_lock, and so xfs_iflush/xfs_inode_to_disk() are protected from
+racing inode updates (during transactions) by that lock.
+
+Fixes: ff7bebeb91f8 ("xfs: refactor the inode recycling code") # goes further back than this
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -342,6 +342,9 @@ xfs_iget_recycle(
+       trace_xfs_iget_recycle(ip);
++      if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
++              return -EAGAIN;
++
+       /*
+        * We need to make it look like the inode is being reclaimed to prevent
+        * the actual reclaim workers from stomping over us while we recycle
+@@ -355,6 +358,7 @@ xfs_iget_recycle(
+       ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+       error = xfs_reinit_inode(mp, inode);
++      xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       if (error) {
+               /*
+                * Re-initializing the inode failed, and we are in deep
+@@ -523,6 +527,8 @@ xfs_iget_cache_hit(
+       if (ip->i_flags & XFS_IRECLAIMABLE) {
+               /* Drops i_flags_lock and RCU read lock. */
+               error = xfs_iget_recycle(pag, ip);
++              if (error == -EAGAIN)
++                      goto out_skip;
+               if (error)
+                       return error;
+       } else {
diff --git a/queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch b/queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch
new file mode 100644 (file)
index 0000000..077e24f
--- /dev/null
@@ -0,0 +1,95 @@
+From stable+bounces-42911-greg=kroah.com@vger.kernel.org Wed May  1 20:42:18 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:10 -0700
+Subject: xfs: fix log recovery when unknown rocompat bits are set
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-22-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 74ad4693b6473950e971b3dc525b5ee7570e05d0 ]
+
+Log recovery has always run on read only mounts, even where the primary
+superblock advertises unknown rocompat bits.  Due to a misunderstanding
+between Eric and Darrick back in 2018, we accidentally changed the
+superblock write verifier to shut down the fs over that exact scenario.
+As a result, the log cleaning that occurs at the end of the mounting
+process fails if there are unknown rocompat bits set.
+
+As we now allow writing of the superblock if there are unknown rocompat
+bits set on a RO mount, we no longer want to turn off RO state to allow
+log recovery to succeed on a RO mount.  Hence we also remove all the
+(now unnecessary) RO state toggling from the log recovery path.
+
+Fixes: 9e037cb7972f ("xfs: check for unknown v5 feature bits in superblock write verifier")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_sb.c |    3 ++-
+ fs/xfs/xfs_log.c       |   17 -----------------
+ 2 files changed, 2 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_sb.c
++++ b/fs/xfs/libxfs/xfs_sb.c
+@@ -266,7 +266,8 @@ xfs_validate_sb_write(
+               return -EFSCORRUPTED;
+       }
+-      if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
++      if (!xfs_is_readonly(mp) &&
++          xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+               xfs_alert(mp,
+ "Corruption detected in superblock read-only compatible features (0x%x)!",
+                       (sbp->sb_features_ro_compat &
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -730,15 +730,7 @@ xfs_log_mount(
+        * just worked.
+        */
+       if (!xfs_has_norecovery(mp)) {
+-              /*
+-               * log recovery ignores readonly state and so we need to clear
+-               * mount-based read only state so it can write to disk.
+-               */
+-              bool    readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
+-                                              &mp->m_opstate);
+               error = xlog_recover(log);
+-              if (readonly)
+-                      set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+               if (error) {
+                       xfs_warn(mp, "log mount/recovery failed: error %d",
+                               error);
+@@ -787,7 +779,6 @@ xfs_log_mount_finish(
+       struct xfs_mount        *mp)
+ {
+       struct xlog             *log = mp->m_log;
+-      bool                    readonly;
+       int                     error = 0;
+       if (xfs_has_norecovery(mp)) {
+@@ -796,12 +787,6 @@ xfs_log_mount_finish(
+       }
+       /*
+-       * log recovery ignores readonly state and so we need to clear
+-       * mount-based read only state so it can write to disk.
+-       */
+-      readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+-
+-      /*
+        * During the second phase of log recovery, we need iget and
+        * iput to behave like they do for an active filesystem.
+        * xfs_fs_drop_inode needs to be able to prevent the deletion
+@@ -850,8 +835,6 @@ xfs_log_mount_finish(
+       xfs_buftarg_drain(mp->m_ddev_targp);
+       clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+-      if (readonly)
+-              set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+       /* Make sure the log is dead if we're returning failure. */
+       ASSERT(!error || xlog_is_shutdown(log));
diff --git a/queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch b/queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch
new file mode 100644 (file)
index 0000000..f015f03
--- /dev/null
@@ -0,0 +1,83 @@
+From stable+bounces-42898-greg=kroah.com@vger.kernel.org Wed May  1 20:41:46 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:58 -0700
+Subject: xfs: fix off-by-one-block in xfs_discard_folio()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Pengfei Xu <pengfei.xu@intel.com>, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-10-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 8ac5b996bf5199f15b7687ceae989f8b2a410dda ]
+
+The recent writeback corruption fixes changed the code in
+xfs_discard_folio() to calculate a byte range for punching
+delalloc extents. A mistake was made in using round_up(pos) for the
+end offset, because when pos points at the first byte of a block, it
+does not get rounded up to point to the end byte of the block. Hence
+the punch range is short, and this leads to unexpected behaviour in
+certain cases in xfs_bmap_punch_delalloc_range.
+
+e.g. pos = 0 means we call xfs_bmap_punch_delalloc_range(0,0), so
+there is no previous extent and it rounds up the punch to the end of
+the delalloc extent it found at offset 0, not the end of the range
+given to xfs_bmap_punch_delalloc_range().
+
+Fix this by handling the zero block offset case correctly.
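+
+As a worked example (illustrative values only, assuming a 4096-byte folio
+at offset 0 and pos == 0):
+
+ round_up(0, 4096)                     = 0     /* old end: empty punch range */
+ folio_pos(folio) + folio_size(folio)  = 4096  /* new end: covers the folio */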
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=217030
+Link: https://lore.kernel.org/linux-xfs/Y+vOfaxIWX1c%2Fyy9@bfoster/
+Fixes: 7348b322332d ("xfs: xfs_bmap_punch_delalloc_range() should take a byte range")
+Reported-by: Pengfei Xu <pengfei.xu@intel.com>
+Found-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c |   21 ++++++++++++++-------
+ 1 file changed, 14 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -439,15 +439,17 @@ xfs_prepare_ioend(
+ }
+ /*
+- * If the page has delalloc blocks on it, we need to punch them out before we
+- * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
+- * inode that can trip up a later direct I/O read operation on the same region.
++ * If the folio has delalloc blocks on it, the caller is asking us to punch them
++ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
++ * page that needs to be dirtied again before the delalloc mapping can be
++ * converted. This stale delalloc mapping can trip up a later direct I/O read
++ * operation on the same region.
+  *
+- * We prevent this by truncating away the delalloc regions on the page.  Because
++ * We prevent this by truncating away the delalloc regions on the folio. Because
+  * they are delalloc, we can do this without needing a transaction. Indeed - if
+  * we get ENOSPC errors, we have to be able to do this truncation without a
+- * transaction as there is no space left for block reservation (typically why we
+- * see a ENOSPC in writeback).
++ * transaction as there is no space left for block reservation (typically why
++ * we see a ENOSPC in writeback).
+  */
+ static void
+ xfs_discard_folio(
+@@ -465,8 +467,13 @@ xfs_discard_folio(
+               "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+                       folio, ip->i_ino, pos);
++      /*
++       * The end of the punch range is always the offset of the first
++       * byte of the next folio. Hence the end offset is only dependent on the
++       * folio itself and not the start offset that is passed in.
++       */
+       error = xfs_bmap_punch_delalloc_range(ip, pos,
+-                      round_up(pos, folio_size(folio)));
++                              folio_pos(folio) + folio_size(folio));
+       if (error && !xfs_is_shutdown(mp))
+               xfs_alert(mp, "page discard unable to remove delalloc mapping.");
diff --git a/queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch b/queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch
new file mode 100644 (file)
index 0000000..d9d5590
--- /dev/null
@@ -0,0 +1,152 @@
+From stable+bounces-42901-greg=kroah.com@vger.kernel.org Wed May  1 20:41:52 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:00 -0700
+Subject: xfs: fix sb write verify for lazysbcount
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Long Li <leo.lilong@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-12-leah.rumancik@gmail.com>
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit 59f6ab40fd8735c9a1a15401610a31cc06a0bbd6 ]
+
+When lazysbcount is enabled, fsstress and loop mount/unmount test report
+the following problems:
+
+XFS (loop0): SB summary counter sanity check failed
+XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460,
+       xfs_sb block 0x0
+XFS (loop0): Unmount and run xfs_repair
+XFS (loop0): First 128 bytes of corrupted metadata buffer:
+00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00  XFSB.........(..
+00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
+00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a  i.|._.D..t....4Z
+00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80  ..... ..........
+00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82  ................
+00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00  ................
+00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00  ................
+00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19  ................
+XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply
+       +0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580).  Shutting down filesystem.
+XFS (loop0): Please unmount the filesystem and rectify the problem(s)
+XFS (loop0): log mount/recovery failed: error -117
+XFS (loop0): log mount failed
+
+This corruption will shut down the file system and the file system will
+no longer be mountable. The following script can reproduce the problem,
+but it may take a long time.
+
+ #!/bin/bash
+
+ device=/dev/sda
+ testdir=/mnt/test
+ round=0
+
+ function fail()
+ {
+        echo "$*"
+        exit 1
+ }
+
+ mkdir -p $testdir
+ while [ $round -lt 10000 ]
+ do
+        echo "******* round $round ********"
+        mkfs.xfs -f $device
+        mount $device $testdir || fail "mount failed!"
+        fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null &
+        sleep 4
+        killall -w fsstress
+        umount $testdir
+        xfs_repair -e $device > /dev/null
+        if [ $? -eq 2 ];then
+                echo "ERR CODE 2: Dirty log exception during repair."
+                exit 1
+        fi
+        round=$(($round+1))
+ done
+
+With lazysbcount enabled, there is no additional lock protection for
+reading m_ifree and m_icount in xfs_log_sb(); if another CPU modifies
+m_ifree, this can make m_ifree greater than m_icount. For example,
+consider the following sequence where ifreedelta is positive:
+
+ CPU0                           CPU1
+ xfs_log_sb                     xfs_trans_unreserve_and_mod_sb
+ ----------                     ------------------------------
+ percpu_counter_sum(&mp->m_icount)
+                                percpu_counter_add_batch(&mp->m_icount,
+                                               idelta, XFS_ICOUNT_BATCH)
+                                percpu_counter_add(&mp->m_ifree, ifreedelta);
+ percpu_counter_sum(&mp->m_ifree)
+
+After this, an incorrect inode count (sb_ifree > sb_icount) will be written
+to the log. In the subsequent writing of the sb, this incorrect inode count
+(sb_ifree > sb_icount) will fail to pass the boundary check in
+xfs_validate_sb_write(), causing the file system to shut down.
+
+When lazysbcount is enabled, we don't need to guarantee that Lazy sb
+counters are completely correct, but we do need to guarantee that sb_ifree
+<= sb_icount. On the other hand, the constraint that m_ifree <= m_icount
+must be satisfied any time that there /cannot/ be other threads allocating
+or freeing inode chunks. If the constraint is violated under these
+circumstances, sb_i{count,free} (the ondisk superblock inode counters)
+may be incorrect and need to be marked sick at unmount; the count will
+be rebuilt on the next mount.
+
+Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks")
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_sb.c |    4 +++-
+ fs/xfs/xfs_mount.c     |   15 +++++++++++++++
+ 2 files changed, 18 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_sb.c
++++ b/fs/xfs/libxfs/xfs_sb.c
+@@ -973,7 +973,9 @@ xfs_log_sb(
+        */
+       if (xfs_has_lazysbcount(mp)) {
+               mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+-              mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
++              mp->m_sb.sb_ifree = min_t(uint64_t,
++                              percpu_counter_sum(&mp->m_ifree),
++                              mp->m_sb.sb_icount);
+               mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+       }
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -538,6 +538,20 @@ xfs_check_summary_counts(
+       return 0;
+ }
++static void
++xfs_unmount_check(
++      struct xfs_mount        *mp)
++{
++      if (xfs_is_shutdown(mp))
++              return;
++
++      if (percpu_counter_sum(&mp->m_ifree) >
++                      percpu_counter_sum(&mp->m_icount)) {
++              xfs_alert(mp, "ifree/icount mismatch at unmount");
++              xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
++      }
++}
++
+ /*
+  * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+  * internal inode structures can be sitting in the CIL and AIL at this point,
+@@ -1077,6 +1091,7 @@ xfs_unmountfs(
+       if (error)
+               xfs_warn(mp, "Unable to free reserved block pool. "
+                               "Freespace may not be correct on next mount.");
++      xfs_unmount_check(mp);
+       xfs_log_unmount(mp);
+       xfs_da_unmount(mp);
diff --git a/queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch b/queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch
new file mode 100644 (file)
index 0000000..922b8ba
--- /dev/null
@@ -0,0 +1,147 @@
+From stable+bounces-42906-greg=kroah.com@vger.kernel.org Wed May  1 20:42:06 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:05 -0700
+Subject: xfs: fix super block buf log item UAF during force shutdown
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Guo Xuenan <guoxuenan@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-17-leah.rumancik@gmail.com>
+
+From: Guo Xuenan <guoxuenan@huawei.com>
+
+[ Upstream commit 575689fc0ffa6c4bb4e72fd18e31a6525a6124e0 ]
+
+An xfs log IO error will trigger an xlog shutdown, and the end_io worker
+calls xlog_state_shutdown_callbacks to unpin and release the buf log item.
+The race condition is that if some threads are doing transaction commits
+and happen not to be intercepted by xlog_is_shutdown, these log items will
+be inserted into the CIL; when those buf log items are later unpinned and
+released, a UAF will occur. BTW, adding a delay before `xlog_cil_commit`
+can increase the recurrence probability.
+
+The following call graph actually encountered this bad situation.
+fsstress                    io end worker kworker/0:1H-216
+                            xlog_ioend_work
+                              ->xlog_force_shutdown
+                                ->xlog_state_shutdown_callbacks
+                                  ->xlog_cil_process_committed
+                                    ->xlog_cil_committed
+                                      ->xfs_trans_committed_bulk
+->xfs_trans_apply_sb_deltas             ->li_ops->iop_unpin(lip, 1);
+  ->xfs_trans_getsb
+    ->_xfs_trans_bjoin
+      ->xfs_buf_item_init
+        ->if (bip) { return 0;} //relog
+->xlog_cil_commit
+  ->xlog_cil_insert_items //insert into CIL
+                                           ->xfs_buf_ioend_fail(bp);
+                                             ->xfs_buf_ioend
+                                               ->xfs_buf_item_done
+                                                 ->xfs_buf_item_relse
+                                                   ->xfs_buf_item_free
+
+When the CIL push worker gathers the percpu CIL and inserts the super block
+buf log item into ctx->log_items, the UAF occurs.
+
+==================================================================
+BUG: KASAN: use-after-free in xlog_cil_push_work+0x1c8f/0x22f0
+Write of size 8 at addr ffff88801800f3f0 by task kworker/u4:4/105
+
+CPU: 0 PID: 105 Comm: kworker/u4:4 Tainted: G W
+6.1.0-rc1-00001-g274115149b42 #136
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+1.13.0-1ubuntu1.1 04/01/2014
+Workqueue: xfs-cil/sda xlog_cil_push_work
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x4d/0x66
+ print_report+0x171/0x4a6
+ kasan_report+0xb3/0x130
+ xlog_cil_push_work+0x1c8f/0x22f0
+ process_one_work+0x6f9/0xf70
+ worker_thread+0x578/0xf30
+ kthread+0x28c/0x330
+ ret_from_fork+0x1f/0x30
+ </TASK>
+
+Allocated by task 2145:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ __kasan_slab_alloc+0x54/0x60
+ kmem_cache_alloc+0x14a/0x510
+ xfs_buf_item_init+0x160/0x6d0
+ _xfs_trans_bjoin+0x7f/0x2e0
+ xfs_trans_getsb+0xb6/0x3f0
+ xfs_trans_apply_sb_deltas+0x1f/0x8c0
+ __xfs_trans_commit+0xa25/0xe10
+ xfs_symlink+0xe23/0x1660
+ xfs_vn_symlink+0x157/0x280
+ vfs_symlink+0x491/0x790
+ do_symlinkat+0x128/0x220
+ __x64_sys_symlink+0x7a/0x90
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+Freed by task 216:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ kasan_save_free_info+0x2a/0x40
+ __kasan_slab_free+0x105/0x1a0
+ kmem_cache_free+0xb6/0x460
+ xfs_buf_ioend+0x1e9/0x11f0
+ xfs_buf_item_unpin+0x3d6/0x840
+ xfs_trans_committed_bulk+0x4c2/0x7c0
+ xlog_cil_committed+0xab6/0xfb0
+ xlog_cil_process_committed+0x117/0x1e0
+ xlog_state_shutdown_callbacks+0x208/0x440
+ xlog_force_shutdown+0x1b3/0x3a0
+ xlog_ioend_work+0xef/0x1d0
+ process_one_work+0x6f9/0xf70
+ worker_thread+0x578/0xf30
+ kthread+0x28c/0x330
+ ret_from_fork+0x1f/0x30
+
+The buggy address belongs to the object at ffff88801800f388
+ which belongs to the cache xfs_buf_item of size 272
+The buggy address is located 104 bytes inside of
+ 272-byte region [ffff88801800f388, ffff88801800f498)
+
+The buggy address belongs to the physical page:
+page:ffffea0000600380 refcount:1 mapcount:0 mapping:0000000000000000
+index:0xffff88801800f208 pfn:0x1800e
+head:ffffea0000600380 order:1 compound_mapcount:0 compound_pincount:0
+flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff)
+raw: 001fffff80010200 ffffea0000699788 ffff88801319db50 ffff88800fb50640
+raw: ffff88801800f208 000000000015000a 00000001ffffffff 0000000000000000
+page dumped because: kasan: bad access detected
+
+Memory state around the buggy address:
+ ffff88801800f280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff88801800f300: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc
+>ffff88801800f380: fc fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                                                             ^
+ ffff88801800f400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff88801800f480: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc
+==================================================================
+Disabling lock debugging due to kernel taint
+
+Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -1018,6 +1018,8 @@ xfs_buf_item_relse(
+       trace_xfs_buf_item_relse(bp, _RET_IP_);
+       ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
++      if (atomic_read(&bip->bli_refcount))
++              return;
+       bp->b_log_item = NULL;
+       xfs_buf_rele(bp);
+       xfs_buf_item_free(bip);
diff --git a/queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch b/queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch
new file mode 100644 (file)
index 0000000..0b40f76
--- /dev/null
@@ -0,0 +1,49 @@
+From stable+bounces-42912-greg=kroah.com@vger.kernel.org Wed May  1 20:42:18 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:11 -0700
+Subject: xfs: get root inode correctly at bulkstat
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Hironori Shiina <shiina.hironori@gmail.com>, Hironori Shiina <shiina.hironori@fujitsu.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-23-leah.rumancik@gmail.com>
+
+From: Hironori Shiina <shiina.hironori@gmail.com>
+
+[ Upstream commit 817644fa4525258992f17fecf4f1d6cdd2e1b731 ]
+
+The root inode number should be set to `breq->startino` for getting stat
+information of the root when XFS_BULK_IREQ_SPECIAL_ROOT is used.
+Otherwise, the inode search is started from 1
+(XFS_BULK_IREQ_SPECIAL_ROOT) and the inode with the lowest number in a
+filesystem is returned.
+
+Fixes: bf3cb3944792 ("xfs: allow single bulkstat of special inodes")
+Signed-off-by: Hironori Shiina <shiina.hironori@fujitsu.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -754,7 +754,7 @@ xfs_bulkstat_fmt(
+ static int
+ xfs_bulk_ireq_setup(
+       struct xfs_mount        *mp,
+-      struct xfs_bulk_ireq    *hdr,
++      const struct xfs_bulk_ireq *hdr,
+       struct xfs_ibulk        *breq,
+       void __user             *ubuffer)
+ {
+@@ -780,7 +780,7 @@ xfs_bulk_ireq_setup(
+               switch (hdr->ino) {
+               case XFS_BULK_IREQ_SPECIAL_ROOT:
+-                      hdr->ino = mp->m_sb.sb_rootino;
++                      breq->startino = mp->m_sb.sb_rootino;
+                       break;
+               default:
+                       return -EINVAL;
diff --git a/queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch b/queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch
new file mode 100644 (file)
index 0000000..7437219
--- /dev/null
@@ -0,0 +1,188 @@
+From stable+bounces-42907-greg=kroah.com@vger.kernel.org Wed May  1 20:42:07 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:06 -0700
+Subject: xfs: hoist refcount record merge predicates
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Xiao Yang <yangx.jy@fujitsu.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-18-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 9d720a5a658f5135861773f26e927449bef93d61 ]
+
+Hoist these multiline conditionals into separate static inline helpers
+to improve readability and set the stage for corruption fixes that will
+be introduced in the next patch.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Xiao Yang <yangx.jy@fujitsu.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_refcount.c |  129 +++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 113 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -815,11 +815,119 @@ out_error:
+ /* Is this extent valid? */
+ static inline bool
+ xfs_refc_valid(
+-      struct xfs_refcount_irec        *rc)
++      const struct xfs_refcount_irec  *rc)
+ {
+       return rc->rc_startblock != NULLAGBLOCK;
+ }
++static inline bool
++xfs_refc_want_merge_center(
++      const struct xfs_refcount_irec  *left,
++      const struct xfs_refcount_irec  *cleft,
++      const struct xfs_refcount_irec  *cright,
++      const struct xfs_refcount_irec  *right,
++      bool                            cleft_is_cright,
++      enum xfs_refc_adjust_op         adjust,
++      unsigned long long              *ulenp)
++{
++      unsigned long long              ulen = left->rc_blockcount;
++
++      /*
++       * To merge with a center record, both shoulder records must be
++       * adjacent to the record we want to adjust.  This is only true if
++       * find_left and find_right made all four records valid.
++       */
++      if (!xfs_refc_valid(left)  || !xfs_refc_valid(right) ||
++          !xfs_refc_valid(cleft) || !xfs_refc_valid(cright))
++              return false;
++
++      /* There must only be one record for the entire range. */
++      if (!cleft_is_cright)
++              return false;
++
++      /* The shoulder record refcounts must match the new refcount. */
++      if (left->rc_refcount != cleft->rc_refcount + adjust)
++              return false;
++      if (right->rc_refcount != cleft->rc_refcount + adjust)
++              return false;
++
++      /*
++       * The new record cannot exceed the max length.  ulen is a ULL as the
++       * individual record block counts can be up to (u32 - 1) in length
++       * hence we need to catch u32 addition overflows here.
++       */
++      ulen += cleft->rc_blockcount + right->rc_blockcount;
++      if (ulen >= MAXREFCEXTLEN)
++              return false;
++
++      *ulenp = ulen;
++      return true;
++}
++
++static inline bool
++xfs_refc_want_merge_left(
++      const struct xfs_refcount_irec  *left,
++      const struct xfs_refcount_irec  *cleft,
++      enum xfs_refc_adjust_op         adjust)
++{
++      unsigned long long              ulen = left->rc_blockcount;
++
++      /*
++       * For a left merge, the left shoulder record must be adjacent to the
++       * start of the range.  If this is true, find_left made left and cleft
++       * contain valid contents.
++       */
++      if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft))
++              return false;
++
++      /* Left shoulder record refcount must match the new refcount. */
++      if (left->rc_refcount != cleft->rc_refcount + adjust)
++              return false;
++
++      /*
++       * The new record cannot exceed the max length.  ulen is a ULL as the
++       * individual record block counts can be up to (u32 - 1) in length
++       * hence we need to catch u32 addition overflows here.
++       */
++      ulen += cleft->rc_blockcount;
++      if (ulen >= MAXREFCEXTLEN)
++              return false;
++
++      return true;
++}
++
++static inline bool
++xfs_refc_want_merge_right(
++      const struct xfs_refcount_irec  *cright,
++      const struct xfs_refcount_irec  *right,
++      enum xfs_refc_adjust_op         adjust)
++{
++      unsigned long long              ulen = right->rc_blockcount;
++
++      /*
++       * For a right merge, the right shoulder record must be adjacent to the
++       * end of the range.  If this is true, find_right made cright and right
++       * contain valid contents.
++       */
++      if (!xfs_refc_valid(right) || !xfs_refc_valid(cright))
++              return false;
++
++      /* Right shoulder record refcount must match the new refcount. */
++      if (right->rc_refcount != cright->rc_refcount + adjust)
++              return false;
++
++      /*
++       * The new record cannot exceed the max length.  ulen is a ULL as the
++       * individual record block counts can be up to (u32 - 1) in length
++       * hence we need to catch u32 addition overflows here.
++       */
++      ulen += cright->rc_blockcount;
++      if (ulen >= MAXREFCEXTLEN)
++              return false;
++
++      return true;
++}
++
+ /*
+  * Try to merge with any extents on the boundaries of the adjustment range.
+  */
+@@ -861,23 +969,15 @@ xfs_refcount_merge_extents(
+                (cleft.rc_blockcount == cright.rc_blockcount);
+       /* Try to merge left, cleft, and right.  cleft must == cright. */
+-      ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+-                      right.rc_blockcount;
+-      if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+-          xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+-          left.rc_refcount == cleft.rc_refcount + adjust &&
+-          right.rc_refcount == cleft.rc_refcount + adjust &&
+-          ulen < MAXREFCEXTLEN) {
++      if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal,
++                              adjust, &ulen)) {
+               *shape_changed = true;
+               return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+                               &right, ulen, aglen);
+       }
+       /* Try to merge left and cleft. */
+-      ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+-      if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+-          left.rc_refcount == cleft.rc_refcount + adjust &&
+-          ulen < MAXREFCEXTLEN) {
++      if (xfs_refc_want_merge_left(&left, &cleft, adjust)) {
+               *shape_changed = true;
+               error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+                               agbno, aglen);
+@@ -893,10 +993,7 @@ xfs_refcount_merge_extents(
+       }
+       /* Try to merge cright and right. */
+-      ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+-      if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+-          right.rc_refcount == cright.rc_refcount + adjust &&
+-          ulen < MAXREFCEXTLEN) {
++      if (xfs_refc_want_merge_right(&cright, &right, adjust)) {
+               *shape_changed = true;
+               return xfs_refcount_merge_right_extent(cur, &right, &cright,
+                               aglen);
diff --git a/queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch b/queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch
new file mode 100644 (file)
index 0000000..32c666b
--- /dev/null
@@ -0,0 +1,71 @@
+From stable+bounces-42903-greg=kroah.com@vger.kernel.org Wed May  1 20:41:59 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:02 -0700
+Subject: xfs: invalidate block device page cache during unmount
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Gao Xiang <hsiangkao@linux.alibaba.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-14-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 032e160305f6872e590c77f11896fb28365c6d6c ]
+
+Every now and then I see fstests failures on aarch64 (64k pages) that
+trigger on the following sequence:
+
+mkfs.xfs $dev
+mount $dev $mnt
+touch $mnt/a
+umount $mnt
+xfs_db -c 'path /a' -c 'print' $dev
+
+99% of the time this succeeds, but every now and then xfs_db cannot find
+/a and fails.  This turns out to be a race involving udev/blkid, the
+page cache for the block device, and the xfs_db process.
+
+udev is triggered whenever anyone closes a block device or unmounts it.
+The default udev rules invoke blkid to read the fs super and create
+symlinks to the bdev under /dev/disk.  For this, it uses buffered reads
+through the page cache.
+
+xfs_db also uses buffered reads to examine metadata.  There is no
+coordination between xfs_db and udev, which means that they can run
+concurrently.  Note there is no coordination between the kernel and
+blkid either.
+
+On a system with 64k pages, the page cache can cache the superblock and
+the root inode (and hence the root dir) with the same 64k page.  If
+udev spawns blkid after the mkfs and the system is busy enough that it
+is still running when xfs_db starts up, they'll both read from the same
+page in the pagecache.
+
+The unmount writes updated inode metadata to disk directly.  The XFS
+buffer cache does not use the bdev pagecache, nor does it invalidate the
+pagecache on umount.  If the above scenario occurs, the pagecache no
+longer reflects what's on disk, xfs_db reads the stale metadata, and
+fails to find /a.  Most of the time this succeeds because closing a bdev
+invalidates the page cache, but when processes race, everyone loses.
+
+Fix the problem by invalidating the bdev pagecache after flushing the
+bdev, so that xfs_db will see up to date metadata.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -1945,6 +1945,7 @@ xfs_free_buftarg(
+       list_lru_destroy(&btp->bt_lru);
+       blkdev_issue_flush(btp->bt_bdev);
++      invalidate_bdev(btp->bt_bdev);
+       fs_put_dax(btp->bt_daxdev, btp->bt_mount);
+       kmem_free(btp);
diff --git a/queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch b/queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch
new file mode 100644 (file)
index 0000000..5fc94e6
--- /dev/null
@@ -0,0 +1,55 @@
+From stable+bounces-42909-greg=kroah.com@vger.kernel.org Wed May  1 20:42:12 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:08 -0700
+Subject: xfs: invalidate xfs_bufs when allocating cow extents
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-20-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 ]
+
+While investigating test failures in xfs/17[1-3] in alwayscow mode, I
+noticed through code inspection that xfs_bmap_alloc_userdata isn't
+setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW
+fork.  COW staging extents should be flagged as USERDATA, since user
+data are persisted to these blocks before being remapped into a file.
+
+This mis-classification has a few impacts on the behavior of the system.
+First, the filestreams allocator is supposed to keep allocating from a
+chosen AG until it runs out of space in that AG.  However, it only does
+that for USERDATA allocations, which means that COW allocations aren't
+tied to the filestreams AG.  Fortunately, few people use filestreams, so
+nobody's noticed.
+
+A more serious problem is that xfs_alloc_ag_vextent_small looks for a
+buffer to invalidate *if* the USERDATA flag is set and the AG is so full
+that the allocation had to come from the AGFL because the cntbt is
+empty.  The consequences of not invalidating the buffer are severe --
+if the AIL incorrectly checkpoints a buffer that is now being used to
+store user data, that action will clobber the user's written data.
+
+Fix filestreams and yet another data corruption vector by flagging COW
+allocations as USERDATA.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata(
+        * the busy list.
+        */
+       bma->datatype = XFS_ALLOC_NOBUSY;
+-      if (whichfork == XFS_DATA_FORK) {
++      if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) {
+               bma->datatype |= XFS_ALLOC_USERDATA;
+               if (bma->offset == 0)
+                       bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
diff --git a/queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch b/queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch
new file mode 100644 (file)
index 0000000..8f55ff6
--- /dev/null
@@ -0,0 +1,188 @@
+From stable+bounces-42893-greg=kroah.com@vger.kernel.org Wed May  1 20:41:32 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:52 -0700
+Subject: xfs,iomap: move delalloc punching to iomap
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-4-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 9c7babf94a0d686b552e53aded8d4703d1b8b92b ]
+
+Because that's what Christoph wants for this error handling path
+only XFS uses.
+
+It requires a new iomap export for handling errors over delalloc
+ranges. This is basically the XFS code as it stands, but even though
+Christoph wants this as iomap functionality, we still have
+to call it from the filesystem specific ->iomap_end callback, and
+call into the iomap code with yet another filesystem specific
+callback to punch the delalloc extent within the defined ranges.
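+
+A minimal usage sketch (hypothetical filesystem "foofs"; the names are
+assumptions, only the helper and callback signatures come from this patch):
+the filesystem supplies its own punch callback and calls the new helper
+from its ->iomap_end method:
+
+ static int foofs_punch_delalloc(struct inode *inode, loff_t offset,
+               loff_t length)
+ {
+       /* drop the delalloc reservation backing [offset, offset + length) */
+       return 0;
+ }
+
+ static int foofs_buffered_write_iomap_end(struct inode *inode, loff_t offset,
+               loff_t length, ssize_t written, unsigned flags,
+               struct iomap *iomap)
+ {
+       return iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
+                       length, written, &foofs_punch_delalloc);
+ }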
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/buffered-io.c |   60 +++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_iomap.c     |   47 ++++++--------------------------------
+ include/linux/iomap.h  |    4 +++
+ 3 files changed, 72 insertions(+), 39 deletions(-)
+
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -827,6 +827,66 @@ iomap_file_buffered_write(struct kiocb *
+ }
+ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
++/*
++ * When a short write occurs, the filesystem may need to remove reserved space
++ * that was allocated in ->iomap_begin from its ->iomap_end method. For
++ * filesystems that use delayed allocation, we need to punch out delalloc
++ * extents from the range that are not dirty in the page cache. As the write can
++ * race with page faults, there can be dirty pages over the delalloc extent
++ * outside the range of a short write but still within the delalloc extent
++ * allocated for this iomap.
++ *
++ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
++ * simplify range iterations, but converts them back to {offset,len} tuples for
++ * the punch callback.
++ */
++int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
++              struct iomap *iomap, loff_t pos, loff_t length,
++              ssize_t written,
++              int (*punch)(struct inode *inode, loff_t pos, loff_t length))
++{
++      loff_t                  start_byte;
++      loff_t                  end_byte;
++      int                     blocksize = i_blocksize(inode);
++      int                     error = 0;
++
++      if (iomap->type != IOMAP_DELALLOC)
++              return 0;
++
++      /* If we didn't reserve the blocks, we're not allowed to punch them. */
++      if (!(iomap->flags & IOMAP_F_NEW))
++              return 0;
++
++      /*
++       * start_byte refers to the first unused block after a short write. If
++       * nothing was written, round offset down to point at the first block in
++       * the range.
++       */
++      if (unlikely(!written))
++              start_byte = round_down(pos, blocksize);
++      else
++              start_byte = round_up(pos + written, blocksize);
++      end_byte = round_up(pos + length, blocksize);
++
++      /* Nothing to do if we've written the entire delalloc extent */
++      if (start_byte >= end_byte)
++              return 0;
++
++      /*
++       * Lock the mapping to avoid races with page faults re-instantiating
++       * folios and dirtying them via ->page_mkwrite between the page cache
++       * truncation and the delalloc extent removal. Failing to do this can
++       * leave dirty pages with no space reservation in the cache.
++       */
++      filemap_invalidate_lock(inode->i_mapping);
++      truncate_pagecache_range(inode, start_byte, end_byte - 1);
++      error = punch(inode, start_byte, end_byte - start_byte);
++      filemap_invalidate_unlock(inode->i_mapping);
++
++      return error;
++}
++EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
++
+ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+ {
+       struct iomap *iomap = &iter->iomap;
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1123,12 +1123,12 @@ out_unlock:
+ static int
+ xfs_buffered_write_delalloc_punch(
+       struct inode            *inode,
+-      loff_t                  start_byte,
+-      loff_t                  end_byte)
++      loff_t                  offset,
++      loff_t                  length)
+ {
+       struct xfs_mount        *mp = XFS_M(inode->i_sb);
+-      xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, start_byte);
+-      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, end_byte);
++      xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, offset);
++      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
+       return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
+                               end_fsb - start_fsb);
+@@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end(
+       unsigned                flags,
+       struct iomap            *iomap)
+ {
+-      struct xfs_mount        *mp = XFS_M(inode->i_sb);
+-      loff_t                  start_byte;
+-      loff_t                  end_byte;
+-      int                     error = 0;
+-      if (iomap->type != IOMAP_DELALLOC)
+-              return 0;
++      struct xfs_mount        *mp = XFS_M(inode->i_sb);
++      int                     error;
+       /*
+        * Behave as if the write failed if drop writes is enabled. Set the NEW
+@@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end(
+               written = 0;
+       }
+-      /* If we didn't reserve the blocks, we're not allowed to punch them. */
+-      if (!(iomap->flags & IOMAP_F_NEW))
+-              return 0;
+-
+-      /*
+-       * start_fsb refers to the first unused block after a short write. If
+-       * nothing was written, round offset down to point at the first block in
+-       * the range.
+-       */
+-      if (unlikely(!written))
+-              start_byte = round_down(offset, mp->m_sb.sb_blocksize);
+-      else
+-              start_byte = round_up(offset + written, mp->m_sb.sb_blocksize);
+-      end_byte = round_up(offset + length, mp->m_sb.sb_blocksize);
+-
+-      /* Nothing to do if we've written the entire delalloc extent */
+-      if (start_byte >= end_byte)
+-              return 0;
+-
+-      /*
+-       * Lock the mapping to avoid races with page faults re-instantiating
+-       * folios and dirtying them via ->page_mkwrite between the page cache
+-       * truncation and the delalloc extent removal. Failing to do this can
+-       * leave dirty pages with no space reservation in the cache.
+-       */
+-      filemap_invalidate_lock(inode->i_mapping);
+-      truncate_pagecache_range(inode, start_byte, end_byte - 1);
+-      error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte);
+-      filemap_invalidate_unlock(inode->i_mapping);
++      error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
++                      length, written, &xfs_buffered_write_delalloc_punch);
+       if (error && !xfs_is_shutdown(mp)) {
+               xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
+                       __func__, XFS_I(inode)->i_ino);
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -226,6 +226,10 @@ static inline const struct iomap *iomap_
+ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+               const struct iomap_ops *ops);
++int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
++              struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
++              int (*punch)(struct inode *inode, loff_t pos, loff_t length));
++
+ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
+ void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
diff --git a/queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch b/queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch
new file mode 100644 (file)
index 0000000..164ca8d
--- /dev/null
@@ -0,0 +1,117 @@
+From stable+bounces-42891-greg=kroah.com@vger.kernel.org Wed May  1 20:41:26 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:50 -0700
+Subject: xfs: punching delalloc extents on write failure is racy
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-2-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 198dd8aedee6a7d2de0dfa739f9a008a938f6848 ]
+
+xfs_buffered_write_iomap_end() has a comment about the safety of
+punching delalloc extents based on holding the IOLOCK_EXCL. This
+comment is wrong, and punching delalloc extents is not race-free.
+
+When we punch out a delalloc extent after a write failure in
+xfs_buffered_write_iomap_end(), we punch out the page cache with
+truncate_pagecache_range() before we punch out the delalloc extents.
+At this point, we only hold the IOLOCK_EXCL, so there is nothing
+stopping mmap() write faults racing with this cleanup operation,
+reinstantiating a folio over the range we are about to punch and
+hence requiring the delalloc extent to be kept.
+
+If this race condition is hit, we can end up with a dirty page in
+the page cache that has no delalloc extent or space reservation
+backing it. This leads to bad things happening at writeback time.
+
+To avoid this race condition, we need the page cache truncation to
+be atomic w.r.t. the extent manipulation. We can do this by holding
+the mapping->invalidate_lock exclusively across this operation -
+this will prevent new pages from being inserted into the page cache
+whilst we are removing the pages and the backing extent and space
+reservation.
+
+Taking the mapping->invalidate_lock exclusively in the buffered
+write IO path is safe - it naturally nests inside the IOLOCK (see
+truncate and fallocate paths). iomap_zero_range() can be called from
+under the mapping->invalidate_lock (from the truncate path via
+either xfs_zero_eof() or xfs_truncate_page()), but iomap_zero_iter()
+will not instantiate new delalloc pages (because it skips holes) and
+hence will never need to punch out delalloc extents on failure.
+
+Fix the locking issue, and clean up the code logic a little to avoid
+unnecessary work if we didn't allocate the delalloc extent or if we
+wrote the entire region we allocated.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c |   41 +++++++++++++++++++++++------------------
+ 1 file changed, 23 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1147,6 +1147,10 @@ xfs_buffered_write_iomap_end(
+               written = 0;
+       }
++      /* If we didn't reserve the blocks, we're not allowed to punch them. */
++      if (!(iomap->flags & IOMAP_F_NEW))
++              return 0;
++
+       /*
+        * start_fsb refers to the first unused block after a short write. If
+        * nothing was written, round offset down to point at the first block in
+@@ -1158,27 +1162,28 @@ xfs_buffered_write_iomap_end(
+               start_fsb = XFS_B_TO_FSB(mp, offset + written);
+       end_fsb = XFS_B_TO_FSB(mp, offset + length);
++      /* Nothing to do if we've written the entire delalloc extent */
++      if (start_fsb >= end_fsb)
++              return 0;
++
+       /*
+-       * Trim delalloc blocks if they were allocated by this write and we
+-       * didn't manage to write the whole range.
+-       *
+-       * We don't need to care about racing delalloc as we hold i_mutex
+-       * across the reserve/allocate/unreserve calls. If there are delalloc
+-       * blocks in the range, they are ours.
++       * Lock the mapping to avoid races with page faults re-instantiating
++       * folios and dirtying them via ->page_mkwrite between the page cache
++       * truncation and the delalloc extent removal. Failing to do this can
++       * leave dirty pages with no space reservation in the cache.
+        */
+-      if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
+-              truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+-                                       XFS_FSB_TO_B(mp, end_fsb) - 1);
+-
+-              error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+-                                             end_fsb - start_fsb);
+-              if (error && !xfs_is_shutdown(mp)) {
+-                      xfs_alert(mp, "%s: unable to clean up ino %lld",
+-                              __func__, ip->i_ino);
+-                      return error;
+-              }
+-      }
++      filemap_invalidate_lock(inode->i_mapping);
++      truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
++                               XFS_FSB_TO_B(mp, end_fsb) - 1);
++      error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
++                                     end_fsb - start_fsb);
++      filemap_invalidate_unlock(inode->i_mapping);
++      if (error && !xfs_is_shutdown(mp)) {
++              xfs_alert(mp, "%s: unable to clean up ino %lld",
++                      __func__, ip->i_ino);
++              return error;
++      }
+       return 0;
+ }
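
The fix above boils down to an ordering rule: take mapping->invalidate_lock exclusively, truncate the page cache over the failed range, punch out the delalloc extent, and only then drop the lock, so a racing ->page_mkwrite cannot re-dirty a folio between the truncation and the punch. A minimal userspace sketch of that ordering follows; the lock, truncate and punch routines are stand-ins defined locally, not the kernel APIs.

#include <stdio.h>
#include <pthread.h>

/* Stand-ins for the kernel objects and helpers; not the real APIs. */
struct mapping {
	pthread_rwlock_t invalidate_lock;   /* models mapping->invalidate_lock */
};

static void truncate_pagecache_range_stub(long long start, long long end)
{
	printf("truncate page cache bytes [%lld, %lld]\n", start, end);
}

static int punch_delalloc_range_stub(long long start, long long end)
{
	printf("punch delalloc bytes [%lld, %lld)\n", start, end);
	return 0;
}

/*
 * Model of the failed-write cleanup: hold the invalidate lock exclusively
 * across both the page cache truncation and the extent punch so that a
 * concurrent ->page_mkwrite cannot re-instantiate and dirty a folio in
 * the window between the two operations.
 */
static int cleanup_failed_write(struct mapping *m, long long start, long long end)
{
	int error;

	pthread_rwlock_wrlock(&m->invalidate_lock);
	truncate_pagecache_range_stub(start, end - 1);
	error = punch_delalloc_range_stub(start, end);
	pthread_rwlock_unlock(&m->invalidate_lock);
	return error;
}

int main(void)
{
	struct mapping m;

	pthread_rwlock_init(&m.invalidate_lock, NULL);
	cleanup_failed_write(&m, 4096, 16384);
	pthread_rwlock_destroy(&m.invalidate_lock);
	return 0;
}
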
diff --git a/queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch b/queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch
new file mode 100644 (file)
index 0000000..eb1c41c
--- /dev/null
@@ -0,0 +1,51 @@
+From stable+bounces-42913-greg=kroah.com@vger.kernel.org Wed May  1 20:42:22 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:12 -0700
+Subject: xfs: short circuit xfs_growfs_data_private() if delta is zero
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Eric Sandeen <sandeen@redhat.com>, "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandanbabu@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-24-leah.rumancik@gmail.com>
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+[ Upstream commit 84712492e6dab803bf595fb8494d11098b74a652 ]
+
+Although xfs_growfs_data() doesn't call xfs_growfs_data_private()
+if in->newblocks == mp->m_sb.sb_dblocks, xfs_growfs_data_private()
+further massages the new block count so that we don't, for example,
+try to create a too-small new AG.
+
+This may lead to a delta of "0" in xfs_growfs_data_private(), so
+we end up in the shrink case and emit the EXPERIMENTAL warning
+even if we're not changing anything at all.
+
+Fix this by returning straightaway if the block delta is zero.
+
+(nb: in older kernels, the result of entering the shrink case
+with delta == 0 may actually let an -ENOSPC escape to userspace,
+which is confusing for users.)
+
+Fixes: fb2fc1720185 ("xfs: support shrinking unused space in the last AG")
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -129,6 +129,10 @@ xfs_growfs_data_private(
+       if (delta < 0 && nagcount < 2)
+               return -EINVAL;
++      /* No work to do */
++      if (delta == 0)
++              return 0;
++
+       oagcount = mp->m_sb.sb_agcount;
+       /* allocate the new per-ag structures */
+       if (nagcount > oagcount) {
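
Worked numbers make the delta == 0 case above concrete: if the requested grow does not add enough blocks for a viable new AG, rounding the new size to whole allocation groups can land back on the current size. The sketch below is illustrative only, with made-up values and a simplified rounding rule, not the XFS implementation.

#include <stdio.h>

/*
 * Illustrative only: round a requested data-device size to whole AGs the
 * way a grow operation conceptually does, and show how the resulting
 * delta can be zero, which is the case the patch short-circuits.
 */
int main(void)
{
	long long agblocks  = 16384;          /* blocks per AG (example value) */
	long long dblocks   = 8 * agblocks;   /* current size: 8 full AGs */
	long long newblocks = dblocks + 100;  /* requested size, < 1 new AG */

	/* A new AG that would be too small is not created... */
	long long nagcount = newblocks / agblocks;
	long long nb       = nagcount * agblocks;

	/* ...so the effective change can collapse to nothing. */
	long long delta = nb - dblocks;

	printf("requested +%lld blocks, effective delta %lld\n",
	       newblocks - dblocks, delta);
	if (delta == 0)
		printf("nothing to do: return 0 instead of entering the shrink path\n");
	return 0;
}
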
diff --git a/queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch b/queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch
new file mode 100644 (file)
index 0000000..2da1ed7
--- /dev/null
@@ -0,0 +1,112 @@
+From stable+bounces-42892-greg=kroah.com@vger.kernel.org Wed May  1 20:41:29 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:51 -0700
+Subject: xfs: use byte ranges for write cleanup ranges
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-3-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit b71f889c18ada210a97aa3eb5e00c0de552234c6 ]
+
+xfs_buffered_write_iomap_end() currently converts the byte ranges
+passed to it into filesystem blocks so it can hand them to the bmap
+code to punch out delalloc blocks, but then has to convert filesystem
+blocks back to byte ranges for the page cache truncation.
+
+We're about to make the page cache truncation go away and replace it
+with a page cache walk, so converting back and forth between byte
+ranges and filesystem blocks is messy and error-prone. It is much
+easier to pass byte ranges around and convert to page indexes and/or
+filesystem blocks only where those units are needed.
+
+In preparation for the page cache walk being added, add a helper
+that converts byte ranges to filesystem blocks and calls
+xfs_bmap_punch_delalloc_range(), and convert
+xfs_buffered_write_iomap_end() to calculate its limits in byte ranges.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c |   40 +++++++++++++++++++++++++---------------
+ 1 file changed, 25 insertions(+), 15 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1121,6 +1121,20 @@ out_unlock:
+ }
+ static int
++xfs_buffered_write_delalloc_punch(
++      struct inode            *inode,
++      loff_t                  start_byte,
++      loff_t                  end_byte)
++{
++      struct xfs_mount        *mp = XFS_M(inode->i_sb);
++      xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, start_byte);
++      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, end_byte);
++
++      return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
++                              end_fsb - start_fsb);
++}
++
++static int
+ xfs_buffered_write_iomap_end(
+       struct inode            *inode,
+       loff_t                  offset,
+@@ -1129,10 +1143,9 @@ xfs_buffered_write_iomap_end(
+       unsigned                flags,
+       struct iomap            *iomap)
+ {
+-      struct xfs_inode        *ip = XFS_I(inode);
+-      struct xfs_mount        *mp = ip->i_mount;
+-      xfs_fileoff_t           start_fsb;
+-      xfs_fileoff_t           end_fsb;
++      struct xfs_mount        *mp = XFS_M(inode->i_sb);
++      loff_t                  start_byte;
++      loff_t                  end_byte;
+       int                     error = 0;
+       if (iomap->type != IOMAP_DELALLOC)
+@@ -1157,13 +1170,13 @@ xfs_buffered_write_iomap_end(
+        * the range.
+        */
+       if (unlikely(!written))
+-              start_fsb = XFS_B_TO_FSBT(mp, offset);
++              start_byte = round_down(offset, mp->m_sb.sb_blocksize);
+       else
+-              start_fsb = XFS_B_TO_FSB(mp, offset + written);
+-      end_fsb = XFS_B_TO_FSB(mp, offset + length);
++              start_byte = round_up(offset + written, mp->m_sb.sb_blocksize);
++      end_byte = round_up(offset + length, mp->m_sb.sb_blocksize);
+       /* Nothing to do if we've written the entire delalloc extent */
+-      if (start_fsb >= end_fsb)
++      if (start_byte >= end_byte)
+               return 0;
+       /*
+@@ -1173,15 +1186,12 @@ xfs_buffered_write_iomap_end(
+        * leave dirty pages with no space reservation in the cache.
+        */
+       filemap_invalidate_lock(inode->i_mapping);
+-      truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+-                               XFS_FSB_TO_B(mp, end_fsb) - 1);
+-
+-      error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+-                                     end_fsb - start_fsb);
++      truncate_pagecache_range(inode, start_byte, end_byte - 1);
++      error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte);
+       filemap_invalidate_unlock(inode->i_mapping);
+       if (error && !xfs_is_shutdown(mp)) {
+-              xfs_alert(mp, "%s: unable to clean up ino %lld",
+-                      __func__, ip->i_ino);
++              xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
++                      __func__, XFS_I(inode)->i_ino);
+               return error;
+       }
+       return 0;
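
The byte-range arithmetic introduced above is easy to check with concrete values: a short write starts the punch at the written end rounded up to a block boundary and ends it at the requested range rounded up likewise. Here is a small standalone sketch with example numbers (the helpers assume a power-of-two block size and are not kernel code).

#include <stdio.h>

/* Rounding helpers matching round_up()/round_down() semantics for
 * power-of-two block sizes. */
static long long round_down_ll(long long x, long long align)
{
	return x & ~(align - 1);
}

static long long round_up_ll(long long x, long long align)
{
	return (x + align - 1) & ~(align - 1);
}

int main(void)
{
	long long blocksize = 4096;
	long long offset    = 10000;  /* start of the buffered write */
	long long length    = 20000;  /* bytes the caller asked to write */
	long long written   = 3000;   /* bytes actually copied before failure */

	long long start_byte, end_byte;

	/* Same shape as the cleanup logic: if nothing was written, start at
	 * the containing block; otherwise start after the last written block. */
	if (written == 0)
		start_byte = round_down_ll(offset, blocksize);
	else
		start_byte = round_up_ll(offset + written, blocksize);
	end_byte = round_up_ll(offset + length, blocksize);

	if (start_byte >= end_byte)
		printf("entire delalloc extent was written, nothing to punch\n");
	else
		printf("punch bytes [%lld, %lld)\n", start_byte, end_byte);
	return 0;
}
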
diff --git a/queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch b/queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch
new file mode 100644 (file)
index 0000000..2e778b0
--- /dev/null
@@ -0,0 +1,387 @@
+From stable+bounces-42899-greg=kroah.com@vger.kernel.org Wed May  1 20:41:46 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:56 -0700
+Subject: xfs: use iomap_valid method to detect stale cached iomaps
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-8-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 304a68b9c63bbfc1f6e159d68e8892fc54a06067 ]
+
+Now that iomap supports a mechanism to validate cached iomaps for
+buffered write operations, hook it up to the XFS buffered write ops
+so that we can avoid data corruptions that result from stale cached
+iomaps. See:
+
+https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
+
+or the ->iomap_valid() introduction commit for exact details of the
+corruption vector.
+
+The validity cookie we store in the iomap is based on the type of
+iomap we return. It is expected that the iomap->flags we set in
+xfs_bmbt_to_iomap() are not perturbed by the iomap core and are
+returned to us in the iomap passed via the .iomap_valid() callback.
+This ensures that the validity cookie is always checking the correct
+inode fork sequence numbers to detect potential changes that affect
+the extent cached by the iomap.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c |    6 +-
+ fs/xfs/xfs_aops.c        |    2 
+ fs/xfs/xfs_iomap.c       |   95 +++++++++++++++++++++++++++++++++++++----------
+ fs/xfs/xfs_iomap.h       |    5 +-
+ fs/xfs/xfs_pnfs.c        |    6 +-
+ 5 files changed, 87 insertions(+), 27 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc(
+        * the extent.  Just return the real extent at this offset.
+        */
+       if (!isnullstartblock(bma.got.br_startblock)) {
+-              xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
++              xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
++                              xfs_iomap_inode_sequence(ip, flags));
+               *seq = READ_ONCE(ifp->if_seq);
+               goto out_trans_cancel;
+       }
+@@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc(
+       XFS_STATS_INC(mp, xs_xstrat_quick);
+       ASSERT(!isnullstartblock(bma.got.br_startblock));
+-      xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
++      xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
++                              xfs_iomap_inode_sequence(ip, flags));
+       *seq = READ_ONCE(ifp->if_seq);
+       if (whichfork == XFS_COW_FORK)
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -372,7 +372,7 @@ retry:
+           isnullstartblock(imap.br_startblock))
+               goto allocate_blocks;
+-      xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
++      xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
+       trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
+       return 0;
+ allocate_blocks:
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -48,13 +48,45 @@ xfs_alert_fsblock_zero(
+       return -EFSCORRUPTED;
+ }
++u64
++xfs_iomap_inode_sequence(
++      struct xfs_inode        *ip,
++      u16                     iomap_flags)
++{
++      u64                     cookie = 0;
++
++      if (iomap_flags & IOMAP_F_XATTR)
++              return READ_ONCE(ip->i_af.if_seq);
++      if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
++              cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
++      return cookie | READ_ONCE(ip->i_df.if_seq);
++}
++
++/*
++ * Check that the iomap passed to us is still valid for the given offset and
++ * length.
++ */
++static bool
++xfs_iomap_valid(
++      struct inode            *inode,
++      const struct iomap      *iomap)
++{
++      return iomap->validity_cookie ==
++                      xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags);
++}
++
++const struct iomap_page_ops xfs_iomap_page_ops = {
++      .iomap_valid            = xfs_iomap_valid,
++};
++
+ int
+ xfs_bmbt_to_iomap(
+       struct xfs_inode        *ip,
+       struct iomap            *iomap,
+       struct xfs_bmbt_irec    *imap,
+       unsigned int            mapping_flags,
+-      u16                     iomap_flags)
++      u16                     iomap_flags,
++      u64                     sequence_cookie)
+ {
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_buftarg      *target = xfs_inode_buftarg(ip);
+@@ -91,6 +123,9 @@ xfs_bmbt_to_iomap(
+       if (xfs_ipincount(ip) &&
+           (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+               iomap->flags |= IOMAP_F_DIRTY;
++
++      iomap->validity_cookie = sequence_cookie;
++      iomap->page_ops = &xfs_iomap_page_ops;
+       return 0;
+ }
+@@ -195,7 +230,8 @@ xfs_iomap_write_direct(
+       xfs_fileoff_t           offset_fsb,
+       xfs_fileoff_t           count_fsb,
+       unsigned int            flags,
+-      struct xfs_bmbt_irec    *imap)
++      struct xfs_bmbt_irec    *imap,
++      u64                     *seq)
+ {
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+@@ -285,6 +321,7 @@ xfs_iomap_write_direct(
+               error = xfs_alert_fsblock_zero(ip, imap);
+ out_unlock:
++      *seq = xfs_iomap_inode_sequence(ip, 0);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return error;
+@@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin(
+       bool                    shared = false;
+       u16                     iomap_flags = 0;
+       unsigned int            lockmode = XFS_ILOCK_SHARED;
++      u64                     seq;
+       ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
+@@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin(
+                       goto out_unlock;
+       }
++      seq = xfs_iomap_inode_sequence(ip, iomap_flags);
+       xfs_iunlock(ip, lockmode);
+       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+-      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
++      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
+ allocate_blocks:
+       error = -EAGAIN;
+@@ -839,24 +878,26 @@ allocate_blocks:
+       xfs_iunlock(ip, lockmode);
+       error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+-                      flags, &imap);
++                      flags, &imap, &seq);
+       if (error)
+               return error;
+       trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+-                               iomap_flags | IOMAP_F_NEW);
++                               iomap_flags | IOMAP_F_NEW, seq);
+ out_found_cow:
+-      xfs_iunlock(ip, lockmode);
+       length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+       trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+       if (imap.br_startblock != HOLESTARTBLOCK) {
+-              error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
++              seq = xfs_iomap_inode_sequence(ip, 0);
++              error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+               if (error)
+-                      return error;
++                      goto out_unlock;
+       }
+-      return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
++      seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
++      xfs_iunlock(ip, lockmode);
++      return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+ out_unlock:
+       if (lockmode)
+@@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin(
+       int                     allocfork = XFS_DATA_FORK;
+       int                     error = 0;
+       unsigned int            lockmode = XFS_ILOCK_EXCL;
++      u64                     seq;
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+@@ -1094,26 +1136,31 @@ retry:
+        * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+        * them out if the write happens to fail.
+        */
++      seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
+-      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
++      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
+ found_imap:
++      seq = xfs_iomap_inode_sequence(ip, 0);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+-      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
++      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+ found_cow:
+-      xfs_iunlock(ip, XFS_ILOCK_EXCL);
++      seq = xfs_iomap_inode_sequence(ip, 0);
+       if (imap.br_startoff <= offset_fsb) {
+-              error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
++              error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+               if (error)
+-                      return error;
++                      goto out_unlock;
++              seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
++              xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+-                                       IOMAP_F_SHARED);
++                                       IOMAP_F_SHARED, seq);
+       }
+       xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+-      return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
++      xfs_iunlock(ip, XFS_ILOCK_EXCL);
++      return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
+ out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+@@ -1193,6 +1240,7 @@ xfs_read_iomap_begin(
+       int                     nimaps = 1, error = 0;
+       bool                    shared = false;
+       unsigned int            lockmode = XFS_ILOCK_SHARED;
++      u64                     seq;
+       ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
+@@ -1206,13 +1254,14 @@ xfs_read_iomap_begin(
+                              &nimaps, 0);
+       if (!error && (flags & IOMAP_REPORT))
+               error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
++      seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
+       xfs_iunlock(ip, lockmode);
+       if (error)
+               return error;
+       trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+       return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+-                               shared ? IOMAP_F_SHARED : 0);
++                               shared ? IOMAP_F_SHARED : 0, seq);
+ }
+ const struct iomap_ops xfs_read_iomap_ops = {
+@@ -1237,6 +1286,7 @@ xfs_seek_iomap_begin(
+       struct xfs_bmbt_irec    imap, cmap;
+       int                     error = 0;
+       unsigned                lockmode;
++      u64                     seq;
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+@@ -1271,8 +1321,9 @@ xfs_seek_iomap_begin(
+               if (data_fsb < cow_fsb + cmap.br_blockcount)
+                       end_fsb = min(end_fsb, data_fsb);
+               xfs_trim_extent(&cmap, offset_fsb, end_fsb);
++              seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+               error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+-                                        IOMAP_F_SHARED);
++                              IOMAP_F_SHARED, seq);
+               /*
+                * This is a COW extent, so we must probe the page cache
+                * because there could be dirty page cache being backed
+@@ -1293,8 +1344,9 @@ xfs_seek_iomap_begin(
+       imap.br_startblock = HOLESTARTBLOCK;
+       imap.br_state = XFS_EXT_NORM;
+ done:
++      seq = xfs_iomap_inode_sequence(ip, 0);
+       xfs_trim_extent(&imap, offset_fsb, end_fsb);
+-      error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
++      error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+ out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return error;
+@@ -1320,6 +1372,7 @@ xfs_xattr_iomap_begin(
+       struct xfs_bmbt_irec    imap;
+       int                     nimaps = 1, error = 0;
+       unsigned                lockmode;
++      int                     seq;
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+@@ -1336,12 +1389,14 @@ xfs_xattr_iomap_begin(
+       error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+                              &nimaps, XFS_BMAPI_ATTRFORK);
+ out_unlock:
++
++      seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
+       xfs_iunlock(ip, lockmode);
+       if (error)
+               return error;
+       ASSERT(nimaps);
+-      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
++      return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq);
+ }
+ const struct iomap_ops xfs_xattr_iomap_ops = {
+--- a/fs/xfs/xfs_iomap.h
++++ b/fs/xfs/xfs_iomap.h
+@@ -13,14 +13,15 @@ struct xfs_bmbt_irec;
+ int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
+               xfs_fileoff_t count_fsb, unsigned int flags,
+-              struct xfs_bmbt_irec *imap);
++              struct xfs_bmbt_irec *imap, u64 *sequence);
+ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+ xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
+               xfs_fileoff_t end_fsb);
++u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags);
+ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+               struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
+-              u16 iomap_flags);
++              u16 iomap_flags, u64 sequence_cookie);
+ int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+               bool *did_zero);
+--- a/fs/xfs/xfs_pnfs.c
++++ b/fs/xfs/xfs_pnfs.c
+@@ -125,6 +125,7 @@ xfs_fs_map_blocks(
+       int                     nimaps = 1;
+       uint                    lock_flags;
+       int                     error = 0;
++      u64                     seq;
+       if (xfs_is_shutdown(mp))
+               return -EIO;
+@@ -176,6 +177,7 @@ xfs_fs_map_blocks(
+       lock_flags = xfs_ilock_data_map_shared(ip);
+       error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+                               &imap, &nimaps, bmapi_flags);
++      seq = xfs_iomap_inode_sequence(ip, 0);
+       ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
+@@ -189,7 +191,7 @@ xfs_fs_map_blocks(
+               xfs_iunlock(ip, lock_flags);
+               error = xfs_iomap_write_direct(ip, offset_fsb,
+-                              end_fsb - offset_fsb, 0, &imap);
++                              end_fsb - offset_fsb, 0, &imap, &seq);
+               if (error)
+                       goto out_unlock;
+@@ -209,7 +211,7 @@ xfs_fs_map_blocks(
+       }
+       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+-      error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
++      error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
+       *device_generation = mp->m_generation;
+       return error;
+ out_unlock:
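
The validity cookie above is just a 64-bit packing of fork sequence counters: the COW fork sequence in the high 32 bits when the mapping is shared, and the data fork sequence in the low 32 bits, so a change to either fork invalidates the cached mapping. A hedged userspace sketch of the packing and the comparison, using stand-in types rather than the kernel structures:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

/* Stand-in for the per-fork sequence counters an inode carries. */
struct fork_seqs {
	uint32_t data_seq;
	uint32_t cow_seq;
	bool     shared;    /* mapping came from the COW fork */
};

/* Pack the relevant sequence numbers into one 64-bit cookie. */
static uint64_t make_cookie(const struct fork_seqs *s)
{
	uint64_t cookie = 0;

	if (s->shared)
		cookie = (uint64_t)s->cow_seq << 32;
	return cookie | s->data_seq;
}

/* A cached mapping is valid only if neither fork changed since it was built. */
static bool mapping_still_valid(uint64_t cached_cookie, const struct fork_seqs *now)
{
	return cached_cookie == make_cookie(now);
}

int main(void)
{
	struct fork_seqs s = { .data_seq = 7, .cow_seq = 3, .shared = true };
	uint64_t cookie = make_cookie(&s);

	s.cow_seq++;            /* e.g. writeback allocated a COW extent */
	printf("cached mapping valid: %s\n",
	       mapping_still_valid(cookie, &s) ? "yes" : "no");
	return 0;
}
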
diff --git a/queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch b/queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch
new file mode 100644 (file)
index 0000000..eefa890
--- /dev/null
@@ -0,0 +1,185 @@
+From stable+bounces-42905-greg=kroah.com@vger.kernel.org Wed May  1 20:42:02 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:41:04 -0700
+Subject: xfs: wait iclog complete before tearing down AIL
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Guo Xuenan <guoxuenan@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-16-leah.rumancik@gmail.com>
+
+From: Guo Xuenan <guoxuenan@huawei.com>
+
+[ Upstream commit 1eb52a6a71981b80f9acbd915acd6a05a5037196 ]
+
+Fix a use-after-free in xfs_trans_ail_delete() during xlog force shutdown.
+Commit cd6f79d1fb32 ("xfs: run callbacks before waking waiters in
+xlog_state_shutdown_callbacks") changed the order of running callbacks
+and waiting for iclog completion so that the unmount path does not tear
+down the AIL prematurely. That alone is not enough to ensure this;
+adding an mdelay() in `xfs_buf_item_unpin` demonstrates the problem.
+
+The reproduction is shown below. To tear down the AIL safely, we must
+wait for all xlog ioend workers to finish and sync the AIL.
+
+==================================================================
+BUG: KASAN: use-after-free in xfs_trans_ail_delete+0x240/0x2a0
+Read of size 8 at addr ffff888023169400 by task kworker/1:1H/43
+
+CPU: 1 PID: 43 Comm: kworker/1:1H Tainted: G        W
+6.1.0-rc1-00002-gc28266863c4a #137
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+1.13.0-1ubuntu1.1 04/01/2014
+Workqueue: xfs-log/sda xlog_ioend_work
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x4d/0x66
+ print_report+0x171/0x4a6
+ kasan_report+0xb3/0x130
+ xfs_trans_ail_delete+0x240/0x2a0
+ xfs_buf_item_done+0x7b/0xa0
+ xfs_buf_ioend+0x1e9/0x11f0
+ xfs_buf_item_unpin+0x4c8/0x860
+ xfs_trans_committed_bulk+0x4c2/0x7c0
+ xlog_cil_committed+0xab6/0xfb0
+ xlog_cil_process_committed+0x117/0x1e0
+ xlog_state_shutdown_callbacks+0x208/0x440
+ xlog_force_shutdown+0x1b3/0x3a0
+ xlog_ioend_work+0xef/0x1d0
+ process_one_work+0x6f9/0xf70
+ worker_thread+0x578/0xf30
+ kthread+0x28c/0x330
+ ret_from_fork+0x1f/0x30
+ </TASK>
+
+Allocated by task 9606:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ __kasan_kmalloc+0x7a/0x90
+ __kmalloc+0x59/0x140
+ kmem_alloc+0xb2/0x2f0
+ xfs_trans_ail_init+0x20/0x320
+ xfs_log_mount+0x37e/0x690
+ xfs_mountfs+0xe36/0x1b40
+ xfs_fs_fill_super+0xc5c/0x1a70
+ get_tree_bdev+0x3c5/0x6c0
+ vfs_get_tree+0x85/0x250
+ path_mount+0xec3/0x1830
+ do_mount+0xef/0x110
+ __x64_sys_mount+0x150/0x1f0
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+Freed by task 9662:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ kasan_save_free_info+0x2a/0x40
+ __kasan_slab_free+0x105/0x1a0
+ __kmem_cache_free+0x99/0x2d0
+ kvfree+0x3a/0x40
+ xfs_log_unmount+0x60/0xf0
+ xfs_unmountfs+0xf3/0x1d0
+ xfs_fs_put_super+0x78/0x300
+ generic_shutdown_super+0x151/0x400
+ kill_block_super+0x9a/0xe0
+ deactivate_locked_super+0x82/0xe0
+ deactivate_super+0x91/0xb0
+ cleanup_mnt+0x32a/0x4a0
+ task_work_run+0x15f/0x240
+ exit_to_user_mode_prepare+0x188/0x190
+ syscall_exit_to_user_mode+0x12/0x30
+ do_syscall_64+0x42/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+The buggy address belongs to the object at ffff888023169400
+ which belongs to the cache kmalloc-128 of size 128
+The buggy address is located 0 bytes inside of
+ 128-byte region [ffff888023169400, ffff888023169480)
+
+The buggy address belongs to the physical page:
+page:ffffea00008c5a00 refcount:1 mapcount:0 mapping:0000000000000000
+index:0xffff888023168f80 pfn:0x23168
+head:ffffea00008c5a00 order:1 compound_mapcount:0 compound_pincount:0
+flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff)
+raw: 001fffff80010200 ffffea00006b3988 ffffea0000577a88 ffff88800f842ac0
+raw: ffff888023168f80 0000000000150007 00000001ffffffff 0000000000000000
+page dumped because: kasan: bad access detected
+
+Memory state around the buggy address:
+ ffff888023169300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff888023169380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+>ffff888023169400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                   ^
+ ffff888023169480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff888023169500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+==================================================================
+Disabling lock debugging due to kernel taint
+
+Fixes: cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks")
+Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c |   36 +++++++++++++++++++++++++-----------
+ 1 file changed, 25 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -887,6 +887,23 @@ xlog_force_iclog(
+ }
+ /*
++ * Cycle all the iclogbuf locks to make sure all log IO completion
++ * is done before we tear down these buffers.
++ */
++static void
++xlog_wait_iclog_completion(struct xlog *log)
++{
++      int             i;
++      struct xlog_in_core     *iclog = log->l_iclog;
++
++      for (i = 0; i < log->l_iclog_bufs; i++) {
++              down(&iclog->ic_sema);
++              up(&iclog->ic_sema);
++              iclog = iclog->ic_next;
++      }
++}
++
++/*
+  * Wait for the iclog and all prior iclogs to be written disk as required by the
+  * log force state machine. Waiting on ic_force_wait ensures iclog completions
+  * have been ordered and callbacks run before we are woken here, hence
+@@ -1111,6 +1128,14 @@ xfs_log_unmount(
+ {
+       xfs_log_clean(mp);
++      /*
++       * If shutdown has come from iclog IO context, the log
++       * cleaning will have been skipped and so we need to wait
++       * for the iclog to complete shutdown processing before we
++       * tear anything down.
++       */
++      xlog_wait_iclog_completion(mp->m_log);
++
+       xfs_buftarg_drain(mp->m_ddev_targp);
+       xfs_trans_ail_destroy(mp);
+@@ -2114,17 +2139,6 @@ xlog_dealloc_log(
+       int             i;
+       /*
+-       * Cycle all the iclogbuf locks to make sure all log IO completion
+-       * is done before we tear down these buffers.
+-       */
+-      iclog = log->l_iclog;
+-      for (i = 0; i < log->l_iclog_bufs; i++) {
+-              down(&iclog->ic_sema);
+-              up(&iclog->ic_sema);
+-              iclog = iclog->ic_next;
+-      }
+-
+-      /*
+        * Destroy the CIL after waiting for iclog IO completion because an
+        * iclog EIO error will try to shut down the log, which accesses the
+        * CIL to wake up the waiters.
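
The helper added above waits for outstanding log IO by cycling each iclog buffer's semaphore: taking and immediately releasing it cannot complete until whoever holds it for IO completion has let go. The same drain idiom in a small userspace sketch, with POSIX semaphores and threads standing in for iclog buffers and their completion work:

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define NBUFS 4

/* One semaphore per in-core log buffer, standing in for iclog->ic_sema. */
static sem_t buf_sema[NBUFS];

/* Simulated IO completion work: releases the buffer's semaphore when done. */
static void *io_completion(void *arg)
{
	int i = *(int *)arg;

	usleep(100 * 1000);                     /* pretend to finish log IO */
	printf("buffer %d: IO completion done\n", i);
	sem_post(&buf_sema[i]);
	return NULL;
}

/*
 * The drain idiom: cycle every buffer's semaphore so we cannot get past
 * this point until all outstanding completions have dropped theirs.
 */
static void wait_buf_completion(void)
{
	for (int i = 0; i < NBUFS; i++) {
		sem_wait(&buf_sema[i]);
		sem_post(&buf_sema[i]);
	}
}

int main(void)
{
	pthread_t tid[NBUFS];
	int idx[NBUFS];

	for (int i = 0; i < NBUFS; i++) {
		sem_init(&buf_sema[i], 0, 1);
		sem_wait(&buf_sema[i]);          /* "submit" IO: buffer is busy */
		idx[i] = i;
		pthread_create(&tid[i], NULL, io_completion, &idx[i]);
	}

	wait_buf_completion();
	printf("all log IO drained, safe to tear down\n");

	for (int i = 0; i < NBUFS; i++)
		pthread_join(&tid[i], NULL);
	return 0;
}
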
diff --git a/queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch b/queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch
new file mode 100644 (file)
index 0000000..c172270
--- /dev/null
@@ -0,0 +1,134 @@
+From stable+bounces-42890-greg=kroah.com@vger.kernel.org Wed May  1 20:41:25 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:49 -0700
+Subject: xfs: write page faults in iomap are not buffered writes
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-1-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e ]
+
+When we reserve a delalloc region in xfs_buffered_write_iomap_begin,
+we mark the iomap as IOMAP_F_NEW so that the write context
+understands that it allocated the delalloc region.
+
+If we then fail that buffered write, xfs_buffered_write_iomap_end()
+checks for the IOMAP_F_NEW flag and if it is set, it punches out
+the unused delalloc region that was allocated for the write.
+
+The assumption this code makes is that all buffered write operations
+that can allocate space are run under an exclusive lock (i_rwsem).
+This is an invalid assumption: page faults in mmap()d regions call
+through this same function pair to map the file range being faulted
+and this runs only holding the inode->i_mapping->invalidate_lock in
+shared mode.
+
+IOWs, we can have races between page faults and write() calls that
+fail the nested page cache write operation that result in data loss.
+That is, the failing iomap_end call will punch out the data that
+the other racing iomap iteration brought into the page cache. This
+can be reproduced with generic/34[46] if we arbitrarily fail page
+cache copy-in operations from write() syscalls.
+
+Code analysis tells us that the iomap_page_mkwrite() function holds
+the already instantiated and uptodate folio locked across the iomap
+mapping iterations. Hence the folio cannot be removed from memory
+whilst we are mapping the range it covers, and as such we do not
+care if the mapping changes state underneath the iomap iteration
+loop:
+
+1. if the folio is not already dirty, there are no writeback races
+   possible.
+2. if we allocated the mapping (delalloc or unwritten), the folio
+   cannot already be dirty. See #1.
+3. If the folio is already dirty, it must be up to date. As we hold
+   it locked, it cannot be reclaimed from memory. Hence we always
+   have valid data in the page cache while iterating the mapping.
+4. Valid data in the page cache can exist when the underlying
+   mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping
+   change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not
+   change the data in the page - it only affects actions if we are
+   initialising a new page. Hence #3 applies and we don't care
+   about these extent map transitions racing with
+   iomap_page_mkwrite().
+5. iomap_page_mkwrite() checks for page invalidation races
+   (truncate, hole punch, etc) after it locks the folio. We also
+   hold the mapping->invalidate_lock here, and hence the mapping
+   cannot change due to extent removal operations while we are
+   iterating the folio.
+
+As such, filesystems that don't use bufferheads will never fail
+the iomap_folio_mkwrite_iter() operation on the current mapping,
+regardless of whether the iomap should be considered stale.
+
+Further, the range we are asked to iterate is limited to the range
+inside EOF that the folio spans. Hence, for XFS, we will only map
+the exact range we are asked for, and we will only do speculative
+preallocation with delalloc if we are mapping a hole at the EOF
+page. The iterator will consume the entire range of the folio that
+is within EOF, and anything beyond the EOF block cannot be accessed.
+We never need to truncate this post-EOF speculative prealloc away in
+the context of the iomap_page_mkwrite() iterator because if it
+remains unused we'll remove it when the last reference to the inode
+goes away.
+
+Hence we don't actually need an .iomap_end() cleanup/error handling
+path at all for iomap_page_mkwrite() for XFS. This means we can
+separate the page fault processing from the complexity of the
+.iomap_end() processing in the buffered write path. This also means
+that the buffered write path will also be able to take the
+mapping->invalidate_lock as necessary.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c  |    2 +-
+ fs/xfs/xfs_iomap.c |    9 +++++++++
+ fs/xfs/xfs_iomap.h |    1 +
+ 3 files changed, 11 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -1325,7 +1325,7 @@ __xfs_filemap_fault(
+               if (write_fault) {
+                       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+                       ret = iomap_page_mkwrite(vmf,
+-                                      &xfs_buffered_write_iomap_ops);
++                                      &xfs_page_mkwrite_iomap_ops);
+                       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+               } else {
+                       ret = filemap_fault(vmf);
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1187,6 +1187,15 @@ const struct iomap_ops xfs_buffered_writ
+       .iomap_end              = xfs_buffered_write_iomap_end,
+ };
++/*
++ * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
++ * that it allocated to be revoked. Hence we do not need an .iomap_end method
++ * for this operation.
++ */
++const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
++      .iomap_begin            = xfs_buffered_write_iomap_begin,
++};
++
+ static int
+ xfs_read_iomap_begin(
+       struct inode            *inode,
+--- a/fs/xfs/xfs_iomap.h
++++ b/fs/xfs/xfs_iomap.h
+@@ -47,6 +47,7 @@ xfs_aligned_fsb_count(
+ }
+ extern const struct iomap_ops xfs_buffered_write_iomap_ops;
++extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
+ extern const struct iomap_ops xfs_direct_write_iomap_ops;
+ extern const struct iomap_ops xfs_read_iomap_ops;
+ extern const struct iomap_ops xfs_seek_iomap_ops;
diff --git a/queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch b/queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch
new file mode 100644 (file)
index 0000000..3fc2933
--- /dev/null
@@ -0,0 +1,126 @@
+From stable+bounces-42895-greg=kroah.com@vger.kernel.org Wed May  1 20:41:37 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed,  1 May 2024 11:40:54 -0700
+Subject: xfs: xfs_bmap_punch_delalloc_range() should take a byte range
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-6-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 7348b322332d8602a4133f0b861334ea021b134a ]
+
+All the callers of xfs_bmap_punch_delalloc_range() jump through
+hoops to convert a byte range to filesystem blocks before calling
+xfs_bmap_punch_delalloc_range(). Instead, pass the byte range to
+xfs_bmap_punch_delalloc_range() and have it do the conversion to
+filesystem blocks internally.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c      |   16 ++++++----------
+ fs/xfs/xfs_bmap_util.c |   10 ++++++----
+ fs/xfs/xfs_bmap_util.h |    2 +-
+ fs/xfs/xfs_iomap.c     |    8 ++------
+ 4 files changed, 15 insertions(+), 21 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -114,9 +114,8 @@ xfs_end_ioend(
+       if (unlikely(error)) {
+               if (ioend->io_flags & IOMAP_F_SHARED) {
+                       xfs_reflink_cancel_cow_range(ip, offset, size, true);
+-                      xfs_bmap_punch_delalloc_range(ip,
+-                                                    XFS_B_TO_FSBT(mp, offset),
+-                                                    XFS_B_TO_FSB(mp, size));
++                      xfs_bmap_punch_delalloc_range(ip, offset,
++                                      offset + size);
+               }
+               goto done;
+       }
+@@ -455,12 +454,8 @@ xfs_discard_folio(
+       struct folio            *folio,
+       loff_t                  pos)
+ {
+-      struct inode            *inode = folio->mapping->host;
+-      struct xfs_inode        *ip = XFS_I(inode);
++      struct xfs_inode        *ip = XFS_I(folio->mapping->host);
+       struct xfs_mount        *mp = ip->i_mount;
+-      size_t                  offset = offset_in_folio(folio, pos);
+-      xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, pos);
+-      xfs_fileoff_t           pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
+       int                     error;
+       if (xfs_is_shutdown(mp))
+@@ -470,8 +465,9 @@ xfs_discard_folio(
+               "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+                       folio, ip->i_ino, pos);
+-      error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+-                      i_blocks_per_folio(inode, folio) - pageoff_fsb);
++      error = xfs_bmap_punch_delalloc_range(ip, pos,
++                      round_up(pos, folio_size(folio)));
++
+       if (error && !xfs_is_shutdown(mp))
+               xfs_alert(mp, "page discard unable to remove delalloc mapping.");
+ }
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -590,11 +590,13 @@ out_unlock_iolock:
+ int
+ xfs_bmap_punch_delalloc_range(
+       struct xfs_inode        *ip,
+-      xfs_fileoff_t           start_fsb,
+-      xfs_fileoff_t           length)
++      xfs_off_t               start_byte,
++      xfs_off_t               end_byte)
+ {
++      struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_ifork        *ifp = &ip->i_df;
+-      xfs_fileoff_t           end_fsb = start_fsb + length;
++      xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, start_byte);
++      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, end_byte);
+       struct xfs_bmbt_irec    got, del;
+       struct xfs_iext_cursor  icur;
+       int                     error = 0;
+@@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range(
+       while (got.br_startoff + got.br_blockcount > start_fsb) {
+               del = got;
+-              xfs_trim_extent(&del, start_fsb, length);
++              xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);
+               /*
+                * A delete can push the cursor forward. Step back to the
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap
+ #endif /* CONFIG_XFS_RT */
+ int   xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+-              xfs_fileoff_t start_fsb, xfs_fileoff_t length);
++              xfs_off_t start_byte, xfs_off_t end_byte);
+ struct kgetbmap {
+       __s64           bmv_offset;     /* file offset of segment in blocks */
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1126,12 +1126,8 @@ xfs_buffered_write_delalloc_punch(
+       loff_t                  offset,
+       loff_t                  length)
+ {
+-      struct xfs_mount        *mp = XFS_M(inode->i_sb);
+-      xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, offset);
+-      xfs_fileoff_t           end_fsb = XFS_B_TO_FSB(mp, offset + length);
+-
+-      return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
+-                              end_fsb - start_fsb);
++      return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
++                      offset + length);
+ }
+ static int