--- /dev/null
+From stable+bounces-42894-greg=kroah.com@vger.kernel.org Wed May 1 20:41:36 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:53 -0700
+Subject: iomap: buffered write failure should not truncate the page cache
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-5-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit f43dc4dc3eff028b5ddddd99f3a66c5a6bdd4e78 ]
+
+iomap_file_buffered_write_punch_delalloc() currently invalidates the
+page cache over the unused range of the delalloc extent that was
+allocated. While the write allocated the delalloc extent, it does
+not own it exclusively as the write does not hold any locks that
+prevent either writeback or mmap page faults from changing the state
+of either the page cache or the extent state backing this range.
+
+Whilst xfs_bmap_punch_delalloc_range() already handles races in
+extent conversion - it will only punch out delalloc extents and it
+ignores any other type of extent - the page cache truncate does not
+discriminate between data written by this write or some other task.
+As a result, truncating the page cache can result in data corruption
+if the write races with mmap modifications to the file over the same
+range.
+
+generic/346 exercises this workload, and if we randomly fail writes
+(as will happen when iomap gets stale iomap detection later in the
+patchset), it will randomly corrupt the file data because it removes
+data written by mmap() in the same page as the write() that failed.
+
+Hence we do not want to punch out the page cache over the range of
+the extent we failed to write to - what we actually need to do is
+detect the ranges that have dirty data in cache over them and *not
+punch them out*.
+
+To do this, we have to walk the page cache over the range of the
+delalloc extent we want to remove. This is made complex by the fact
+we have to handle partially up-to-date folios correctly and this can
+happen even when the FSB size == PAGE_SIZE because we now support
+multi-page folios in the page cache.
+
+Because we are only interested in discovering the edges of data
+ranges in the page cache (i.e. hole-data boundaries) we can make use
+of mapping_seek_hole_data() to find those transitions in the page
+cache. As we hold the invalidate_lock, we know that the boundaries
+are not going to change while we walk the range. This interface is
+also byte-based and is sub-page block aware, so we can find the data
+ranges in the cache based on byte offsets rather than page, folio or
+fs block sized chunks. This greatly simplifies the logic of finding
+dirty cached ranges in the page cache.
+
+Once we've identified a range that contains cached data, we can then
+iterate the range folio by folio. This allows us to determine if the
+data is dirty and hence perform the correct delalloc extent punching
+operations. The seek interface we use to iterate data ranges will
+give us sub-folio start/end granularity, so we may end up looking up
+the same folio multiple times as the seek interface iterates across
+each discontiguous data region in the folio.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/buffered-io.c | 195 +++++++++++++++++++++++++++++++++++++++++++++----
+ 1 file changed, 180 insertions(+), 15 deletions(-)
+
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -828,6 +828,165 @@ iomap_file_buffered_write(struct kiocb *
+ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
+ /*
++ * Scan the data range passed to us for dirty page cache folios. If we find a
++ * dirty folio, punch out the preceding range and update the offset from which
++ * the next punch will start.
++ *
++ * We can punch out storage reservations under clean pages because they either
++ * contain data that has been written back - in which case the delalloc punch
++ * over that range is a no-op - or they have been read faults in which case they
++ * contain zeroes and we can remove the delalloc backing range and any new
++ * writes to those pages will do the normal hole filling operation...
++ *
++ * This makes the logic simple: we need to keep the delalloc extents only
++ * over the dirty ranges of the page cache.
++ *
++ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
++ * simplify range iterations.
++ */
++static int iomap_write_delalloc_scan(struct inode *inode,
++ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
++ int (*punch)(struct inode *inode, loff_t offset, loff_t length))
++{
++ while (start_byte < end_byte) {
++ struct folio *folio;
++
++ /* grab locked page */
++ folio = filemap_lock_folio(inode->i_mapping,
++ start_byte >> PAGE_SHIFT);
++ if (!folio) {
++ start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
++ PAGE_SIZE;
++ continue;
++ }
++
++ /* if dirty, punch up to offset */
++ if (folio_test_dirty(folio)) {
++ if (start_byte > *punch_start_byte) {
++ int error;
++
++ error = punch(inode, *punch_start_byte,
++ start_byte - *punch_start_byte);
++ if (error) {
++ folio_unlock(folio);
++ folio_put(folio);
++ return error;
++ }
++ }
++
++ /*
++ * Make sure the next punch start is correctly bound to
++ * the end of this data range, not the end of the folio.
++ */
++ *punch_start_byte = min_t(loff_t, end_byte,
++ folio_next_index(folio) << PAGE_SHIFT);
++ }
++
++ /* move offset to start of next folio in range */
++ start_byte = folio_next_index(folio) << PAGE_SHIFT;
++ folio_unlock(folio);
++ folio_put(folio);
++ }
++ return 0;
++}
++
++/*
++ * Punch out all the delalloc blocks in the range given except for those that
++ * have dirty data still pending in the page cache - those are going to be
++ * written and so must still retain the delalloc backing for writeback.
++ *
++ * As we are scanning the page cache for data, we don't need to reimplement the
++ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
++ * start and end of data ranges correctly even for sub-folio block sizes. This
++ * byte range based iteration is especially convenient because it means we
++ * don't have to care about variable size folios, nor where the start or end of
++ * the data range lies within a folio, if they lie within the same folio or even
++ * if there are multiple discontiguous data ranges within the folio.
++ *
++ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
++ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
++ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
++ * date. A write page fault can then mark it dirty. If we then fail a write()
++ * beyond EOF into that up to date cached range, we allocate a delalloc block
++ * beyond EOF and then have to punch it out. Because the range is up to date,
++ * mapping_seek_hole_data() will return it, and we will skip the punch because
++ * the folio is dirty. This is incorrect - we always need to punch out delalloc
++ * beyond EOF in this case as writeback will never write back and convert that
++ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
++ * resulting in always punching out the range from the EOF to the end of the
++ * range the iomap spans.
++ *
++ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
++ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
++ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
++ * returns the end of the data range (data_end). Using closed intervals would
++ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
++ * the code to subtle off-by-one bugs....
++ */
++static int iomap_write_delalloc_release(struct inode *inode,
++ loff_t start_byte, loff_t end_byte,
++ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
++{
++ loff_t punch_start_byte = start_byte;
++ loff_t scan_end_byte = min(i_size_read(inode), end_byte);
++ int error = 0;
++
++ /*
++ * Lock the mapping to avoid races with page faults re-instantiating
++ * folios and dirtying them via ->page_mkwrite whilst we walk the
++ * cache and perform delalloc extent removal. Failing to do this can
++ * leave dirty pages with no space reservation in the cache.
++ */
++ filemap_invalidate_lock(inode->i_mapping);
++ while (start_byte < scan_end_byte) {
++ loff_t data_end;
++
++ start_byte = mapping_seek_hole_data(inode->i_mapping,
++ start_byte, scan_end_byte, SEEK_DATA);
++ /*
++ * If there is no more data to scan, all that is left is to
++ * punch out the remaining range.
++ */
++ if (start_byte == -ENXIO || start_byte == scan_end_byte)
++ break;
++ if (start_byte < 0) {
++ error = start_byte;
++ goto out_unlock;
++ }
++ WARN_ON_ONCE(start_byte < punch_start_byte);
++ WARN_ON_ONCE(start_byte > scan_end_byte);
++
++ /*
++ * We find the end of this contiguous cached data range by
++ * seeking from start_byte to the beginning of the next hole.
++ */
++ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
++ scan_end_byte, SEEK_HOLE);
++ if (data_end < 0) {
++ error = data_end;
++ goto out_unlock;
++ }
++ WARN_ON_ONCE(data_end <= start_byte);
++ WARN_ON_ONCE(data_end > scan_end_byte);
++
++ error = iomap_write_delalloc_scan(inode, &punch_start_byte,
++ start_byte, data_end, punch);
++ if (error)
++ goto out_unlock;
++
++ /* The next data search starts at the end of this one. */
++ start_byte = data_end;
++ }
++
++ if (punch_start_byte < end_byte)
++ error = punch(inode, punch_start_byte,
++ end_byte - punch_start_byte);
++out_unlock:
++ filemap_invalidate_unlock(inode->i_mapping);
++ return error;
++}
++
++/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from it's ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+@@ -837,8 +996,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_wr
+ * allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+- * simplify range iterations, but converts them back to {offset,len} tuples for
+- * the punch callback.
++ * simplify range iterations.
++ *
++ * The punch() callback *must* only punch delalloc extents in the range passed
++ * to it. It must skip over all other types of extents in the range and leave
++ * them completely unchanged. It must do this punch atomically with respect to
++ * other extent modifications.
++ *
++ * The punch() callback may be called with a folio locked to prevent writeback
++ * extent allocation racing at the edge of the range we are currently punching.
++ * The locked folio may or may not cover the range being punched, so it is not
++ * safe for the punch() callback to lock folios itself.
++ *
++ * Lock order is:
++ *
++ * inode->i_rwsem (shared or exclusive)
++ * inode->i_mapping->invalidate_lock (exclusive)
++ * folio_lock()
++ * ->punch
++ * internal filesystem allocation lock
+ */
+ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+ struct iomap *iomap, loff_t pos, loff_t length,
+@@ -848,7 +1024,6 @@ int iomap_file_buffered_write_punch_dela
+ loff_t start_byte;
+ loff_t end_byte;
+ int blocksize = i_blocksize(inode);
+- int error = 0;
+
+ if (iomap->type != IOMAP_DELALLOC)
+ return 0;
+@@ -872,18 +1047,8 @@ int iomap_file_buffered_write_punch_dela
+ if (start_byte >= end_byte)
+ return 0;
+
+- /*
+- * Lock the mapping to avoid races with page faults re-instantiating
+- * folios and dirtying them via ->page_mkwrite between the page cache
+- * truncation and the delalloc extent removal. Failing to do this can
+- * leave dirty pages with no space reservation in the cache.
+- */
+- filemap_invalidate_lock(inode->i_mapping);
+- truncate_pagecache_range(inode, start_byte, end_byte - 1);
+- error = punch(inode, start_byte, end_byte - start_byte);
+- filemap_invalidate_unlock(inode->i_mapping);
+-
+- return error;
++ return iomap_write_delalloc_release(inode, start_byte, end_byte,
++ punch);
+ }
+ EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+
--- /dev/null
+From stable+bounces-42896-greg=kroah.com@vger.kernel.org Wed May 1 20:41:39 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:55 -0700
+Subject: iomap: write iomap validity checks
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-7-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit d7b64041164ca177170191d2ad775da074ab2926 ]
+
+A recent multithreaded write data corruption has been uncovered in
+the iomap write code. The core of the problem is partial folio
+writes can be flushed to disk while a new racing write can map it
+and fill the rest of the page:
+
+writeback new write
+
+allocate blocks
+ blocks are unwritten
+submit IO
+.....
+ map blocks
+ iomap indicates UNWRITTEN range
+ loop {
+ lock folio
+ copyin data
+.....
+IO completes
+ runs unwritten extent conv
+ blocks are marked written
+ <iomap now stale>
+ get next folio
+ }
+
+Now add memory pressure such that memory reclaim evicts the
+partially written folio that has already been written to disk.
+
+When the new write finally gets to the last partial page of the new
+write, it does not find it in cache, so it instantiates a new page,
+sees the iomap is unwritten, and zeros the part of the page that
+it does not have data from. This overwrites the data on disk that
+was originally written.
+
+The full description of the corruption mechanism can be found here:
+
+https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
+
+To solve this problem, we need to check whether the iomap is still
+valid after we lock each folio during the write. We have to do it
+after we lock the page so that we don't end up with state changes
+occurring while we wait for the folio to be locked.
+
+Hence we need a mechanism to be able to check that the cached iomap
+is still valid (similar to what we already do in buffered
+writeback), and we need a way for ->begin_write to back out and
+tell the high level iomap iterator that we need to remap the
+remaining write range.
+
+The iomap needs to grow some storage for the validity cookie that
+the filesystem provides to travel with the iomap. XFS, in
+particular, also needs to know some more information about what the
+iomap maps (attribute extents rather than file data extents) for
+the validity cookie to cover all the types of iomaps we might need
+to validate.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/buffered-io.c | 29 ++++++++++++++++++++++++++++-
+ fs/iomap/iter.c | 19 ++++++++++++++++++-
+ include/linux/iomap.h | 43 +++++++++++++++++++++++++++++++++++--------
+ 3 files changed, 81 insertions(+), 10 deletions(-)
+
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -579,7 +579,7 @@ static int iomap_write_begin_inline(cons
+ return iomap_read_inline_data(iter, folio);
+ }
+
+-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
++static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
+ size_t len, struct folio **foliop)
+ {
+ const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
+@@ -613,6 +613,27 @@ static int iomap_write_begin(const struc
+ status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
+ goto out_no_page;
+ }
++
++ /*
++ * Now we have a locked folio, before we do anything with it we need to
++ * check that the iomap we have cached is not stale. The inode extent
++ * mapping can change due to concurrent IO in flight (e.g.
++ * IOMAP_UNWRITTEN state can change and memory reclaim could have
++ * reclaimed a previously partially written page at this index after IO
++ * completion before this write reaches this file offset) and hence we
++ * could do the wrong thing here (zero a page range incorrectly or fail
++ * to zero) and corrupt data.
++ */
++ if (page_ops && page_ops->iomap_valid) {
++ bool iomap_valid = page_ops->iomap_valid(iter->inode,
++ &iter->iomap);
++ if (!iomap_valid) {
++ iter->iomap.flags |= IOMAP_F_STALE;
++ status = 0;
++ goto out_unlock;
++ }
++ }
++
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
+@@ -768,6 +789,8 @@ again:
+ status = iomap_write_begin(iter, pos, bytes, &folio);
+ if (unlikely(status))
+ break;
++ if (iter->iomap.flags & IOMAP_F_STALE)
++ break;
+
+ page = folio_file_page(folio, pos >> PAGE_SHIFT);
+ if (mapping_writably_mapped(mapping))
+@@ -1076,6 +1099,8 @@ static loff_t iomap_unshare_iter(struct
+ status = iomap_write_begin(iter, pos, bytes, &folio);
+ if (unlikely(status))
+ return status;
++ if (iter->iomap.flags & IOMAP_F_STALE)
++ break;
+
+ status = iomap_write_end(iter, pos, bytes, bytes, folio);
+ if (WARN_ON_ONCE(status == 0))
+@@ -1131,6 +1156,8 @@ static loff_t iomap_zero_iter(struct iom
+ status = iomap_write_begin(iter, pos, bytes, &folio);
+ if (status)
+ return status;
++ if (iter->iomap.flags & IOMAP_F_STALE)
++ break;
+
+ offset = offset_in_folio(folio, pos);
+ if (bytes > folio_size(folio) - offset)
+--- a/fs/iomap/iter.c
++++ b/fs/iomap/iter.c
+@@ -7,12 +7,28 @@
+ #include <linux/iomap.h>
+ #include "trace.h"
+
++/*
++ * Advance to the next range we need to map.
++ *
++ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
++ * processed - it was aborted because the extent the iomap spanned may have been
++ * changed during the operation. In this case, the iteration behaviour is to
++ * remap the unprocessed range of the iter, and that means we may need to remap
++ * even when we've made no progress (i.e. iter->processed = 0). Hence the
++ * "finished iterating" case needs to distinguish between
++ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
++ * need to remap the entire remaining range.
++ */
+ static inline int iomap_iter_advance(struct iomap_iter *iter)
+ {
++ bool stale = iter->iomap.flags & IOMAP_F_STALE;
++
+ /* handle the previous iteration (if any) */
+ if (iter->iomap.length) {
+- if (iter->processed <= 0)
++ if (iter->processed < 0)
+ return iter->processed;
++ if (!iter->processed && !stale)
++ return 0;
+ if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
+ return -EIO;
+ iter->pos += iter->processed;
+@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struc
+ WARN_ON_ONCE(iter->iomap.offset > iter->pos);
+ WARN_ON_ONCE(iter->iomap.length == 0);
+ WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
++ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
+
+ trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
+ if (iter->srcmap.type != IOMAP_HOLE)
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -49,26 +49,35 @@ struct vm_fault;
+ *
+ * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
+ * buffer heads for this mapping.
++ *
++ * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
++ * rather than a file data extent.
+ */
+-#define IOMAP_F_NEW 0x01
+-#define IOMAP_F_DIRTY 0x02
+-#define IOMAP_F_SHARED 0x04
+-#define IOMAP_F_MERGED 0x08
+-#define IOMAP_F_BUFFER_HEAD 0x10
+-#define IOMAP_F_ZONE_APPEND 0x20
++#define IOMAP_F_NEW (1U << 0)
++#define IOMAP_F_DIRTY (1U << 1)
++#define IOMAP_F_SHARED (1U << 2)
++#define IOMAP_F_MERGED (1U << 3)
++#define IOMAP_F_BUFFER_HEAD (1U << 4)
++#define IOMAP_F_ZONE_APPEND (1U << 5)
++#define IOMAP_F_XATTR (1U << 6)
+
+ /*
+ * Flags set by the core iomap code during operations:
+ *
+ * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
+ * has changed as the result of this write operation.
++ *
++ * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file
++ * range it covers needs to be remapped by the high level before the operation
++ * can proceed.
+ */
+-#define IOMAP_F_SIZE_CHANGED 0x100
++#define IOMAP_F_SIZE_CHANGED (1U << 8)
++#define IOMAP_F_STALE (1U << 9)
+
+ /*
+ * Flags from 0x1000 up are for file system specific usage:
+ */
+-#define IOMAP_F_PRIVATE 0x1000
++#define IOMAP_F_PRIVATE (1U << 12)
+
+
+ /*
+@@ -89,6 +98,7 @@ struct iomap {
+ void *inline_data;
+ void *private; /* filesystem private */
+ const struct iomap_page_ops *page_ops;
++ u64 validity_cookie; /* used with .iomap_valid() */
+ };
+
+ static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
+@@ -128,6 +138,23 @@ struct iomap_page_ops {
+ int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len);
+ void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
+ struct page *page);
++
++ /*
++ * Check that the cached iomap still maps correctly to the filesystem's
++ * internal extent map. FS internal extent maps can change while iomap
++ * is iterating a cached iomap, so this hook allows iomap to detect that
++ * the iomap needs to be refreshed during a long running write
++ * operation.
++ *
++ * The filesystem can store internal state (e.g. a sequence number) in
++ * iomap->validity_cookie when the iomap is first mapped to be able to
++ * detect changes between mapping time and whenever .iomap_valid() is
++ * called.
++ *
++ * This is called with the folio over the specified file position held
++ * locked by the iomap code.
++ */
++ bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
+ };
+
+ /*
--- /dev/null
+From ffcaa2172cc1a85ddb8b783de96d38ca8855e248 Mon Sep 17 00:00:00 2001
+From: Jarkko Sakkinen <jarkko@kernel.org>
+Date: Mon, 20 May 2024 02:31:53 +0300
+Subject: KEYS: trusted: Fix memory leak in tpm2_key_encode()
+
+From: Jarkko Sakkinen <jarkko@kernel.org>
+
+commit ffcaa2172cc1a85ddb8b783de96d38ca8855e248 upstream.
+
+'scratch' is never freed. Fix this by calling kfree() in both the success
+and the error case.
+
+Cc: stable@vger.kernel.org # +v5.13
+Fixes: f2219745250f ("security: keys: trusted: use ASN.1 TPM2 key format for the blobs")
+Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ security/keys/trusted-keys/trusted_tpm2.c | 24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+--- a/security/keys/trusted-keys/trusted_tpm2.c
++++ b/security/keys/trusted-keys/trusted_tpm2.c
+@@ -38,6 +38,7 @@ static int tpm2_key_encode(struct truste
+ u8 *end_work = scratch + SCRATCH_SIZE;
+ u8 *priv, *pub;
+ u16 priv_len, pub_len;
++ int ret;
+
+ priv_len = get_unaligned_be16(src) + 2;
+ priv = src;
+@@ -57,8 +58,10 @@ static int tpm2_key_encode(struct truste
+ unsigned char bool[3], *w = bool;
+ /* tag 0 is emptyAuth */
+ w = asn1_encode_boolean(w, w + sizeof(bool), true);
+- if (WARN(IS_ERR(w), "BUG: Boolean failed to encode"))
+- return PTR_ERR(w);
++ if (WARN(IS_ERR(w), "BUG: Boolean failed to encode")) {
++ ret = PTR_ERR(w);
++ goto err;
++ }
+ work = asn1_encode_tag(work, end_work, 0, bool, w - bool);
+ }
+
+@@ -69,8 +72,10 @@ static int tpm2_key_encode(struct truste
+ * trigger, so if it does there's something nefarious going on
+ */
+ if (WARN(work - scratch + pub_len + priv_len + 14 > SCRATCH_SIZE,
+- "BUG: scratch buffer is too small"))
+- return -EINVAL;
++ "BUG: scratch buffer is too small")) {
++ ret = -EINVAL;
++ goto err;
++ }
+
+ work = asn1_encode_integer(work, end_work, options->keyhandle);
+ work = asn1_encode_octet_string(work, end_work, pub, pub_len);
+@@ -79,10 +84,17 @@ static int tpm2_key_encode(struct truste
+ work1 = payload->blob;
+ work1 = asn1_encode_sequence(work1, work1 + sizeof(payload->blob),
+ scratch, work - scratch);
+- if (WARN(IS_ERR(work1), "BUG: ASN.1 encoder failed"))
+- return PTR_ERR(work1);
++ if (WARN(IS_ERR(work1), "BUG: ASN.1 encoder failed")) {
++ ret = PTR_ERR(work1);
++ goto err;
++ }
+
++ kfree(scratch);
+ return work1 - payload->blob;
++
++err:
++ kfree(scratch);
++ return ret;
+ }
+
+ struct tpm2_key_context {
--- /dev/null
+From 77e01b49e35f24ebd1659096d5fc5c3b75975545 Mon Sep 17 00:00:00 2001
+From: Mengqi Zhang <mengqi.zhang@mediatek.com>
+Date: Mon, 25 Dec 2023 17:38:40 +0800
+Subject: mmc: core: Add HS400 tuning in HS400es initialization
+
+From: Mengqi Zhang <mengqi.zhang@mediatek.com>
+
+commit 77e01b49e35f24ebd1659096d5fc5c3b75975545 upstream.
+
+During the initialization to HS400es stage, add a HS400 tuning flow as an
+optional process. For Mediatek IP, the HS400es mode requires a specific
+tuning to ensure the correct HS400 timing setting.
+
+Signed-off-by: Mengqi Zhang <mengqi.zhang@mediatek.com>
+Link: https://lore.kernel.org/r/20231225093839.22931-2-mengqi.zhang@mediatek.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Cc: "Lin Gui (桂林)" <Lin.Gui@mediatek.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/core/mmc.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/drivers/mmc/core/mmc.c
++++ b/drivers/mmc/core/mmc.c
+@@ -1819,8 +1819,13 @@ static int mmc_init_card(struct mmc_host
+
+ if (err)
+ goto free_card;
+-
+- } else if (!mmc_card_hs400es(card)) {
++ } else if (mmc_card_hs400es(card)) {
++ if (host->ops->execute_hs400_tuning) {
++ err = host->ops->execute_hs400_tuning(host, card);
++ if (err)
++ goto free_card;
++ }
++ } else {
+ /* Select the desired bus width optionally */
+ err = mmc_select_bus_width(card);
+ if (err > 0 && mmc_card_hs(card)) {
pinctrl-core-handle-radix_tree_insert-errors-in-pinctrl_register_one_pin.patch
mfd-stpmic1-fix-swapped-mask-unmask-in-irq-chip.patch
nfsd-don-t-allow-nfsd-threads-to-be-signalled.patch
+keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch
+mmc-core-add-hs400-tuning-in-hs400es-initialization.patch
+xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch
+xfs-punching-delalloc-extents-on-write-failure-is-racy.patch
+xfs-use-byte-ranges-for-write-cleanup-ranges.patch
+xfs-iomap-move-delalloc-punching-to-iomap.patch
+iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch
+xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch
+iomap-write-iomap-validity-checks.patch
+xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch
+xfs-drop-write-error-injection-is-unfixable-remove-it.patch
+xfs-fix-off-by-one-block-in-xfs_discard_folio.patch
+xfs-fix-incorrect-error-out-in-xfs_remove.patch
+xfs-fix-sb-write-verify-for-lazysbcount.patch
+xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch
+xfs-invalidate-block-device-page-cache-during-unmount.patch
+xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch
+xfs-wait-iclog-complete-before-tearing-down-ail.patch
+xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch
+xfs-hoist-refcount-record-merge-predicates.patch
+xfs-estimate-post-merge-refcounts-correctly.patch
+xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch
+xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch
+xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch
+xfs-get-root-inode-correctly-at-bulkstat.patch
+xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch
--- /dev/null
+From stable+bounces-42910-greg=kroah.com@vger.kernel.org Wed May 1 20:42:14 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:09 -0700
+Subject: xfs: allow inode inactivation during a ro mount log recovery
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-21-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 76e589013fec672c3587d6314f2d1f0aeddc26d9 ]
+
+In the next patch, we're going to prohibit log recovery if the primary
+superblock contains an unrecognized rocompat feature bit even on
+readonly mounts. This requires removing all the code in the log
+mounting process that temporarily disables the readonly state.
+
+Unfortunately, inode inactivation disables itself on readonly mounts.
+Clearing the iunlinked lists after log recovery needs inactivation to
+run to free the unreferenced inodes, which (AFAICT) is the only reason
+why log mounting plays games with the readonly state in the first place.
+
+Therefore, change the inactivation predicates to allow inactivation
+during log recovery of a readonly mount.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c | 14 ++++++++++----
+ 1 file changed, 10 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -1652,8 +1652,11 @@ xfs_inode_needs_inactive(
+ if (VFS_I(ip)->i_mode == 0)
+ return false;
+
+- /* If this is a read-only mount, don't do this (would generate I/O) */
+- if (xfs_is_readonly(mp))
++ /*
++ * If this is a read-only mount, don't do this (would generate I/O)
++ * unless we're in log recovery and cleaning the iunlinked list.
++ */
++ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
+ return false;
+
+ /* If the log isn't running, push inodes straight to reclaim. */
+@@ -1713,8 +1716,11 @@ xfs_inactive(
+ mp = ip->i_mount;
+ ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
+
+- /* If this is a read-only mount, don't do this (would generate I/O) */
+- if (xfs_is_readonly(mp))
++ /*
++ * If this is a read-only mount, don't do this (would generate I/O)
++ * unless we're in log recovery and cleaning the iunlinked list.
++ */
++ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log))
+ goto out;
+
+ /* Metadata inodes require explicit resource cleanup. */
--- /dev/null
+From stable+bounces-42904-greg=kroah.com@vger.kernel.org Wed May 1 20:42:01 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:03 -0700
+Subject: xfs: attach dquots to inode before reading data/cow fork mappings
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-15-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 ]
+
+I've been running near-continuous integration testing of online fsck,
+and I've noticed that once a day, one of the ARM VMs will fail the test
+with out of order records in the data fork.
+
+xfs/804 races fsstress with online scrub (aka scan but do not change
+anything), so I think this might be a bug in the core xfs code. This
+also only seems to trigger if one runs the test for more than ~6 minutes
+via TIME_FACTOR=13 or something.
+https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree/tests/xfs/804?h=djwong-wtf
+
+I added a debugging patch to the kernel to check the data fork extents
+after taking the ILOCK, before dropping ILOCK, and before and after each
+bmapping operation. So far I've narrowed it down to the delalloc code
+inserting a record in the wrong place in the iext tree:
+
+xfs_bmap_add_extent_hole_delay, near line 2691:
+
+ case 0:
+ /*
+ * New allocation is not contiguous with another
+ * delayed allocation.
+ * Insert a new entry.
+ */
+ oldlen = newlen = 0;
+ xfs_iunlock_check_datafork(ip); <-- ok here
+ xfs_iext_insert(ip, icur, new, state);
+ xfs_iunlock_check_datafork(ip); <-- bad here
+ break;
+ }
+
+I recorded the state of the data fork mappings and iext cursor state
+when a corrupt data fork is detected immediately after the
+xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc:
+
+ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork:
+ ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1
+ ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0
+ ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0
+
+Here we see that a delalloc extent was inserted into the wrong position
+in the iext leaf, same as all the other times. The extra trace data I
+collected are as follows:
+
+ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000
+ ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0
+ ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0
+ ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0
+ ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0
+ ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00
+ ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00
+ ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00
+ ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00
+
+The first line shows that xfs_bmapi_reserve_delalloc was called with
+whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6.
+
+The second line ("oldgot") shows the contents of @got at the beginning
+of the call, which are the results of the first iext lookup in
+xfs_buffered_write_iomap_begin.
+
+Line 3 ("crapgot") is the result of duplicating the cursor at the start
+of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup
+at @off.
+
+Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right
+before the call to xfs_bmap_add_extent_hole_delay. Totally garbage.
+
+Line 5 ("nowgot") is contents of @got after the
+xfs_bmap_add_extent_hole_delay call.
+
+Line 6 is the contents of @icur at the beginning of the call. Lines 7-9
+are the contents of the iext cursors at the point where the block
+mappings were sampled.
+
+I think @oldgot is a HOLESTARTBLOCK extent because the first lookup
+didn't find anything, so we filled in imap with "fake hole until the
+end". At the time of the first lookup, I suspect that there's only one
+32-block unwritten extent in the mapping (hence oldicurpos==1) but by
+the time we get to recording crapgot, crapicurpos==2.
+
+Dave then added:
+
+Ok, that's much simpler to reason about, and implies the smoke is
+coming from xfs_buffered_write_iomap_begin() or
+xfs_bmapi_reserve_delalloc(). I suspect the former - it does a lot
+of stuff with the ILOCK_EXCL held.....
+
+.... including calling xfs_qm_dqattach_locked().
+
+xfs_buffered_write_iomap_begin
+ ILOCK_EXCL
+ look up icur
+ xfs_qm_dqattach_locked
+ xfs_qm_dqattach_one
+ xfs_qm_dqget_inode
+ dquot cache miss
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ ....
+ xfs_bmapi_reserve_delalloc(icur)
+
+Yup, that's what is letting the magic smoke out -
+xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we
+can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all
+goes downhill from there.
+
+Back to Darrick now:
+
+So. Fix this by moving the dqattach_locked call up before we take the
+ILOCK, like all the other callers in that file.
+
+Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -968,6 +968,10 @@ xfs_buffered_write_iomap_begin(
+
+ ASSERT(!XFS_IS_REALTIME_INODE(ip));
+
++ error = xfs_qm_dqattach(ip);
++ if (error)
++ return error;
++
+ error = xfs_ilock_for_iomap(ip, flags, &lockmode);
+ if (error)
+ return error;
+@@ -1071,10 +1075,6 @@ xfs_buffered_write_iomap_begin(
+ allocfork = XFS_COW_FORK;
+ }
+
+- error = xfs_qm_dqattach_locked(ip, false);
+- if (error)
+- goto out_unlock;
+-
+ if (eof && offset + count > XFS_ISIZE(ip)) {
+ /*
+ * Determine the initial size of the preallocation.
--- /dev/null
+From stable+bounces-42897-greg=kroah.com@vger.kernel.org Wed May 1 20:41:44 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:57 -0700
+Subject: xfs: drop write error injection is unfixable, remove it
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-9-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 6e8af15ccdc4e138a5b529c1901a0013e1dcaa09 ]
+
+With the changes to scan the page cache for dirty data to avoid data
+corruptions from partial write cleanup racing with other page cache
+operations, the drop writes error injection no longer works the same
+way it used to and causes xfs/196 to fail. This is because xfs/196
+writes to the file and populates the page cache before it turns on
+the error injection and starts failing -overwrites-.
+
+The result is that the original drop-writes code failed writes only
+-after- overwriting the data in the cache, followed by invalidating
+the cached data, then punching out the delalloc extent from under
+that data.
+
+On the surface, this looks fine. The problem is that page cache
+invalidation *doesn't guarantee that it removes anything from the
+page cache* and it doesn't change the dirty state of the folio. When
+block size == page size and we do page aligned IO (as xfs/196 does)
+everything happens to align perfectly and page cache invalidation
+removes the single page folios that span the written data. Hence the
+followup delalloc punch pass does not find cached data over that
+range and it can punch the extent out.
+
+IOWs, xfs/196 "works" for block size == page size with the new
+code. I say "works", because it actually only works for the case
+where IO is page aligned, and no data was read from disk before
+writes occur. Because the moment we actually read data first, the
+readahead code allocates multipage folios and suddenly the
+invalidate code goes back to zeroing subfolio ranges without
+changing dirty state.
+
+Hence, with multipage folios in play, block size == page size is
+functionally identical to block size < page size behaviour, and
+drop-writes is manifestly broken w.r.t. this case. Invalidation of
+a subfolio range doesn't result in the folio being removed from the
+cache, just the range gets zeroed. Hence after we've sequentially
+walked over a folio that we've dirtied (via write data) and then
+invalidated, we end up with a dirty folio full of zeroed data.
+
+And because the new code skips punching ranges that have dirty
+folios covering them, we end up leaving the delalloc range intact
+after failing all the writes. Hence failed writes now end up
+writing zeroes to disk in the cases where invalidation zeroes folios
+rather than removing them from cache.
+
+This is a fundamental change of behaviour that is needed to avoid
+the data corruption vectors that exist in the old write fail path,
+and it renders the drop-writes injection non-functional and
+unworkable as it stands.
+
+As it is, I think the error injection is also now unnecessary, as
+partial writes that need delalloc extent are going to be a lot more
+common with stale iomap detection in place. Hence this patch removes
+the drop-writes error injection completely. xfs/196 can remain for
+testing kernels that don't have this data corruption fix, but those
+that do will report:
+
+xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_errortag.h | 12 +++++-------
+ fs/xfs/xfs_error.c | 27 ++++++++++++++++++++-------
+ fs/xfs/xfs_iomap.c | 9 ---------
+ 3 files changed, 25 insertions(+), 23 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_errortag.h
++++ b/fs/xfs/libxfs/xfs_errortag.h
+@@ -40,13 +40,12 @@
+ #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25
+ #define XFS_ERRTAG_BMAP_FINISH_ONE 26
+ #define XFS_ERRTAG_AG_RESV_CRITICAL 27
++
+ /*
+- * DEBUG mode instrumentation to test and/or trigger delayed allocation
+- * block killing in the event of failed writes. When enabled, all
+- * buffered writes are silenty dropped and handled as if they failed.
+- * All delalloc blocks in the range of the write (including pre-existing
+- * delalloc blocks!) are tossed as part of the write failure error
+- * handling sequence.
++ * Drop-writes support removed because write error handling cannot trash
++ * pre-existing delalloc extents in any useful way anymore. We retain the
++ * definition so that we can reject it as an invalid value in
++ * xfs_errortag_valid().
+ */
+ #define XFS_ERRTAG_DROP_WRITES 28
+ #define XFS_ERRTAG_LOG_BAD_CRC 29
+@@ -95,7 +94,6 @@
+ #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1
+ #define XFS_RANDOM_BMAP_FINISH_ONE 1
+ #define XFS_RANDOM_AG_RESV_CRITICAL 4
+-#define XFS_RANDOM_DROP_WRITES 1
+ #define XFS_RANDOM_LOG_BAD_CRC 1
+ #define XFS_RANDOM_LOG_ITEM_PIN 1
+ #define XFS_RANDOM_BUF_LRU_REF 2
+--- a/fs/xfs/xfs_error.c
++++ b/fs/xfs/xfs_error.c
+@@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_
+ XFS_RANDOM_REFCOUNT_FINISH_ONE,
+ XFS_RANDOM_BMAP_FINISH_ONE,
+ XFS_RANDOM_AG_RESV_CRITICAL,
+- XFS_RANDOM_DROP_WRITES,
++ 0, /* XFS_RANDOM_DROP_WRITES has been removed */
+ XFS_RANDOM_LOG_BAD_CRC,
+ XFS_RANDOM_LOG_ITEM_PIN,
+ XFS_RANDOM_BUF_LRU_REF,
+@@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_u
+ XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE);
+ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE);
+ XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL);
+-XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES);
+ XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC);
+ XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN);
+ XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF);
+@@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_at
+ XFS_ERRORTAG_ATTR_LIST(refcount_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(bmap_finish_one),
+ XFS_ERRORTAG_ATTR_LIST(ag_resv_critical),
+- XFS_ERRORTAG_ATTR_LIST(drop_writes),
+ XFS_ERRORTAG_ATTR_LIST(log_bad_crc),
+ XFS_ERRORTAG_ATTR_LIST(log_item_pin),
+ XFS_ERRORTAG_ATTR_LIST(buf_lru_ref),
+@@ -256,6 +254,19 @@ xfs_errortag_del(
+ kmem_free(mp->m_errortag);
+ }
+
++static bool
++xfs_errortag_valid(
++ unsigned int error_tag)
++{
++ if (error_tag >= XFS_ERRTAG_MAX)
++ return false;
++
++ /* Error out removed injection types */
++ if (error_tag == XFS_ERRTAG_DROP_WRITES)
++ return false;
++ return true;
++}
++
+ bool
+ xfs_errortag_test(
+ struct xfs_mount *mp,
+@@ -277,7 +288,9 @@ xfs_errortag_test(
+ if (!mp->m_errortag)
+ return false;
+
+- ASSERT(error_tag < XFS_ERRTAG_MAX);
++ if (!xfs_errortag_valid(error_tag))
++ return false;
++
+ randfactor = mp->m_errortag[error_tag];
+ if (!randfactor || prandom_u32_max(randfactor))
+ return false;
+@@ -293,7 +306,7 @@ xfs_errortag_get(
+ struct xfs_mount *mp,
+ unsigned int error_tag)
+ {
+- if (error_tag >= XFS_ERRTAG_MAX)
++ if (!xfs_errortag_valid(error_tag))
+ return -EINVAL;
+
+ return mp->m_errortag[error_tag];
+@@ -305,7 +318,7 @@ xfs_errortag_set(
+ unsigned int error_tag,
+ unsigned int tag_value)
+ {
+- if (error_tag >= XFS_ERRTAG_MAX)
++ if (!xfs_errortag_valid(error_tag))
+ return -EINVAL;
+
+ mp->m_errortag[error_tag] = tag_value;
+@@ -319,7 +332,7 @@ xfs_errortag_add(
+ {
+ BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
+
+- if (error_tag >= XFS_ERRTAG_MAX)
++ if (!xfs_errortag_valid(error_tag))
+ return -EINVAL;
+
+ return xfs_errortag_set(mp, error_tag,
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1190,15 +1190,6 @@ xfs_buffered_write_iomap_end(
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+ int error;
+
+- /*
+- * Behave as if the write failed if drop writes is enabled. Set the NEW
+- * flag to force delalloc cleanup.
+- */
+- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) {
+- iomap->flags |= IOMAP_F_NEW;
+- written = 0;
+- }
+-
+ error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
+ length, written, &xfs_buffered_write_delalloc_punch);
+ if (error && !xfs_is_shutdown(mp)) {
--- /dev/null
+From stable+bounces-42908-greg=kroah.com@vger.kernel.org Wed May 1 20:42:09 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:07 -0700
+Subject: xfs: estimate post-merge refcounts correctly
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Xiao Yang <yangx.jy@fujitsu.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-19-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit b25d1984aa884fc91a73a5a407b9ac976d441e9b ]
+
+Upon enabling fsdax + reflink for XFS, xfs/179 began to report refcount
+metadata corruptions after being run. Specifically, xfs_repair noticed
+single-block refcount records that could be combined but had not been.
+
+The root cause of this is improper MAXREFCOUNT edge case handling in
+xfs_refcount_merge_extents. When we're trying to find candidates for a
+refcount btree record merge, we compute the refcount attribute of the
+merged record, but we fail to account for the fact that once a record
+hits rc_refcount == MAXREFCOUNT, it is pinned that way forever. Hence
+the computed refcount is wrong, and we fail to merge the extents.
+
+Fix this by adjusting the merge predicates to compute the adjusted
+refcount correctly.
+
+Fixes: 3172725814f9 ("xfs: adjust refcount of an extent of blocks in refcount btree")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Xiao Yang <yangx.jy@fujitsu.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_refcount.c | 25 +++++++++++++++++++++----
+ 1 file changed, 21 insertions(+), 4 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -820,6 +820,17 @@ xfs_refc_valid(
+ return rc->rc_startblock != NULLAGBLOCK;
+ }
+
++static inline xfs_nlink_t
++xfs_refc_merge_refcount(
++ const struct xfs_refcount_irec *irec,
++ enum xfs_refc_adjust_op adjust)
++{
++ /* Once a record hits MAXREFCOUNT, it is pinned there forever */
++ if (irec->rc_refcount == MAXREFCOUNT)
++ return MAXREFCOUNT;
++ return irec->rc_refcount + adjust;
++}
++
+ static inline bool
+ xfs_refc_want_merge_center(
+ const struct xfs_refcount_irec *left,
+@@ -831,6 +842,7 @@ xfs_refc_want_merge_center(
+ unsigned long long *ulenp)
+ {
+ unsigned long long ulen = left->rc_blockcount;
++ xfs_nlink_t new_refcount;
+
+ /*
+ * To merge with a center record, both shoulder records must be
+@@ -846,9 +858,10 @@ xfs_refc_want_merge_center(
+ return false;
+
+ /* The shoulder record refcounts must match the new refcount. */
+- if (left->rc_refcount != cleft->rc_refcount + adjust)
++ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
++ if (left->rc_refcount != new_refcount)
+ return false;
+- if (right->rc_refcount != cleft->rc_refcount + adjust)
++ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
+@@ -871,6 +884,7 @@ xfs_refc_want_merge_left(
+ enum xfs_refc_adjust_op adjust)
+ {
+ unsigned long long ulen = left->rc_blockcount;
++ xfs_nlink_t new_refcount;
+
+ /*
+ * For a left merge, the left shoulder record must be adjacent to the
+@@ -881,7 +895,8 @@ xfs_refc_want_merge_left(
+ return false;
+
+ /* Left shoulder record refcount must match the new refcount. */
+- if (left->rc_refcount != cleft->rc_refcount + adjust)
++ new_refcount = xfs_refc_merge_refcount(cleft, adjust);
++ if (left->rc_refcount != new_refcount)
+ return false;
+
+ /*
+@@ -903,6 +918,7 @@ xfs_refc_want_merge_right(
+ enum xfs_refc_adjust_op adjust)
+ {
+ unsigned long long ulen = right->rc_blockcount;
++ xfs_nlink_t new_refcount;
+
+ /*
+ * For a right merge, the right shoulder record must be adjacent to the
+@@ -913,7 +929,8 @@ xfs_refc_want_merge_right(
+ return false;
+
+ /* Right shoulder record refcount must match the new refcount. */
+- if (right->rc_refcount != cright->rc_refcount + adjust)
++ new_refcount = xfs_refc_merge_refcount(cright, adjust);
++ if (right->rc_refcount != new_refcount)
+ return false;
+
+ /*
--- /dev/null
+From stable+bounces-42900-greg=kroah.com@vger.kernel.org Wed May 1 20:41:50 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:59 -0700
+Subject: xfs: fix incorrect error-out in xfs_remove
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Andrey Albershteyn <aalbersh@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-11-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 2653d53345bda90604f673bb211dd060a5a5c232 ]
+
+Clean up resources if resetting the dotdot entry doesn't succeed.
+Observed through code inspection.
+
+Fixes: 5838d0356bb3 ("xfs: reset child dir '..' entry when unlinking child")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Andrey Albershteyn <aalbersh@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_inode.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_inode.c
++++ b/fs/xfs/xfs_inode.c
+@@ -2479,7 +2479,7 @@ xfs_remove(
+ error = xfs_dir_replace(tp, ip, &xfs_name_dotdot,
+ tp->t_mountp->m_sb.sb_rootino, 0);
+ if (error)
+- return error;
++ goto out_trans_cancel;
+ }
+ } else {
+ /*
--- /dev/null
+From stable+bounces-42902-greg=kroah.com@vger.kernel.org Wed May 1 20:41:56 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:01 -0700
+Subject: xfs: fix incorrect i_nlink caused by inode racing
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Long Li <leo.lilong@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-13-leah.rumancik@gmail.com>
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit 28b4b0596343d19d140da059eee0e5c2b5328731 ]
+
+The following error occurred during the fsstress test:
+
+XFS: Assertion failed: VFS_I(ip)->i_nlink >= 2, file: fs/xfs/xfs_inode.c, line: 2452
+
+The problem was that inode race condition causes incorrect i_nlink to be
+written to disk, and then it is read into memory. Consider the following
+call graph, inodes that are marked as both XFS_IFLUSHING and
+XFS_IRECLAIMABLE, i_nlink will be reset to 1 and then restored to original
+value in xfs_reinit_inode(). Therefore, the i_nlink of directory on disk
+may be set to 1.
+
+ xfsaild
+ xfs_inode_item_push
+ xfs_iflush_cluster
+ xfs_iflush
+ xfs_inode_to_disk
+
+ xfs_iget
+ xfs_iget_cache_hit
+ xfs_iget_recycle
+ xfs_reinit_inode
+ inode_init_always
+
+xfs_reinit_inode() needs to hold the ILOCK_EXCL as it is changing internal
+inode state and can race with other RCU protected inode lookups. On the
+read side, xfs_iflush_cluster() grabs the ILOCK_SHARED while under rcu +
+ip->i_flags_lock, and so xfs_iflush/xfs_inode_to_disk() are protected from
+racing inode updates (during transactions) by that lock.
+
+Fixes: ff7bebeb91f8 ("xfs: refactor the inode recycling code") # goes further back than this
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_icache.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/fs/xfs/xfs_icache.c
++++ b/fs/xfs/xfs_icache.c
+@@ -342,6 +342,9 @@ xfs_iget_recycle(
+
+ trace_xfs_iget_recycle(ip);
+
++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
++ return -EAGAIN;
++
+ /*
+ * We need to make it look like the inode is being reclaimed to prevent
+ * the actual reclaim workers from stomping over us while we recycle
+@@ -355,6 +358,7 @@ xfs_iget_recycle(
+
+ ASSERT(!rwsem_is_locked(&inode->i_rwsem));
+ error = xfs_reinit_inode(mp, inode);
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+@@ -523,6 +527,8 @@ xfs_iget_cache_hit(
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ /* Drops i_flags_lock and RCU read lock. */
+ error = xfs_iget_recycle(pag, ip);
++ if (error == -EAGAIN)
++ goto out_skip;
+ if (error)
+ return error;
+ } else {
--- /dev/null
+From stable+bounces-42911-greg=kroah.com@vger.kernel.org Wed May 1 20:42:18 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:10 -0700
+Subject: xfs: fix log recovery when unknown rocompat bits are set
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-22-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 74ad4693b6473950e971b3dc525b5ee7570e05d0 ]
+
+Log recovery has always run on read only mounts, even where the primary
+superblock advertises unknown rocompat bits. Due to a misunderstanding
+between Eric and Darrick back in 2018, we accidentally changed the
+superblock write verifier to shutdown the fs over that exact scenario.
+As a result, the log cleaning that occurs at the end of the mounting
+process fails if there are unknown rocompat bits set.
+
+As we now allow writing of the superblock if there are unknown rocompat
+bits set on a RO mount, we no longer want to turn off RO state to allow
+log recovery to succeed on a RO mount. Hence we also remove all the
+(now unnecessary) RO state toggling from the log recovery path.
+
+Fixes: 9e037cb7972f ("xfs: check for unknown v5 feature bits in superblock write verifier")
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_sb.c | 3 ++-
+ fs/xfs/xfs_log.c | 17 -----------------
+ 2 files changed, 2 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_sb.c
++++ b/fs/xfs/libxfs/xfs_sb.c
+@@ -266,7 +266,8 @@ xfs_validate_sb_write(
+ return -EFSCORRUPTED;
+ }
+
+- if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
++ if (!xfs_is_readonly(mp) &&
++ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) {
+ xfs_alert(mp,
+ "Corruption detected in superblock read-only compatible features (0x%x)!",
+ (sbp->sb_features_ro_compat &
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -730,15 +730,7 @@ xfs_log_mount(
+ * just worked.
+ */
+ if (!xfs_has_norecovery(mp)) {
+- /*
+- * log recovery ignores readonly state and so we need to clear
+- * mount-based read only state so it can write to disk.
+- */
+- bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY,
+- &mp->m_opstate);
+ error = xlog_recover(log);
+- if (readonly)
+- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (error) {
+ xfs_warn(mp, "log mount/recovery failed: error %d",
+ error);
+@@ -787,7 +779,6 @@ xfs_log_mount_finish(
+ struct xfs_mount *mp)
+ {
+ struct xlog *log = mp->m_log;
+- bool readonly;
+ int error = 0;
+
+ if (xfs_has_norecovery(mp)) {
+@@ -796,12 +787,6 @@ xfs_log_mount_finish(
+ }
+
+ /*
+- * log recovery ignores readonly state and so we need to clear
+- * mount-based read only state so it can write to disk.
+- */
+- readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+-
+- /*
+ * During the second phase of log recovery, we need iget and
+ * iput to behave like they do for an active filesystem.
+ * xfs_fs_drop_inode needs to be able to prevent the deletion
+@@ -850,8 +835,6 @@ xfs_log_mount_finish(
+ xfs_buftarg_drain(mp->m_ddev_targp);
+
+ clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
+- if (readonly)
+- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+
+ /* Make sure the log is dead if we're returning failure. */
+ ASSERT(!error || xlog_is_shutdown(log));
--- /dev/null
+From stable+bounces-42898-greg=kroah.com@vger.kernel.org Wed May 1 20:41:46 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:58 -0700
+Subject: xfs: fix off-by-one-block in xfs_discard_folio()
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Pengfei Xu <pengfei.xu@intel.com>, Brian Foster <bfoster@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-10-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 8ac5b996bf5199f15b7687ceae989f8b2a410dda ]
+
+The recent writeback corruption fixes changed the code in
+xfs_discard_folio() to calculate a byte range for punching
+delalloc extents. A mistake was made in using round_up(pos) for the
+end offset, because when pos points at the first byte of a block, it
+does not get rounded up to point to the end byte of the block. Hence
+the punch range is short, and this leads to unexpected behaviour in
+certain cases in xfs_bmap_punch_delalloc_range.
+
+e.g. pos = 0 means we call xfs_bmap_punch_delalloc_range(0,0), so
+there is no previous extent and it rounds up the punch to the end of
+the delalloc extent it found at offset 0, not the end of the range
+given to xfs_bmap_punch_delalloc_range().
+
+Fix this by handling the zero block offset case correctly.
+
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=217030
+Link: https://lore.kernel.org/linux-xfs/Y+vOfaxIWX1c%2Fyy9@bfoster/
+Fixes: 7348b322332d ("xfs: xfs_bmap_punch_delalloc_range() should take a byte range")
+Reported-by: Pengfei Xu <pengfei.xu@intel.com>
+Found-by: Brian Foster <bfoster@redhat.com>
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c | 21 ++++++++++++++-------
+ 1 file changed, 14 insertions(+), 7 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -439,15 +439,17 @@ xfs_prepare_ioend(
+ }
+
+ /*
+- * If the page has delalloc blocks on it, we need to punch them out before we
+- * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+- * inode that can trip up a later direct I/O read operation on the same region.
++ * If the folio has delalloc blocks on it, the caller is asking us to punch them
++ * out. If we don't, we can leave a stale delalloc mapping covered by a clean
++ * page that needs to be dirtied again before the delalloc mapping can be
++ * converted. This stale delalloc mapping can trip up a later direct I/O read
++ * operation on the same region.
+ *
+- * We prevent this by truncating away the delalloc regions on the page. Because
++ * We prevent this by truncating away the delalloc regions on the folio. Because
+ * they are delalloc, we can do this without needing a transaction. Indeed - if
+ * we get ENOSPC errors, we have to be able to do this truncation without a
+- * transaction as there is no space left for block reservation (typically why we
+- * see a ENOSPC in writeback).
++ * transaction as there is no space left for block reservation (typically why
++ * we see a ENOSPC in writeback).
+ */
+ static void
+ xfs_discard_folio(
+@@ -465,8 +467,13 @@ xfs_discard_folio(
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
++ /*
++ * The end of the punch range is always the offset of the the first
++ * byte of the next folio. Hence the end offset is only dependent on the
++ * folio itself and not the start offset that is passed in.
++ */
+ error = xfs_bmap_punch_delalloc_range(ip, pos,
+- round_up(pos, folio_size(folio)));
++ folio_pos(folio) + folio_size(folio));
+
+ if (error && !xfs_is_shutdown(mp))
+ xfs_alert(mp, "page discard unable to remove delalloc mapping.");
--- /dev/null
+From stable+bounces-42901-greg=kroah.com@vger.kernel.org Wed May 1 20:41:52 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:00 -0700
+Subject: xfs: fix sb write verify for lazysbcount
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Long Li <leo.lilong@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-12-leah.rumancik@gmail.com>
+
+From: Long Li <leo.lilong@huawei.com>
+
+[ Upstream commit 59f6ab40fd8735c9a1a15401610a31cc06a0bbd6 ]
+
+When lazysbcount is enabled, fsstress and loop mount/unmount test report
+the following problems:
+
+XFS (loop0): SB summary counter sanity check failed
+XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460,
+ xfs_sb block 0x0
+XFS (loop0): Unmount and run xfs_repair
+XFS (loop0): First 128 bytes of corrupted metadata buffer:
+00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(..
+00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................
+00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z
+00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... ..........
+00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................
+00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................
+00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................
+00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................
+XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply
+ +0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem.
+XFS (loop0): Please unmount the filesystem and rectify the problem(s)
+XFS (loop0): log mount/recovery failed: error -117
+XFS (loop0): log mount failed
+
+This corruption will shut down the file system and the file system will
+no longer be mountable. The following script can reproduce the problem,
+but it may take a long time.
+
+ #!/bin/bash
+
+ device=/dev/sda
+ testdir=/mnt/test
+ round=0
+
+ function fail()
+ {
+ echo "$*"
+ exit 1
+ }
+
+ mkdir -p $testdir
+ while [ $round -lt 10000 ]
+ do
+ echo "******* round $round ********"
+ mkfs.xfs -f $device
+ mount $device $testdir || fail "mount failed!"
+ fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null &
+ sleep 4
+ killall -w fsstress
+ umount $testdir
+ xfs_repair -e $device > /dev/null
+ if [ $? -eq 2 ];then
+ echo "ERR CODE 2: Dirty log exception during repair."
+ exit 1
+ fi
+ round=$(($round+1))
+ done
+
+With lazysbcount enabled, there is no additional lock protection for
+reading m_ifree and m_icount in xfs_log_sb(). If another CPU modifies
+m_ifree concurrently, m_ifree can appear greater than m_icount. For example,
+consider the following sequence, where ifreedelta is positive:
+
+ CPU0 CPU1
+ xfs_log_sb xfs_trans_unreserve_and_mod_sb
+ ---------- ------------------------------
+ percpu_counter_sum(&mp->m_icount)
+ percpu_counter_add_batch(&mp->m_icount,
+ idelta, XFS_ICOUNT_BATCH)
+ percpu_counter_add(&mp->m_ifree, ifreedelta);
+ percpu_counter_sum(&mp->m_ifree)
+
+After this, an incorrect inode count (sb_ifree > sb_icount) will be written
+to the log. On the subsequent superblock write, the incorrect inode count
+(sb_ifree > sb_icount) fails the bounds check in xfs_validate_sb_write(),
+which causes the file system to shut down.
+
+When lazysbcount is enabled, we don't need to guarantee that Lazy sb
+counters are completely correct, but we do need to guarantee that sb_ifree
+<= sb_icount. On the other hand, the constraint that m_ifree <= m_icount
+must be satisfied any time that there /cannot/ be other threads allocating
+or freeing inode chunks. If the constraint is violated under these
+circumstances, sb_i{count,free} (the ondisk superblock inode counters)
+may be incorrect and need to be marked sick at unmount; the counts will
+be rebuilt on the next mount.
+
+Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks")
+Signed-off-by: Long Li <leo.lilong@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_sb.c | 4 +++-
+ fs/xfs/xfs_mount.c | 15 +++++++++++++++
+ 2 files changed, 18 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_sb.c
++++ b/fs/xfs/libxfs/xfs_sb.c
+@@ -973,7 +973,9 @@ xfs_log_sb(
+ */
+ if (xfs_has_lazysbcount(mp)) {
+ mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
++ mp->m_sb.sb_ifree = min_t(uint64_t,
++ percpu_counter_sum(&mp->m_ifree),
++ mp->m_sb.sb_icount);
+ mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ }
+
+--- a/fs/xfs/xfs_mount.c
++++ b/fs/xfs/xfs_mount.c
+@@ -538,6 +538,20 @@ xfs_check_summary_counts(
+ return 0;
+ }
+
++static void
++xfs_unmount_check(
++ struct xfs_mount *mp)
++{
++ if (xfs_is_shutdown(mp))
++ return;
++
++ if (percpu_counter_sum(&mp->m_ifree) >
++ percpu_counter_sum(&mp->m_icount)) {
++ xfs_alert(mp, "ifree/icount mismatch at unmount");
++ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
++ }
++}
++
+ /*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+@@ -1077,6 +1091,7 @@ xfs_unmountfs(
+ if (error)
+ xfs_warn(mp, "Unable to free reserved block pool. "
+ "Freespace may not be correct on next mount.");
++ xfs_unmount_check(mp);
+
+ xfs_log_unmount(mp);
+ xfs_da_unmount(mp);
--- /dev/null
+From stable+bounces-42906-greg=kroah.com@vger.kernel.org Wed May 1 20:42:06 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:05 -0700
+Subject: xfs: fix super block buf log item UAF during force shutdown
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Guo Xuenan <guoxuenan@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-17-leah.rumancik@gmail.com>
+
+From: Guo Xuenan <guoxuenan@huawei.com>
+
+[ Upstream commit 575689fc0ffa6c4bb4e72fd18e31a6525a6124e0 ]
+
+An xfs log I/O error triggers an xlog shutdown, and the end_io worker calls
+xlog_state_shutdown_callbacks to unpin and release the buf log item.
+The race is that a thread doing a transaction commit may happen not to be
+intercepted by xlog_is_shutdown; its log items are then inserted into the
+CIL, and when those buf log items are unpinned and released, a UAF
+occurs. BTW, adding a delay before `xlog_cil_commit` increases the
+probability of reproducing the problem.
+
+The following call graph actually encountered this bad situation.
+fsstress io end worker kworker/0:1H-216
+ xlog_ioend_work
+ ->xlog_force_shutdown
+ ->xlog_state_shutdown_callbacks
+ ->xlog_cil_process_committed
+ ->xlog_cil_committed
+ ->xfs_trans_committed_bulk
+->xfs_trans_apply_sb_deltas ->li_ops->iop_unpin(lip, 1);
+ ->xfs_trans_getsb
+ ->_xfs_trans_bjoin
+ ->xfs_buf_item_init
+ ->if (bip) { return 0;} //relog
+->xlog_cil_commit
+ ->xlog_cil_insert_items //insert into CIL
+ ->xfs_buf_ioend_fail(bp);
+ ->xfs_buf_ioend
+ ->xfs_buf_item_done
+ ->xfs_buf_item_relse
+ ->xfs_buf_item_free
+
+When the CIL push worker gathers the percpu CIL and inserts the super block
+buf log item into ctx->log_items, the UAF occurs.
+
+==================================================================
+BUG: KASAN: use-after-free in xlog_cil_push_work+0x1c8f/0x22f0
+Write of size 8 at addr ffff88801800f3f0 by task kworker/u4:4/105
+
+CPU: 0 PID: 105 Comm: kworker/u4:4 Tainted: G W
+6.1.0-rc1-00001-g274115149b42 #136
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+1.13.0-1ubuntu1.1 04/01/2014
+Workqueue: xfs-cil/sda xlog_cil_push_work
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x4d/0x66
+ print_report+0x171/0x4a6
+ kasan_report+0xb3/0x130
+ xlog_cil_push_work+0x1c8f/0x22f0
+ process_one_work+0x6f9/0xf70
+ worker_thread+0x578/0xf30
+ kthread+0x28c/0x330
+ ret_from_fork+0x1f/0x30
+ </TASK>
+
+Allocated by task 2145:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ __kasan_slab_alloc+0x54/0x60
+ kmem_cache_alloc+0x14a/0x510
+ xfs_buf_item_init+0x160/0x6d0
+ _xfs_trans_bjoin+0x7f/0x2e0
+ xfs_trans_getsb+0xb6/0x3f0
+ xfs_trans_apply_sb_deltas+0x1f/0x8c0
+ __xfs_trans_commit+0xa25/0xe10
+ xfs_symlink+0xe23/0x1660
+ xfs_vn_symlink+0x157/0x280
+ vfs_symlink+0x491/0x790
+ do_symlinkat+0x128/0x220
+ __x64_sys_symlink+0x7a/0x90
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+Freed by task 216:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ kasan_save_free_info+0x2a/0x40
+ __kasan_slab_free+0x105/0x1a0
+ kmem_cache_free+0xb6/0x460
+ xfs_buf_ioend+0x1e9/0x11f0
+ xfs_buf_item_unpin+0x3d6/0x840
+ xfs_trans_committed_bulk+0x4c2/0x7c0
+ xlog_cil_committed+0xab6/0xfb0
+ xlog_cil_process_committed+0x117/0x1e0
+ xlog_state_shutdown_callbacks+0x208/0x440
+ xlog_force_shutdown+0x1b3/0x3a0
+ xlog_ioend_work+0xef/0x1d0
+ process_one_work+0x6f9/0xf70
+ worker_thread+0x578/0xf30
+ kthread+0x28c/0x330
+ ret_from_fork+0x1f/0x30
+
+The buggy address belongs to the object at ffff88801800f388
+ which belongs to the cache xfs_buf_item of size 272
+The buggy address is located 104 bytes inside of
+ 272-byte region [ffff88801800f388, ffff88801800f498)
+
+The buggy address belongs to the physical page:
+page:ffffea0000600380 refcount:1 mapcount:0 mapping:0000000000000000
+index:0xffff88801800f208 pfn:0x1800e
+head:ffffea0000600380 order:1 compound_mapcount:0 compound_pincount:0
+flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff)
+raw: 001fffff80010200 ffffea0000699788 ffff88801319db50 ffff88800fb50640
+raw: ffff88801800f208 000000000015000a 00000001ffffffff 0000000000000000
+page dumped because: kasan: bad access detected
+
+Memory state around the buggy address:
+ ffff88801800f280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff88801800f300: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc
+>ffff88801800f380: fc fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ^
+ ffff88801800f400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff88801800f480: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc
+==================================================================
+Disabling lock debugging due to kernel taint
+
+Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf_item.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/xfs/xfs_buf_item.c
++++ b/fs/xfs/xfs_buf_item.c
+@@ -1018,6 +1018,8 @@ xfs_buf_item_relse(
+ trace_xfs_buf_item_relse(bp, _RET_IP_);
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
+
++ if (atomic_read(&bip->bli_refcount))
++ return;
+ bp->b_log_item = NULL;
+ xfs_buf_rele(bp);
+ xfs_buf_item_free(bip);
--- /dev/null
+From stable+bounces-42912-greg=kroah.com@vger.kernel.org Wed May 1 20:42:18 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:11 -0700
+Subject: xfs: get root inode correctly at bulkstat
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Hironori Shiina <shiina.hironori@gmail.com>, Hironori Shiina <shiina.hironori@fujitsu.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-23-leah.rumancik@gmail.com>
+
+From: Hironori Shiina <shiina.hironori@gmail.com>
+
+[ Upstream commit 817644fa4525258992f17fecf4f1d6cdd2e1b731 ]
+
+The root inode number should be set to `breq->startino` for getting stat
+information of the root when XFS_BULK_IREQ_SPECIAL_ROOT is used.
+Otherwise, the inode search is started from 1
+(XFS_BULK_IREQ_SPECIAL_ROOT) and the inode with the lowest number in a
+filesystem is returned.
+
+Fixes: bf3cb3944792 ("xfs: allow single bulkstat of special inodes")
+Signed-off-by: Hironori Shiina <shiina.hironori@fujitsu.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_ioctl.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/fs/xfs/xfs_ioctl.c
++++ b/fs/xfs/xfs_ioctl.c
+@@ -754,7 +754,7 @@ xfs_bulkstat_fmt(
+ static int
+ xfs_bulk_ireq_setup(
+ struct xfs_mount *mp,
+- struct xfs_bulk_ireq *hdr,
++ const struct xfs_bulk_ireq *hdr,
+ struct xfs_ibulk *breq,
+ void __user *ubuffer)
+ {
+@@ -780,7 +780,7 @@ xfs_bulk_ireq_setup(
+
+ switch (hdr->ino) {
+ case XFS_BULK_IREQ_SPECIAL_ROOT:
+- hdr->ino = mp->m_sb.sb_rootino;
++ breq->startino = mp->m_sb.sb_rootino;
+ break;
+ default:
+ return -EINVAL;
--- /dev/null
+From stable+bounces-42907-greg=kroah.com@vger.kernel.org Wed May 1 20:42:07 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:06 -0700
+Subject: xfs: hoist refcount record merge predicates
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Xiao Yang <yangx.jy@fujitsu.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-18-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 9d720a5a658f5135861773f26e927449bef93d61 ]
+
+Hoist these multiline conditionals into separate static inline helpers
+to improve readability and set the stage for corruption fixes that will
+be introduced in the next patch.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Xiao Yang <yangx.jy@fujitsu.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_refcount.c | 129 +++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 113 insertions(+), 16 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_refcount.c
++++ b/fs/xfs/libxfs/xfs_refcount.c
+@@ -815,11 +815,119 @@ out_error:
+ /* Is this extent valid? */
+ static inline bool
+ xfs_refc_valid(
+- struct xfs_refcount_irec *rc)
++ const struct xfs_refcount_irec *rc)
+ {
+ return rc->rc_startblock != NULLAGBLOCK;
+ }
+
++static inline bool
++xfs_refc_want_merge_center(
++ const struct xfs_refcount_irec *left,
++ const struct xfs_refcount_irec *cleft,
++ const struct xfs_refcount_irec *cright,
++ const struct xfs_refcount_irec *right,
++ bool cleft_is_cright,
++ enum xfs_refc_adjust_op adjust,
++ unsigned long long *ulenp)
++{
++ unsigned long long ulen = left->rc_blockcount;
++
++ /*
++ * To merge with a center record, both shoulder records must be
++ * adjacent to the record we want to adjust. This is only true if
++ * find_left and find_right made all four records valid.
++ */
++ if (!xfs_refc_valid(left) || !xfs_refc_valid(right) ||
++ !xfs_refc_valid(cleft) || !xfs_refc_valid(cright))
++ return false;
++
++ /* There must only be one record for the entire range. */
++ if (!cleft_is_cright)
++ return false;
++
++ /* The shoulder record refcounts must match the new refcount. */
++ if (left->rc_refcount != cleft->rc_refcount + adjust)
++ return false;
++ if (right->rc_refcount != cleft->rc_refcount + adjust)
++ return false;
++
++ /*
++ * The new record cannot exceed the max length. ulen is a ULL as the
++ * individual record block counts can be up to (u32 - 1) in length
++ * hence we need to catch u32 addition overflows here.
++ */
++ ulen += cleft->rc_blockcount + right->rc_blockcount;
++ if (ulen >= MAXREFCEXTLEN)
++ return false;
++
++ *ulenp = ulen;
++ return true;
++}
++
++static inline bool
++xfs_refc_want_merge_left(
++ const struct xfs_refcount_irec *left,
++ const struct xfs_refcount_irec *cleft,
++ enum xfs_refc_adjust_op adjust)
++{
++ unsigned long long ulen = left->rc_blockcount;
++
++ /*
++ * For a left merge, the left shoulder record must be adjacent to the
++ * start of the range. If this is true, find_left made left and cleft
++ * contain valid contents.
++ */
++ if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft))
++ return false;
++
++ /* Left shoulder record refcount must match the new refcount. */
++ if (left->rc_refcount != cleft->rc_refcount + adjust)
++ return false;
++
++ /*
++ * The new record cannot exceed the max length. ulen is a ULL as the
++ * individual record block counts can be up to (u32 - 1) in length
++ * hence we need to catch u32 addition overflows here.
++ */
++ ulen += cleft->rc_blockcount;
++ if (ulen >= MAXREFCEXTLEN)
++ return false;
++
++ return true;
++}
++
++static inline bool
++xfs_refc_want_merge_right(
++ const struct xfs_refcount_irec *cright,
++ const struct xfs_refcount_irec *right,
++ enum xfs_refc_adjust_op adjust)
++{
++ unsigned long long ulen = right->rc_blockcount;
++
++ /*
++ * For a right merge, the right shoulder record must be adjacent to the
++ * end of the range. If this is true, find_right made cright and right
++ * contain valid contents.
++ */
++ if (!xfs_refc_valid(right) || !xfs_refc_valid(cright))
++ return false;
++
++ /* Right shoulder record refcount must match the new refcount. */
++ if (right->rc_refcount != cright->rc_refcount + adjust)
++ return false;
++
++ /*
++ * The new record cannot exceed the max length. ulen is a ULL as the
++ * individual record block counts can be up to (u32 - 1) in length
++ * hence we need to catch u32 addition overflows here.
++ */
++ ulen += cright->rc_blockcount;
++ if (ulen >= MAXREFCEXTLEN)
++ return false;
++
++ return true;
++}
++
+ /*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ */
+@@ -861,23 +969,15 @@ xfs_refcount_merge_extents(
+ (cleft.rc_blockcount == cright.rc_blockcount);
+
+ /* Try to merge left, cleft, and right. cleft must == cright. */
+- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+- right.rc_blockcount;
+- if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+- xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+- left.rc_refcount == cleft.rc_refcount + adjust &&
+- right.rc_refcount == cleft.rc_refcount + adjust &&
+- ulen < MAXREFCEXTLEN) {
++ if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal,
++ adjust, &ulen)) {
+ *shape_changed = true;
+ return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+ &right, ulen, aglen);
+ }
+
+ /* Try to merge left and cleft. */
+- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+- if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+- left.rc_refcount == cleft.rc_refcount + adjust &&
+- ulen < MAXREFCEXTLEN) {
++ if (xfs_refc_want_merge_left(&left, &cleft, adjust)) {
+ *shape_changed = true;
+ error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+ agbno, aglen);
+@@ -893,10 +993,7 @@ xfs_refcount_merge_extents(
+ }
+
+ /* Try to merge cright and right. */
+- ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+- if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+- right.rc_refcount == cright.rc_refcount + adjust &&
+- ulen < MAXREFCEXTLEN) {
++ if (xfs_refc_want_merge_right(&cright, &right, adjust)) {
+ *shape_changed = true;
+ return xfs_refcount_merge_right_extent(cur, &right, &cright,
+ aglen);
--- /dev/null
+From stable+bounces-42903-greg=kroah.com@vger.kernel.org Wed May 1 20:41:59 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:02 -0700
+Subject: xfs: invalidate block device page cache during unmount
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Gao Xiang <hsiangkao@linux.alibaba.com>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-14-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit 032e160305f6872e590c77f11896fb28365c6d6c ]
+
+Every now and then I see fstests failures on aarch64 (64k pages) that
+trigger on the following sequence:
+
+mkfs.xfs $dev
+mount $dev $mnt
+touch $mnt/a
+umount $mnt
+xfs_db -c 'path /a' -c 'print' $dev
+
+99% of the time this succeeds, but every now and then xfs_db cannot find
+/a and fails. This turns out to be a race involving udev/blkid, the
+page cache for the block device, and the xfs_db process.
+
+udev is triggered whenever anyone closes a block device or unmounts it.
+The default udev rules invoke blkid to read the fs super and create
+symlinks to the bdev under /dev/disk. For this, it uses buffered reads
+through the page cache.
+
+xfs_db also uses buffered reads to examine metadata. There is no
+coordination between xfs_db and udev, which means that they can run
+concurrently. Note there is no coordination between the kernel and
+blkid either.
+
+On a system with 64k pages, the page cache can cache the superblock and
+the root inode (and hence the root dir) with the same 64k page. If
+udev spawns blkid after the mkfs and the system is busy enough that it
+is still running when xfs_db starts up, they'll both read from the same
+page in the pagecache.
+
+The unmount writes updated inode metadata to disk directly. The XFS
+buffer cache does not use the bdev pagecache, nor does it invalidate the
+pagecache on umount. If the above scenario occurs, the pagecache no
+longer reflects what's on disk, xfs_db reads the stale metadata, and
+fails to find /a. Most of the time this succeeds because closing a bdev
+invalidates the page cache, but when processes race, everyone loses.
+
+Fix the problem by invalidating the bdev pagecache after flushing the
+bdev, so that xfs_db will see up to date metadata.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Gao Xiang <hsiangkao@linux.alibaba.com>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_buf.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/xfs/xfs_buf.c
++++ b/fs/xfs/xfs_buf.c
+@@ -1945,6 +1945,7 @@ xfs_free_buftarg(
+ list_lru_destroy(&btp->bt_lru);
+
+ blkdev_issue_flush(btp->bt_bdev);
++ invalidate_bdev(btp->bt_bdev);
+ fs_put_dax(btp->bt_daxdev, btp->bt_mount);
+
+ kmem_free(btp);
--- /dev/null
+From stable+bounces-42909-greg=kroah.com@vger.kernel.org Wed May 1 20:42:12 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:08 -0700
+Subject: xfs: invalidate xfs_bufs when allocating cow extents
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" <djwong@kernel.org>, Dave Chinner <dchinner@redhat.com>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-20-leah.rumancik@gmail.com>
+
+From: "Darrick J. Wong" <djwong@kernel.org>
+
+[ Upstream commit ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 ]
+
+While investigating test failures in xfs/17[1-3] in alwayscow mode, I
+noticed through code inspection that xfs_bmap_alloc_userdata isn't
+setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW
+fork. COW staging extents should be flagged as USERDATA, since user
+data are persisted to these blocks before being remapped into a file.
+
+This mis-classification has a few impacts on the behavior of the system.
+First, the filestreams allocator is supposed to keep allocating from a
+chosen AG until it runs out of space in that AG. However, it only does
+that for USERDATA allocations, which means that COW allocations aren't
+tied to the filestreams AG. Fortunately, few people use filestreams, so
+nobody's noticed.
+
+A more serious problem is that xfs_alloc_ag_vextent_small looks for a
+buffer to invalidate *if* the USERDATA flag is set and the AG is so full
+that the allocation had to come from the AGFL because the cntbt is
+empty. The consequences of not invalidating the buffer are severe --
+if the AIL incorrectly checkpoints a buffer that is now being used to
+store user data, that action will clobber the user's written data.
+
+Fix filestreams and yet another data corruption vector by flagging COW
+allocations as USERDATA.
+
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Reviewed-by: Dave Chinner <dchinner@redhat.com>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata(
+ * the busy list.
+ */
+ bma->datatype = XFS_ALLOC_NOBUSY;
+- if (whichfork == XFS_DATA_FORK) {
++ if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) {
+ bma->datatype |= XFS_ALLOC_USERDATA;
+ if (bma->offset == 0)
+ bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA;
--- /dev/null
+From stable+bounces-42893-greg=kroah.com@vger.kernel.org Wed May 1 20:41:32 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:52 -0700
+Subject: xfs,iomap: move delalloc punching to iomap
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-4-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 9c7babf94a0d686b552e53aded8d4703d1b8b92b ]
+
+Because that's what Christoph wants for this error handling path
+only XFS uses.
+
+It requires a new iomap export for handling errors over delalloc
+ranges. This is basically the XFS code as it stands, but even though
+Christoph wants this as iomap functionality, we still have
+to call it from the filesystem specific ->iomap_end callback, and
+call into the iomap code with yet another filesystem specific
+callback to punch the delalloc extent within the defined ranges.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/iomap/buffered-io.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/xfs/xfs_iomap.c | 47 ++++++--------------------------------
+ include/linux/iomap.h | 4 +++
+ 3 files changed, 72 insertions(+), 39 deletions(-)
+
+--- a/fs/iomap/buffered-io.c
++++ b/fs/iomap/buffered-io.c
+@@ -827,6 +827,66 @@ iomap_file_buffered_write(struct kiocb *
+ }
+ EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
+
++/*
++ * When a short write occurs, the filesystem may need to remove reserved space
++ * that was allocated in ->iomap_begin from it's ->iomap_end method. For
++ * filesystems that use delayed allocation, we need to punch out delalloc
++ * extents from the range that are not dirty in the page cache. As the write can
++ * race with page faults, there can be dirty pages over the delalloc extent
++ * outside the range of a short write but still within the delalloc extent
++ * allocated for this iomap.
++ *
++ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
++ * simplify range iterations, but converts them back to {offset,len} tuples for
++ * the punch callback.
++ */
++int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
++ struct iomap *iomap, loff_t pos, loff_t length,
++ ssize_t written,
++ int (*punch)(struct inode *inode, loff_t pos, loff_t length))
++{
++ loff_t start_byte;
++ loff_t end_byte;
++ int blocksize = i_blocksize(inode);
++ int error = 0;
++
++ if (iomap->type != IOMAP_DELALLOC)
++ return 0;
++
++ /* If we didn't reserve the blocks, we're not allowed to punch them. */
++ if (!(iomap->flags & IOMAP_F_NEW))
++ return 0;
++
++ /*
++ * start_byte refers to the first unused block after a short write. If
++ * nothing was written, round offset down to point at the first block in
++ * the range.
++ */
++ if (unlikely(!written))
++ start_byte = round_down(pos, blocksize);
++ else
++ start_byte = round_up(pos + written, blocksize);
++ end_byte = round_up(pos + length, blocksize);
++
++ /* Nothing to do if we've written the entire delalloc extent */
++ if (start_byte >= end_byte)
++ return 0;
++
++ /*
++ * Lock the mapping to avoid races with page faults re-instantiating
++ * folios and dirtying them via ->page_mkwrite between the page cache
++ * truncation and the delalloc extent removal. Failing to do this can
++ * leave dirty pages with no space reservation in the cache.
++ */
++ filemap_invalidate_lock(inode->i_mapping);
++ truncate_pagecache_range(inode, start_byte, end_byte - 1);
++ error = punch(inode, start_byte, end_byte - start_byte);
++ filemap_invalidate_unlock(inode->i_mapping);
++
++ return error;
++}
++EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
++
+ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
+ {
+ struct iomap *iomap = &iter->iomap;
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1123,12 +1123,12 @@ out_unlock:
+ static int
+ xfs_buffered_write_delalloc_punch(
+ struct inode *inode,
+- loff_t start_byte,
+- loff_t end_byte)
++ loff_t offset,
++ loff_t length)
+ {
+ struct xfs_mount *mp = XFS_M(inode->i_sb);
+- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
+- xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
++ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
+ return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
+ end_fsb - start_fsb);
+@@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end(
+ unsigned flags,
+ struct iomap *iomap)
+ {
+- struct xfs_mount *mp = XFS_M(inode->i_sb);
+- loff_t start_byte;
+- loff_t end_byte;
+- int error = 0;
+
+- if (iomap->type != IOMAP_DELALLOC)
+- return 0;
++ struct xfs_mount *mp = XFS_M(inode->i_sb);
++ int error;
+
+ /*
+ * Behave as if the write failed if drop writes is enabled. Set the NEW
+@@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end(
+ written = 0;
+ }
+
+- /* If we didn't reserve the blocks, we're not allowed to punch them. */
+- if (!(iomap->flags & IOMAP_F_NEW))
+- return 0;
+-
+- /*
+- * start_fsb refers to the first unused block after a short write. If
+- * nothing was written, round offset down to point at the first block in
+- * the range.
+- */
+- if (unlikely(!written))
+- start_byte = round_down(offset, mp->m_sb.sb_blocksize);
+- else
+- start_byte = round_up(offset + written, mp->m_sb.sb_blocksize);
+- end_byte = round_up(offset + length, mp->m_sb.sb_blocksize);
+-
+- /* Nothing to do if we've written the entire delalloc extent */
+- if (start_byte >= end_byte)
+- return 0;
+-
+- /*
+- * Lock the mapping to avoid races with page faults re-instantiating
+- * folios and dirtying them via ->page_mkwrite between the page cache
+- * truncation and the delalloc extent removal. Failing to do this can
+- * leave dirty pages with no space reservation in the cache.
+- */
+- filemap_invalidate_lock(inode->i_mapping);
+- truncate_pagecache_range(inode, start_byte, end_byte - 1);
+- error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte);
+- filemap_invalidate_unlock(inode->i_mapping);
++ error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
++ length, written, &xfs_buffered_write_delalloc_punch);
+ if (error && !xfs_is_shutdown(mp)) {
+ xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
+ __func__, XFS_I(inode)->i_ino);
+--- a/include/linux/iomap.h
++++ b/include/linux/iomap.h
+@@ -226,6 +226,10 @@ static inline const struct iomap *iomap_
+
+ ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
+ const struct iomap_ops *ops);
++int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
++ struct iomap *iomap, loff_t pos, loff_t length, ssize_t written,
++ int (*punch)(struct inode *inode, loff_t pos, loff_t length));
++
+ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
+ void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
+ bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
--- /dev/null
+From stable+bounces-42891-greg=kroah.com@vger.kernel.org Wed May 1 20:41:26 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:50 -0700
+Subject: xfs: punching delalloc extents on write failure is racy
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-2-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 198dd8aedee6a7d2de0dfa739f9a008a938f6848 ]
+
+xfs_buffered_write_iomap_end() has a comment about the safety of
+punching delalloc extents based on holding the IOLOCK_EXCL. This
+comment is wrong, and punching delalloc extents is not race free.
+
+When we punch out a delalloc extent after a write failure in
+xfs_buffered_write_iomap_end(), we punch out the page cache with
+truncate_pagecache_range() before we punch out the delalloc extents.
+At this point, we only hold the IOLOCK_EXCL, so there is nothing
+stopping mmap() write faults racing with this cleanup operation,
+reinstantiating a folio over the range we are about to punch and
+hence requiring the delalloc extent to be kept.
+
+If this race condition is hit, we can end up with a dirty page in
+the page cache that has no delalloc extent or space reservation
+backing it. This leads to bad things happening at writeback time.
+
+To avoid this race condition, we need the page cache truncation to
+be atomic w.r.t. the extent manipulation. We can do this by holding
+the mapping->invalidate_lock exclusively across this operation -
+this will prevent new pages from being inserted into the page cache
+whilst we are removing the pages and the backing extent and space
+reservation.
+
+Taking the mapping->invalidate_lock exclusively in the buffered
+write IO path is safe - it naturally nests inside the IOLOCK (see
+truncate and fallocate paths). iomap_zero_range() can be called from
+under the mapping->invalidate_lock (from the truncate path via
+either xfs_zero_eof() or xfs_truncate_page()), but iomap_zero_iter()
+will not instantiate new delalloc pages (because it skips holes) and
+hence will not ever need to punch out delalloc extents on failure.
+
+Fix the locking issue, and clean up the code logic a little to avoid
+unnecessary work if we didn't allocate the delalloc extent or wrote
+the entire region we allocated.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c | 41 +++++++++++++++++++++++------------------
+ 1 file changed, 23 insertions(+), 18 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1147,6 +1147,10 @@ xfs_buffered_write_iomap_end(
+ written = 0;
+ }
+
++ /* If we didn't reserve the blocks, we're not allowed to punch them. */
++ if (!(iomap->flags & IOMAP_F_NEW))
++ return 0;
++
+ /*
+ * start_fsb refers to the first unused block after a short write. If
+ * nothing was written, round offset down to point at the first block in
+@@ -1158,27 +1162,28 @@ xfs_buffered_write_iomap_end(
+ start_fsb = XFS_B_TO_FSB(mp, offset + written);
+ end_fsb = XFS_B_TO_FSB(mp, offset + length);
+
++ /* Nothing to do if we've written the entire delalloc extent */
++ if (start_fsb >= end_fsb)
++ return 0;
++
+ /*
+- * Trim delalloc blocks if they were allocated by this write and we
+- * didn't manage to write the whole range.
+- *
+- * We don't need to care about racing delalloc as we hold i_mutex
+- * across the reserve/allocate/unreserve calls. If there are delalloc
+- * blocks in the range, they are ours.
++ * Lock the mapping to avoid races with page faults re-instantiating
++ * folios and dirtying them via ->page_mkwrite between the page cache
++ * truncation and the delalloc extent removal. Failing to do this can
++ * leave dirty pages with no space reservation in the cache.
+ */
+- if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) {
+- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+- XFS_FSB_TO_B(mp, end_fsb) - 1);
+-
+- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+- end_fsb - start_fsb);
+- if (error && !xfs_is_shutdown(mp)) {
+- xfs_alert(mp, "%s: unable to clean up ino %lld",
+- __func__, ip->i_ino);
+- return error;
+- }
+- }
++ filemap_invalidate_lock(inode->i_mapping);
++ truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
++ XFS_FSB_TO_B(mp, end_fsb) - 1);
+
++ error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
++ end_fsb - start_fsb);
++ filemap_invalidate_unlock(inode->i_mapping);
++ if (error && !xfs_is_shutdown(mp)) {
++ xfs_alert(mp, "%s: unable to clean up ino %lld",
++ __func__, ip->i_ino);
++ return error;
++ }
+ return 0;
+ }
+
--- /dev/null
+From stable+bounces-42913-greg=kroah.com@vger.kernel.org Wed May 1 20:42:22 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:12 -0700
+Subject: xfs: short circuit xfs_growfs_data_private() if delta is zero
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Eric Sandeen <sandeen@redhat.com>, "Darrick J. Wong" <djwong@kernel.org>, Chandan Babu R <chandanbabu@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-24-leah.rumancik@gmail.com>
+
+From: Eric Sandeen <sandeen@redhat.com>
+
+[ Upstream commit 84712492e6dab803bf595fb8494d11098b74a652 ]
+
+Although xfs_growfs_data() doesn't call xfs_growfs_data_private()
+if in->newblocks == mp->m_sb.sb_dblocks, xfs_growfs_data_private()
+further massages the new block count so that we don't, e.g., try
+to create a too-small new AG.
+
+This may lead to a delta of "0" in xfs_growfs_data_private(), so
+we end up in the shrink case and emit the EXPERIMENTAL warning
+even if we're not changing anything at all.
+
+Fix this by returning straightaway if the block delta is zero.
+
+(nb: in older kernels, the result of entering the shrink case
+with delta == 0 may actually let an -ENOSPC escape to userspace,
+which is confusing for users.)
+
+Fixes: fb2fc1720185 ("xfs: support shrinking unused space in the last AG")
+Signed-off-by: Eric Sandeen <sandeen@redhat.com>
+Reviewed-by: "Darrick J. Wong" <djwong@kernel.org>
+Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_fsops.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/fs/xfs/xfs_fsops.c
++++ b/fs/xfs/xfs_fsops.c
+@@ -129,6 +129,10 @@ xfs_growfs_data_private(
+ if (delta < 0 && nagcount < 2)
+ return -EINVAL;
+
++ /* No work to do */
++ if (delta == 0)
++ return 0;
++
+ oagcount = mp->m_sb.sb_agcount;
+ /* allocate the new per-ag structures */
+ if (nagcount > oagcount) {
--- /dev/null
+From stable+bounces-42892-greg=kroah.com@vger.kernel.org Wed May 1 20:41:29 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:51 -0700
+Subject: xfs: use byte ranges for write cleanup ranges
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-3-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit b71f889c18ada210a97aa3eb5e00c0de552234c6 ]
+
+xfs_buffered_write_iomap_end() currently converts the byte ranges
+passed to it to filesystem blocks to pass them to the bmap code to
+punch out delalloc blocks, but then has to convert filesystem
+blocks back to byte ranges for page cache truncate.
+
+We're about to make the page cache truncate go away and replace it
+with a page cache walk, so having to convert everything to/from/to
+filesystem blocks is messy and error-prone. It is much easier to
+pass around byte ranges and convert to page indexes and/or
+filesystem blocks only where those units are needed.
+
+In preparation for the page cache walk being added, add a helper
+that converts byte ranges to filesystem blocks and calls
+xfs_bmap_punch_delalloc_range() and convert
+xfs_buffered_write_iomap_end() to calculate limits in byte ranges.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_iomap.c | 40 +++++++++++++++++++++++++---------------
+ 1 file changed, 25 insertions(+), 15 deletions(-)
+
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1121,6 +1121,20 @@ out_unlock:
+ }
+
+ static int
++xfs_buffered_write_delalloc_punch(
++ struct inode *inode,
++ loff_t start_byte,
++ loff_t end_byte)
++{
++ struct xfs_mount *mp = XFS_M(inode->i_sb);
++ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
++
++ return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
++ end_fsb - start_fsb);
++}
++
++static int
+ xfs_buffered_write_iomap_end(
+ struct inode *inode,
+ loff_t offset,
+@@ -1129,10 +1143,9 @@ xfs_buffered_write_iomap_end(
+ unsigned flags,
+ struct iomap *iomap)
+ {
+- struct xfs_inode *ip = XFS_I(inode);
+- struct xfs_mount *mp = ip->i_mount;
+- xfs_fileoff_t start_fsb;
+- xfs_fileoff_t end_fsb;
++ struct xfs_mount *mp = XFS_M(inode->i_sb);
++ loff_t start_byte;
++ loff_t end_byte;
+ int error = 0;
+
+ if (iomap->type != IOMAP_DELALLOC)
+@@ -1157,13 +1170,13 @@ xfs_buffered_write_iomap_end(
+ * the range.
+ */
+ if (unlikely(!written))
+- start_fsb = XFS_B_TO_FSBT(mp, offset);
++ start_byte = round_down(offset, mp->m_sb.sb_blocksize);
+ else
+- start_fsb = XFS_B_TO_FSB(mp, offset + written);
+- end_fsb = XFS_B_TO_FSB(mp, offset + length);
++ start_byte = round_up(offset + written, mp->m_sb.sb_blocksize);
++ end_byte = round_up(offset + length, mp->m_sb.sb_blocksize);
+
+ /* Nothing to do if we've written the entire delalloc extent */
+- if (start_fsb >= end_fsb)
++ if (start_byte >= end_byte)
+ return 0;
+
+ /*
+@@ -1173,15 +1186,12 @@ xfs_buffered_write_iomap_end(
+ * leave dirty pages with no space reservation in the cache.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb),
+- XFS_FSB_TO_B(mp, end_fsb) - 1);
+-
+- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+- end_fsb - start_fsb);
++ truncate_pagecache_range(inode, start_byte, end_byte - 1);
++ error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte);
+ filemap_invalidate_unlock(inode->i_mapping);
+ if (error && !xfs_is_shutdown(mp)) {
+- xfs_alert(mp, "%s: unable to clean up ino %lld",
+- __func__, ip->i_ino);
++ xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
++ __func__, XFS_I(inode)->i_ino);
+ return error;
+ }
+ return 0;
--- /dev/null
+From stable+bounces-42899-greg=kroah.com@vger.kernel.org Wed May 1 20:41:46 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:56 -0700
+Subject: xfs: use iomap_valid method to detect stale cached iomaps
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-8-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 304a68b9c63bbfc1f6e159d68e8892fc54a06067 ]
+
+Now that iomap supports a mechanism to validate cached iomaps for
+buffered write operations, hook it up to the XFS buffered write ops
+so that we can avoid data corruptions that result from stale cached
+iomaps. See:
+
+https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/
+
+or the ->iomap_valid() introduction commit for exact details of the
+corruption vector.
+
+The validity cookie we store in the iomap is based on the type of
+iomap we return. It is expected that the iomap->flags we set in
+xfs_bmbt_to_iomap() is not perturbed by the iomap core and are
+returned to us in the iomap passed via the .iomap_valid() callback.
+This ensures that the validity cookie is always checking the correct
+inode fork sequence numbers to detect potential changes that affect
+the extent cached by the iomap.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/libxfs/xfs_bmap.c | 6 +-
+ fs/xfs/xfs_aops.c | 2
+ fs/xfs/xfs_iomap.c | 95 +++++++++++++++++++++++++++++++++++++----------
+ fs/xfs/xfs_iomap.h | 5 +-
+ fs/xfs/xfs_pnfs.c | 6 +-
+ 5 files changed, 87 insertions(+), 27 deletions(-)
+
+--- a/fs/xfs/libxfs/xfs_bmap.c
++++ b/fs/xfs/libxfs/xfs_bmap.c
+@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc(
+ * the extent. Just return the real extent at this offset.
+ */
+ if (!isnullstartblock(bma.got.br_startblock)) {
+- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
++ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
++ xfs_iomap_inode_sequence(ip, flags));
+ *seq = READ_ONCE(ifp->if_seq);
+ goto out_trans_cancel;
+ }
+@@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc(
+ XFS_STATS_INC(mp, xs_xstrat_quick);
+
+ ASSERT(!isnullstartblock(bma.got.br_startblock));
+- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags);
++ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
++ xfs_iomap_inode_sequence(ip, flags));
+ *seq = READ_ONCE(ifp->if_seq);
+
+ if (whichfork == XFS_COW_FORK)
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -372,7 +372,7 @@ retry:
+ isnullstartblock(imap.br_startblock))
+ goto allocate_blocks;
+
+- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
++ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq);
+ trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
+ return 0;
+ allocate_blocks:
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -48,13 +48,45 @@ xfs_alert_fsblock_zero(
+ return -EFSCORRUPTED;
+ }
+
++u64
++xfs_iomap_inode_sequence(
++ struct xfs_inode *ip,
++ u16 iomap_flags)
++{
++ u64 cookie = 0;
++
++ if (iomap_flags & IOMAP_F_XATTR)
++ return READ_ONCE(ip->i_af.if_seq);
++ if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
++ cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
++ return cookie | READ_ONCE(ip->i_df.if_seq);
++}
++
++/*
++ * Check that the iomap passed to us is still valid for the given offset and
++ * length.
++ */
++static bool
++xfs_iomap_valid(
++ struct inode *inode,
++ const struct iomap *iomap)
++{
++ return iomap->validity_cookie ==
++ xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags);
++}
++
++const struct iomap_page_ops xfs_iomap_page_ops = {
++ .iomap_valid = xfs_iomap_valid,
++};
++
+ int
+ xfs_bmbt_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ struct xfs_bmbt_irec *imap,
+ unsigned int mapping_flags,
+- u16 iomap_flags)
++ u16 iomap_flags,
++ u64 sequence_cookie)
+ {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buftarg *target = xfs_inode_buftarg(ip);
+@@ -91,6 +123,9 @@ xfs_bmbt_to_iomap(
+ if (xfs_ipincount(ip) &&
+ (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ iomap->flags |= IOMAP_F_DIRTY;
++
++ iomap->validity_cookie = sequence_cookie;
++ iomap->page_ops = &xfs_iomap_page_ops;
+ return 0;
+ }
+
+@@ -195,7 +230,8 @@ xfs_iomap_write_direct(
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb,
+ unsigned int flags,
+- struct xfs_bmbt_irec *imap)
++ struct xfs_bmbt_irec *imap,
++ u64 *seq)
+ {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_trans *tp;
+@@ -285,6 +321,7 @@ xfs_iomap_write_direct(
+ error = xfs_alert_fsblock_zero(ip, imap);
+
+ out_unlock:
++ *seq = xfs_iomap_inode_sequence(ip, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return error;
+
+@@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin(
+ bool shared = false;
+ u16 iomap_flags = 0;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
++ u64 seq;
+
+ ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
+
+@@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin(
+ goto out_unlock;
+ }
+
++ seq = xfs_iomap_inode_sequence(ip, iomap_flags);
+ xfs_iunlock(ip, lockmode);
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags);
++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq);
+
+ allocate_blocks:
+ error = -EAGAIN;
+@@ -839,24 +878,26 @@ allocate_blocks:
+ xfs_iunlock(ip, lockmode);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb,
+- flags, &imap);
++ flags, &imap, &seq);
+ if (error)
+ return error;
+
+ trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+- iomap_flags | IOMAP_F_NEW);
++ iomap_flags | IOMAP_F_NEW, seq);
+
+ out_found_cow:
+- xfs_iunlock(ip, lockmode);
+ length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount);
+ trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap);
+ if (imap.br_startblock != HOLESTARTBLOCK) {
+- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
++ seq = xfs_iomap_inode_sequence(ip, 0);
++ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+ if (error)
+- return error;
++ goto out_unlock;
+ }
+- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED);
++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
++ xfs_iunlock(ip, lockmode);
++ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq);
+
+ out_unlock:
+ if (lockmode)
+@@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin(
+ int allocfork = XFS_DATA_FORK;
+ int error = 0;
+ unsigned int lockmode = XFS_ILOCK_EXCL;
++ u64 seq;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+@@ -1094,26 +1136,31 @@ retry:
+ * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
+ * them out if the write happens to fail.
+ */
++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
+- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW);
++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
+
+ found_imap:
++ seq = xfs_iomap_inode_sequence(ip, 0);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+
+ found_cow:
+- xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ seq = xfs_iomap_inode_sequence(ip, 0);
+ if (imap.br_startoff <= offset_fsb) {
+- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0);
++ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq);
+ if (error)
+- return error;
++ goto out_unlock;
++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+- IOMAP_F_SHARED);
++ IOMAP_F_SHARED, seq);
+ }
+
+ xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0);
++ xfs_iunlock(ip, XFS_ILOCK_EXCL);
++ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
+
+ out_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+@@ -1193,6 +1240,7 @@ xfs_read_iomap_begin(
+ int nimaps = 1, error = 0;
+ bool shared = false;
+ unsigned int lockmode = XFS_ILOCK_SHARED;
++ u64 seq;
+
+ ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
+
+@@ -1206,13 +1254,14 @@ xfs_read_iomap_begin(
+ &nimaps, 0);
+ if (!error && (flags & IOMAP_REPORT))
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
++ seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0);
+ xfs_iunlock(ip, lockmode);
+
+ if (error)
+ return error;
+ trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap);
+ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags,
+- shared ? IOMAP_F_SHARED : 0);
++ shared ? IOMAP_F_SHARED : 0, seq);
+ }
+
+ const struct iomap_ops xfs_read_iomap_ops = {
+@@ -1237,6 +1286,7 @@ xfs_seek_iomap_begin(
+ struct xfs_bmbt_irec imap, cmap;
+ int error = 0;
+ unsigned lockmode;
++ u64 seq;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+@@ -1271,8 +1321,9 @@ xfs_seek_iomap_begin(
+ if (data_fsb < cow_fsb + cmap.br_blockcount)
+ end_fsb = min(end_fsb, data_fsb);
+ xfs_trim_extent(&cmap, offset_fsb, end_fsb);
++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
+ error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
+- IOMAP_F_SHARED);
++ IOMAP_F_SHARED, seq);
+ /*
+ * This is a COW extent, so we must probe the page cache
+ * because there could be dirty page cache being backed
+@@ -1293,8 +1344,9 @@ xfs_seek_iomap_begin(
+ imap.br_startblock = HOLESTARTBLOCK;
+ imap.br_state = XFS_EXT_NORM;
+ done:
++ seq = xfs_iomap_inode_sequence(ip, 0);
+ xfs_trim_extent(&imap, offset_fsb, end_fsb);
+- error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
++ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
+ out_unlock:
+ xfs_iunlock(ip, lockmode);
+ return error;
+@@ -1320,6 +1372,7 @@ xfs_xattr_iomap_begin(
+ struct xfs_bmbt_irec imap;
+ int nimaps = 1, error = 0;
+ unsigned lockmode;
++ int seq;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+@@ -1336,12 +1389,14 @@ xfs_xattr_iomap_begin(
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
+ &nimaps, XFS_BMAPI_ATTRFORK);
+ out_unlock:
++
++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR);
+ xfs_iunlock(ip, lockmode);
+
+ if (error)
+ return error;
+ ASSERT(nimaps);
+- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0);
++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq);
+ }
+
+ const struct iomap_ops xfs_xattr_iomap_ops = {
+--- a/fs/xfs/xfs_iomap.h
++++ b/fs/xfs/xfs_iomap.h
+@@ -13,14 +13,15 @@ struct xfs_bmbt_irec;
+
+ int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t count_fsb, unsigned int flags,
+- struct xfs_bmbt_irec *imap);
++ struct xfs_bmbt_irec *imap, u64 *sequence);
+ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool);
+ xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip,
+ xfs_fileoff_t end_fsb);
+
++u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags);
+ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap,
+ struct xfs_bmbt_irec *imap, unsigned int mapping_flags,
+- u16 iomap_flags);
++ u16 iomap_flags, u64 sequence_cookie);
+
+ int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len,
+ bool *did_zero);
+--- a/fs/xfs/xfs_pnfs.c
++++ b/fs/xfs/xfs_pnfs.c
+@@ -125,6 +125,7 @@ xfs_fs_map_blocks(
+ int nimaps = 1;
+ uint lock_flags;
+ int error = 0;
++ u64 seq;
+
+ if (xfs_is_shutdown(mp))
+ return -EIO;
+@@ -176,6 +177,7 @@ xfs_fs_map_blocks(
+ lock_flags = xfs_ilock_data_map_shared(ip);
+ error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+ &imap, &nimaps, bmapi_flags);
++ seq = xfs_iomap_inode_sequence(ip, 0);
+
+ ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
+
+@@ -189,7 +191,7 @@ xfs_fs_map_blocks(
+ xfs_iunlock(ip, lock_flags);
+
+ error = xfs_iomap_write_direct(ip, offset_fsb,
+- end_fsb - offset_fsb, 0, &imap);
++ end_fsb - offset_fsb, 0, &imap, &seq);
+ if (error)
+ goto out_unlock;
+
+@@ -209,7 +211,7 @@ xfs_fs_map_blocks(
+ }
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0);
++ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq);
+ *device_generation = mp->m_generation;
+ return error;
+ out_unlock:
--- /dev/null
+From stable+bounces-42905-greg=kroah.com@vger.kernel.org Wed May 1 20:42:02 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:41:04 -0700
+Subject: xfs: wait iclog complete before tearing down AIL
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Guo Xuenan <guoxuenan@huawei.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-16-leah.rumancik@gmail.com>
+
+From: Guo Xuenan <guoxuenan@huawei.com>
+
+[ Upstream commit 1eb52a6a71981b80f9acbd915acd6a05a5037196 ]
+
+Fix uaf in xfs_trans_ail_delete during xlog force shutdown.
+Commit cd6f79d1fb32 ("xfs: run callbacks before waking waiters in
+xlog_state_shutdown_callbacks") changed the order of running callbacks
+and waiting for iclog completion so that the unmount path does not destroy
+the AIL prematurely. But that does not seem to be enough to ensure this;
+adding an mdelay in `xfs_buf_item_unpin` can prove that.
+
+The reproduction is as follows. To ensure the AIL is destroyed safely,
+we should wait for all xlog ioend workers to finish and sync the AIL.
+
+==================================================================
+BUG: KASAN: use-after-free in xfs_trans_ail_delete+0x240/0x2a0
+Read of size 8 at addr ffff888023169400 by task kworker/1:1H/43
+
+CPU: 1 PID: 43 Comm: kworker/1:1H Tainted: G W
+6.1.0-rc1-00002-gc28266863c4a #137
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
+1.13.0-1ubuntu1.1 04/01/2014
+Workqueue: xfs-log/sda xlog_ioend_work
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x4d/0x66
+ print_report+0x171/0x4a6
+ kasan_report+0xb3/0x130
+ xfs_trans_ail_delete+0x240/0x2a0
+ xfs_buf_item_done+0x7b/0xa0
+ xfs_buf_ioend+0x1e9/0x11f0
+ xfs_buf_item_unpin+0x4c8/0x860
+ xfs_trans_committed_bulk+0x4c2/0x7c0
+ xlog_cil_committed+0xab6/0xfb0
+ xlog_cil_process_committed+0x117/0x1e0
+ xlog_state_shutdown_callbacks+0x208/0x440
+ xlog_force_shutdown+0x1b3/0x3a0
+ xlog_ioend_work+0xef/0x1d0
+ process_one_work+0x6f9/0xf70
+ worker_thread+0x578/0xf30
+ kthread+0x28c/0x330
+ ret_from_fork+0x1f/0x30
+ </TASK>
+
+Allocated by task 9606:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ __kasan_kmalloc+0x7a/0x90
+ __kmalloc+0x59/0x140
+ kmem_alloc+0xb2/0x2f0
+ xfs_trans_ail_init+0x20/0x320
+ xfs_log_mount+0x37e/0x690
+ xfs_mountfs+0xe36/0x1b40
+ xfs_fs_fill_super+0xc5c/0x1a70
+ get_tree_bdev+0x3c5/0x6c0
+ vfs_get_tree+0x85/0x250
+ path_mount+0xec3/0x1830
+ do_mount+0xef/0x110
+ __x64_sys_mount+0x150/0x1f0
+ do_syscall_64+0x35/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+Freed by task 9662:
+ kasan_save_stack+0x1e/0x40
+ kasan_set_track+0x21/0x30
+ kasan_save_free_info+0x2a/0x40
+ __kasan_slab_free+0x105/0x1a0
+ __kmem_cache_free+0x99/0x2d0
+ kvfree+0x3a/0x40
+ xfs_log_unmount+0x60/0xf0
+ xfs_unmountfs+0xf3/0x1d0
+ xfs_fs_put_super+0x78/0x300
+ generic_shutdown_super+0x151/0x400
+ kill_block_super+0x9a/0xe0
+ deactivate_locked_super+0x82/0xe0
+ deactivate_super+0x91/0xb0
+ cleanup_mnt+0x32a/0x4a0
+ task_work_run+0x15f/0x240
+ exit_to_user_mode_prepare+0x188/0x190
+ syscall_exit_to_user_mode+0x12/0x30
+ do_syscall_64+0x42/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+The buggy address belongs to the object at ffff888023169400
+ which belongs to the cache kmalloc-128 of size 128
+The buggy address is located 0 bytes inside of
+ 128-byte region [ffff888023169400, ffff888023169480)
+
+The buggy address belongs to the physical page:
+page:ffffea00008c5a00 refcount:1 mapcount:0 mapping:0000000000000000
+index:0xffff888023168f80 pfn:0x23168
+head:ffffea00008c5a00 order:1 compound_mapcount:0 compound_pincount:0
+flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff)
+raw: 001fffff80010200 ffffea00006b3988 ffffea0000577a88 ffff88800f842ac0
+raw: ffff888023168f80 0000000000150007 00000001ffffffff 0000000000000000
+page dumped because: kasan: bad access detected
+
+Memory state around the buggy address:
+ ffff888023169300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff888023169380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+>ffff888023169400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ^
+ ffff888023169480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff888023169500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+==================================================================
+Disabling lock debugging due to kernel taint
+
+Fixes: cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks")
+Signed-off-by: Guo Xuenan <guoxuenan@huawei.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_log.c | 36 +++++++++++++++++++++++++-----------
+ 1 file changed, 25 insertions(+), 11 deletions(-)
+
+--- a/fs/xfs/xfs_log.c
++++ b/fs/xfs/xfs_log.c
+@@ -887,6 +887,23 @@ xlog_force_iclog(
+ }
+
+ /*
++ * Cycle all the iclogbuf locks to make sure all log IO completion
++ * is done before we tear down these buffers.
++ */
++static void
++xlog_wait_iclog_completion(struct xlog *log)
++{
++ int i;
++ struct xlog_in_core *iclog = log->l_iclog;
++
++ for (i = 0; i < log->l_iclog_bufs; i++) {
++ down(&iclog->ic_sema);
++ up(&iclog->ic_sema);
++ iclog = iclog->ic_next;
++ }
++}
++
++/*
+ * Wait for the iclog and all prior iclogs to be written disk as required by the
+ * log force state machine. Waiting on ic_force_wait ensures iclog completions
+ * have been ordered and callbacks run before we are woken here, hence
+@@ -1111,6 +1128,14 @@ xfs_log_unmount(
+ {
+ xfs_log_clean(mp);
+
++ /*
++ * If shutdown has come from iclog IO context, the log
++ * cleaning will have been skipped and so we need to wait
++ * for the iclog to complete shutdown processing before we
++ * tear anything down.
++ */
++ xlog_wait_iclog_completion(mp->m_log);
++
+ xfs_buftarg_drain(mp->m_ddev_targp);
+
+ xfs_trans_ail_destroy(mp);
+@@ -2114,17 +2139,6 @@ xlog_dealloc_log(
+ int i;
+
+ /*
+- * Cycle all the iclogbuf locks to make sure all log IO completion
+- * is done before we tear down these buffers.
+- */
+- iclog = log->l_iclog;
+- for (i = 0; i < log->l_iclog_bufs; i++) {
+- down(&iclog->ic_sema);
+- up(&iclog->ic_sema);
+- iclog = iclog->ic_next;
+- }
+-
+- /*
+ * Destroy the CIL after waiting for iclog IO completion because an
+ * iclog EIO error will try to shut down the log, which accesses the
+ * CIL to wake up the waiters.
--- /dev/null
+From stable+bounces-42890-greg=kroah.com@vger.kernel.org Wed May 1 20:41:25 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:49 -0700
+Subject: xfs: write page faults in iomap are not buffered writes
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, Christoph Hellwig <hch@lst.de>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-1-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e ]
+
+When we reserve a delalloc region in xfs_buffered_write_iomap_begin,
+we mark the iomap as IOMAP_F_NEW so that the write context
+understands that it allocated the delalloc region.
+
+If we then fail that buffered write, xfs_buffered_write_iomap_end()
+checks for the IOMAP_F_NEW flag and if it is set, it punches out
+the unused delalloc region that was allocated for the write.
+
+The assumption this code makes is that all buffered write operations
+that can allocate space are run under an exclusive lock (i_rwsem).
+This is an invalid assumption: page faults in mmap()d regions call
+through this same function pair to map the file range being faulted
+and this runs only holding the inode->i_mapping->invalidate_lock in
+shared mode.
+
+IOWs, we can have races between page faults and write() calls that
+fail the nested page cache write operation that result in data loss.
+That is, the failing iomap_end call will punch out the data that
+the other racing iomap iteration brought into the page cache. This
+can be reproduced with generic/34[46] if we arbitrarily fail page
+cache copy-in operations from write() syscalls.
+
+Code analysis tells us that the iomap_page_mkwrite() function holds
+the already instantiated and uptodate folio locked across the iomap
+mapping iterations. Hence the folio cannot be removed from memory
+whilst we are mapping the range it covers, and as such we do not
+care if the mapping changes state underneath the iomap iteration
+loop:
+
+1. if the folio is not already dirty, there are no writeback races
+ possible.
+2. if we allocated the mapping (delalloc or unwritten), the folio
+ cannot already be dirty. See #1.
+3. If the folio is already dirty, it must be up to date. As we hold
+ it locked, it cannot be reclaimed from memory. Hence we always
+ have valid data in the page cache while iterating the mapping.
+4. Valid data in the page cache can exist when the underlying
+ mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping
+ change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not
+ change the data in the page - it only affects actions if we are
+ initialising a new page. Hence #3 applies and we don't care
+ about these extent map transitions racing with
+ iomap_page_mkwrite().
+5. iomap_page_mkwrite() checks for page invalidation races
+ (truncate, hole punch, etc) after it locks the folio. We also
+ hold the mapping->invalidate_lock here, and hence the mapping
+ cannot change due to extent removal operations while we are
+ iterating the folio.
+
+As such, filesystems that don't use bufferheads will never fail
+the iomap_folio_mkwrite_iter() operation on the current mapping,
+regardless of whether the iomap should be considered stale.
+
+Further, the range we are asked to iterate is limited to the range
+inside EOF that the folio spans. Hence, for XFS, we will only map
+the exact range we are asked for, and we will only do speculative
+preallocation with delalloc if we are mapping a hole at the EOF
+page. The iterator will consume the entire range of the folio that
+is within EOF, and anything beyond the EOF block cannot be accessed.
+We never need to truncate this post-EOF speculative prealloc away in
+the context of the iomap_page_mkwrite() iterator because if it
+remains unused we'll remove it when the last reference to the inode
+goes away.
+
+Hence we don't actually need an .iomap_end() cleanup/error handling
+path at all for iomap_page_mkwrite() for XFS. This means we can
+separate the page fault processing from the complexity of the
+.iomap_end() processing in the buffered write path. This also means
+that the buffered write path will also be able to take the
+mapping->invalidate_lock as necessary.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_file.c | 2 +-
+ fs/xfs/xfs_iomap.c | 9 +++++++++
+ fs/xfs/xfs_iomap.h | 1 +
+ 3 files changed, 11 insertions(+), 1 deletion(-)
+
+--- a/fs/xfs/xfs_file.c
++++ b/fs/xfs/xfs_file.c
+@@ -1325,7 +1325,7 @@ __xfs_filemap_fault(
+ if (write_fault) {
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ ret = iomap_page_mkwrite(vmf,
+- &xfs_buffered_write_iomap_ops);
++ &xfs_page_mkwrite_iomap_ops);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ } else {
+ ret = filemap_fault(vmf);
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1187,6 +1187,15 @@ const struct iomap_ops xfs_buffered_writ
+ .iomap_end = xfs_buffered_write_iomap_end,
+ };
+
++/*
++ * iomap_page_mkwrite() will never fail in a way that requires delalloc extents
++ * that it allocated to be revoked. Hence we do not need an .iomap_end method
++ * for this operation.
++ */
++const struct iomap_ops xfs_page_mkwrite_iomap_ops = {
++ .iomap_begin = xfs_buffered_write_iomap_begin,
++};
++
+ static int
+ xfs_read_iomap_begin(
+ struct inode *inode,
+--- a/fs/xfs/xfs_iomap.h
++++ b/fs/xfs/xfs_iomap.h
+@@ -47,6 +47,7 @@ xfs_aligned_fsb_count(
+ }
+
+ extern const struct iomap_ops xfs_buffered_write_iomap_ops;
++extern const struct iomap_ops xfs_page_mkwrite_iomap_ops;
+ extern const struct iomap_ops xfs_direct_write_iomap_ops;
+ extern const struct iomap_ops xfs_read_iomap_ops;
+ extern const struct iomap_ops xfs_seek_iomap_ops;
--- /dev/null
+From stable+bounces-42895-greg=kroah.com@vger.kernel.org Wed May 1 20:41:37 2024
+From: Leah Rumancik <leah.rumancik@gmail.com>
+Date: Wed, 1 May 2024 11:40:54 -0700
+Subject: xfs: xfs_bmap_punch_delalloc_range() should take a byte range
+To: stable@vger.kernel.org
+Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner <dchinner@redhat.com>, "Darrick J . Wong" <djwong@kernel.org>, Leah Rumancik <leah.rumancik@gmail.com>
+Message-ID: <20240501184112.3799035-6-leah.rumancik@gmail.com>
+
+From: Dave Chinner <dchinner@redhat.com>
+
+[ Upstream commit 7348b322332d8602a4133f0b861334ea021b134a ]
+
+All the callers of xfs_bmap_punch_delalloc_range() jump through
+hoops to convert a byte range to filesystem blocks before calling
+xfs_bmap_punch_delalloc_range(). Instead, pass the byte range to
+xfs_bmap_punch_delalloc_range() and have it do the conversion to
+filesystem blocks internally.
+
+Signed-off-by: Dave Chinner <dchinner@redhat.com>
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Leah Rumancik <leah.rumancik@gmail.com>
+Acked-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/xfs_aops.c | 16 ++++++----------
+ fs/xfs/xfs_bmap_util.c | 10 ++++++----
+ fs/xfs/xfs_bmap_util.h | 2 +-
+ fs/xfs/xfs_iomap.c | 8 ++------
+ 4 files changed, 15 insertions(+), 21 deletions(-)
+
+--- a/fs/xfs/xfs_aops.c
++++ b/fs/xfs/xfs_aops.c
+@@ -114,9 +114,8 @@ xfs_end_ioend(
+ if (unlikely(error)) {
+ if (ioend->io_flags & IOMAP_F_SHARED) {
+ xfs_reflink_cancel_cow_range(ip, offset, size, true);
+- xfs_bmap_punch_delalloc_range(ip,
+- XFS_B_TO_FSBT(mp, offset),
+- XFS_B_TO_FSB(mp, size));
++ xfs_bmap_punch_delalloc_range(ip, offset,
++ offset + size);
+ }
+ goto done;
+ }
+@@ -455,12 +454,8 @@ xfs_discard_folio(
+ struct folio *folio,
+ loff_t pos)
+ {
+- struct inode *inode = folio->mapping->host;
+- struct xfs_inode *ip = XFS_I(inode);
++ struct xfs_inode *ip = XFS_I(folio->mapping->host);
+ struct xfs_mount *mp = ip->i_mount;
+- size_t offset = offset_in_folio(folio, pos);
+- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos);
+- xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
+ int error;
+
+ if (xfs_is_shutdown(mp))
+@@ -470,8 +465,9 @@ xfs_discard_folio(
+ "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
+ folio, ip->i_ino, pos);
+
+- error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+- i_blocks_per_folio(inode, folio) - pageoff_fsb);
++ error = xfs_bmap_punch_delalloc_range(ip, pos,
++ round_up(pos, folio_size(folio)));
++
+ if (error && !xfs_is_shutdown(mp))
+ xfs_alert(mp, "page discard unable to remove delalloc mapping.");
+ }
+--- a/fs/xfs/xfs_bmap_util.c
++++ b/fs/xfs/xfs_bmap_util.c
+@@ -590,11 +590,13 @@ out_unlock_iolock:
+ int
+ xfs_bmap_punch_delalloc_range(
+ struct xfs_inode *ip,
+- xfs_fileoff_t start_fsb,
+- xfs_fileoff_t length)
++ xfs_off_t start_byte,
++ xfs_off_t end_byte)
+ {
++ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = &ip->i_df;
+- xfs_fileoff_t end_fsb = start_fsb + length;
++ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte);
++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte);
+ struct xfs_bmbt_irec got, del;
+ struct xfs_iext_cursor icur;
+ int error = 0;
+@@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range(
+
+ while (got.br_startoff + got.br_blockcount > start_fsb) {
+ del = got;
+- xfs_trim_extent(&del, start_fsb, length);
++ xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb);
+
+ /*
+ * A delete can push the cursor forward. Step back to the
+--- a/fs/xfs/xfs_bmap_util.h
++++ b/fs/xfs/xfs_bmap_util.h
+@@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap
+ #endif /* CONFIG_XFS_RT */
+
+ int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip,
+- xfs_fileoff_t start_fsb, xfs_fileoff_t length);
++ xfs_off_t start_byte, xfs_off_t end_byte);
+
+ struct kgetbmap {
+ __s64 bmv_offset; /* file offset of segment in blocks */
+--- a/fs/xfs/xfs_iomap.c
++++ b/fs/xfs/xfs_iomap.c
+@@ -1126,12 +1126,8 @@ xfs_buffered_write_delalloc_punch(
+ loff_t offset,
+ loff_t length)
+ {
+- struct xfs_mount *mp = XFS_M(inode->i_sb);
+- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset);
+- xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length);
+-
+- return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb,
+- end_fsb - start_fsb);
++ return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset,
++ offset + length);
+ }
+
+ static int