From d438c60c8dbe85e41f969b08e190b202c5ca1dda Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 23 May 2024 13:19:00 +0200 Subject: [PATCH] 6.1-stable patches added patches: iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch iomap-write-iomap-validity-checks.patch keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch mmc-core-add-hs400-tuning-in-hs400es-initialization.patch xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch xfs-drop-write-error-injection-is-unfixable-remove-it.patch xfs-estimate-post-merge-refcounts-correctly.patch xfs-fix-incorrect-error-out-in-xfs_remove.patch xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch xfs-fix-off-by-one-block-in-xfs_discard_folio.patch xfs-fix-sb-write-verify-for-lazysbcount.patch xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch xfs-get-root-inode-correctly-at-bulkstat.patch xfs-hoist-refcount-record-merge-predicates.patch xfs-invalidate-block-device-page-cache-during-unmount.patch xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch xfs-iomap-move-delalloc-punching-to-iomap.patch xfs-punching-delalloc-extents-on-write-failure-is-racy.patch xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch xfs-use-byte-ranges-for-write-cleanup-ranges.patch xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch xfs-wait-iclog-complete-before-tearing-down-ail.patch xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch --- ...e-should-not-truncate-the-page-cache.patch | 295 +++++++++++++ .../iomap-write-iomap-validity-checks.patch | 263 ++++++++++++ ...d-fix-memory-leak-in-tpm2_key_encode.patch | 76 ++++ ...400-tuning-in-hs400es-initialization.patch | 40 ++ queue-6.1/series | 26 ++ ...ation-during-a-ro-mount-log-recovery.patch | 64 +++ ...efore-reading-data-cow-fork-mappings.patch | 158 +++++++ ...ror-injection-is-unfixable-remove-it.patch | 212 ++++++++++ ...imate-post-merge-refcounts-correctly.patch | 114 ++++++ ...ix-incorrect-error-out-in-xfs_remove.patch | 36 ++ ...rrect-i_nlink-caused-by-inode-racing.patch | 81 ++++ ...y-when-unknown-rocompat-bits-are-set.patch | 95 +++++ ...ff-by-one-block-in-xfs_discard_folio.patch | 83 ++++ ...-fix-sb-write-verify-for-lazysbcount.patch | 152 +++++++ ...f-log-item-uaf-during-force-shutdown.patch | 147 +++++++ ...get-root-inode-correctly-at-bulkstat.patch | 49 +++ ...ist-refcount-record-merge-predicates.patch | 188 +++++++++ ...ock-device-page-cache-during-unmount.patch | 71 ++++ ...xfs_bufs-when-allocating-cow-extents.patch | 55 +++ ...omap-move-delalloc-punching-to-iomap.patch | 188 +++++++++ ...loc-extents-on-write-failure-is-racy.patch | 117 ++++++ ...growfs_data_private-if-delta-is-zero.patch | 51 +++ ...byte-ranges-for-write-cleanup-ranges.patch | 112 +++++ ...method-to-detect-stale-cached-iomaps.patch | 387 ++++++++++++++++++ ...log-complete-before-tearing-down-ail.patch | 185 +++++++++ ...lts-in-iomap-are-not-buffered-writes.patch | 134 ++++++ ...alloc_range-should-take-a-byte-range.patch | 126 ++++++ 27 files changed, 3505 insertions(+) create mode 100644 queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch create mode 100644 queue-6.1/iomap-write-iomap-validity-checks.patch create mode 100644 queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch create mode 100644 
queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch create mode 100644 queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch create mode 100644 queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch create mode 100644 queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch create mode 100644 queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch create mode 100644 queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch create mode 100644 queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch create mode 100644 queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch create mode 100644 queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch create mode 100644 queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch create mode 100644 queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch create mode 100644 queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch create mode 100644 queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch create mode 100644 queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch create mode 100644 queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch create mode 100644 queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch create mode 100644 queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch create mode 100644 queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch create mode 100644 queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch create mode 100644 queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch create mode 100644 queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch create mode 100644 queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch create mode 100644 queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch diff --git a/queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch b/queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch new file mode 100644 index 00000000000..6a988176d2b --- /dev/null +++ b/queue-6.1/iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch @@ -0,0 +1,295 @@ +From stable+bounces-42894-greg=kroah.com@vger.kernel.org Wed May 1 20:41:36 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:53 -0700 +Subject: iomap: buffered write failure should not truncate the page cache +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-5-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit f43dc4dc3eff028b5ddddd99f3a66c5a6bdd4e78 ] + +iomap_file_buffered_write_punch_delalloc() currently invalidates the +page cache over the unused range of the delalloc extent that was +allocated. While the write allocated the delalloc extent, it does +not own it exclusively as the write does not hold any locks that +prevent either writeback or mmap page faults from changing the state +of either the page cache or the extent state backing this range. + +Whilst xfs_bmap_punch_delalloc_range() already handles races in +extent conversion - it will only punch out delalloc extents and it +ignores any other type of extent - the page cache truncate does not +discriminate between data written by this write or some other task. 
+As a result, truncating the page cache can result in data corruption +if the write races with mmap modifications to the file over the same +range. + +generic/346 exercises this workload, and if we randomly fail writes +(as will happen when iomap gets stale iomap detection later in the +patchset), it will randomly corrupt the file data because it removes +data written by mmap() in the same page as the write() that failed. + +Hence we do not want to punch out the page cache over the range of +the extent we failed to write to - what we actually need to do is +detect the ranges that have dirty data in cache over them and *not +punch them out*. + +To do this, we have to walk the page cache over the range of the +delalloc extent we want to remove. This is made complex by the fact +we have to handle partially up-to-date folios correctly and this can +happen even when the FSB size == PAGE_SIZE because we now support +multi-page folios in the page cache. + +Because we are only interested in discovering the edges of data +ranges in the page cache (i.e. hole-data boundaries) we can make use +of mapping_seek_hole_data() to find those transitions in the page +cache. As we hold the invalidate_lock, we know that the boundaries +are not going to change while we walk the range. This interface is +also byte-based and is sub-page block aware, so we can find the data +ranges in the cache based on byte offsets rather than page, folio or +fs block sized chunks. This greatly simplifies the logic of finding +dirty cached ranges in the page cache. + +Once we've identified a range that contains cached data, we can then +iterate the range folio by folio. This allows us to determine if the +data is dirty and hence perform the correct delalloc extent punching +operations. The seek interface we use to iterate data ranges will +give us sub-folio start/end granularity, so we may end up looking up +the same folio multiple times as the seek interface iterates across +each discontiguous data region in the folio. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/iomap/buffered-io.c | 195 +++++++++++++++++++++++++++++++++++++++++++++---- + 1 file changed, 180 insertions(+), 15 deletions(-) + +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -828,6 +828,165 @@ iomap_file_buffered_write(struct kiocb * + EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + + /* ++ * Scan the data range passed to us for dirty page cache folios. If we find a ++ * dirty folio, punch out the preceeding range and update the offset from which ++ * the next punch will start from. ++ * ++ * We can punch out storage reservations under clean pages because they either ++ * contain data that has been written back - in which case the delalloc punch ++ * over that range is a no-op - or they have been read faults in which case they ++ * contain zeroes and we can remove the delalloc backing range and any new ++ * writes to those pages will do the normal hole filling operation... ++ * ++ * This makes the logic simple: we only need to keep the delalloc extents only ++ * over the dirty ranges of the page cache. ++ * ++ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to ++ * simplify range iterations. 
++ */ ++static int iomap_write_delalloc_scan(struct inode *inode, ++ loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, ++ int (*punch)(struct inode *inode, loff_t offset, loff_t length)) ++{ ++ while (start_byte < end_byte) { ++ struct folio *folio; ++ ++ /* grab locked page */ ++ folio = filemap_lock_folio(inode->i_mapping, ++ start_byte >> PAGE_SHIFT); ++ if (!folio) { ++ start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) + ++ PAGE_SIZE; ++ continue; ++ } ++ ++ /* if dirty, punch up to offset */ ++ if (folio_test_dirty(folio)) { ++ if (start_byte > *punch_start_byte) { ++ int error; ++ ++ error = punch(inode, *punch_start_byte, ++ start_byte - *punch_start_byte); ++ if (error) { ++ folio_unlock(folio); ++ folio_put(folio); ++ return error; ++ } ++ } ++ ++ /* ++ * Make sure the next punch start is correctly bound to ++ * the end of this data range, not the end of the folio. ++ */ ++ *punch_start_byte = min_t(loff_t, end_byte, ++ folio_next_index(folio) << PAGE_SHIFT); ++ } ++ ++ /* move offset to start of next folio in range */ ++ start_byte = folio_next_index(folio) << PAGE_SHIFT; ++ folio_unlock(folio); ++ folio_put(folio); ++ } ++ return 0; ++} ++ ++/* ++ * Punch out all the delalloc blocks in the range given except for those that ++ * have dirty data still pending in the page cache - those are going to be ++ * written and so must still retain the delalloc backing for writeback. ++ * ++ * As we are scanning the page cache for data, we don't need to reimplement the ++ * wheel - mapping_seek_hole_data() does exactly what we need to identify the ++ * start and end of data ranges correctly even for sub-folio block sizes. This ++ * byte range based iteration is especially convenient because it means we ++ * don't have to care about variable size folios, nor where the start or end of ++ * the data range lies within a folio, if they lie within the same folio or even ++ * if there are multiple discontiguous data ranges within the folio. ++ * ++ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so ++ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault ++ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to ++ * date. A write page fault can then mark it dirty. If we then fail a write() ++ * beyond EOF into that up to date cached range, we allocate a delalloc block ++ * beyond EOF and then have to punch it out. Because the range is up to date, ++ * mapping_seek_hole_data() will return it, and we will skip the punch because ++ * the folio is dirty. THis is incorrect - we always need to punch out delalloc ++ * beyond EOF in this case as writeback will never write back and covert that ++ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF, ++ * resulting in always punching out the range from the EOF to the end of the ++ * range the iomap spans. ++ * ++ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it ++ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA ++ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte) ++ * returns the end of the data range (data_end). Using closed intervals would ++ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose ++ * the code to subtle off-by-one bugs.... 
++ */ ++static int iomap_write_delalloc_release(struct inode *inode, ++ loff_t start_byte, loff_t end_byte, ++ int (*punch)(struct inode *inode, loff_t pos, loff_t length)) ++{ ++ loff_t punch_start_byte = start_byte; ++ loff_t scan_end_byte = min(i_size_read(inode), end_byte); ++ int error = 0; ++ ++ /* ++ * Lock the mapping to avoid races with page faults re-instantiating ++ * folios and dirtying them via ->page_mkwrite whilst we walk the ++ * cache and perform delalloc extent removal. Failing to do this can ++ * leave dirty pages with no space reservation in the cache. ++ */ ++ filemap_invalidate_lock(inode->i_mapping); ++ while (start_byte < scan_end_byte) { ++ loff_t data_end; ++ ++ start_byte = mapping_seek_hole_data(inode->i_mapping, ++ start_byte, scan_end_byte, SEEK_DATA); ++ /* ++ * If there is no more data to scan, all that is left is to ++ * punch out the remaining range. ++ */ ++ if (start_byte == -ENXIO || start_byte == scan_end_byte) ++ break; ++ if (start_byte < 0) { ++ error = start_byte; ++ goto out_unlock; ++ } ++ WARN_ON_ONCE(start_byte < punch_start_byte); ++ WARN_ON_ONCE(start_byte > scan_end_byte); ++ ++ /* ++ * We find the end of this contiguous cached data range by ++ * seeking from start_byte to the beginning of the next hole. ++ */ ++ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, ++ scan_end_byte, SEEK_HOLE); ++ if (data_end < 0) { ++ error = data_end; ++ goto out_unlock; ++ } ++ WARN_ON_ONCE(data_end <= start_byte); ++ WARN_ON_ONCE(data_end > scan_end_byte); ++ ++ error = iomap_write_delalloc_scan(inode, &punch_start_byte, ++ start_byte, data_end, punch); ++ if (error) ++ goto out_unlock; ++ ++ /* The next data search starts at the end of this one. */ ++ start_byte = data_end; ++ } ++ ++ if (punch_start_byte < end_byte) ++ error = punch(inode, punch_start_byte, ++ end_byte - punch_start_byte); ++out_unlock: ++ filemap_invalidate_unlock(inode->i_mapping); ++ return error; ++} ++ ++/* + * When a short write occurs, the filesystem may need to remove reserved space + * that was allocated in ->iomap_begin from it's ->iomap_end method. For + * filesystems that use delayed allocation, we need to punch out delalloc +@@ -837,8 +996,25 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_wr + * allocated for this iomap. + * + * This function uses [start_byte, end_byte) intervals (i.e. open ended) to +- * simplify range iterations, but converts them back to {offset,len} tuples for +- * the punch callback. ++ * simplify range iterations. ++ * ++ * The punch() callback *must* only punch delalloc extents in the range passed ++ * to it. It must skip over all other types of extents in the range and leave ++ * them completely unchanged. It must do this punch atomically with respect to ++ * other extent modifications. ++ * ++ * The punch() callback may be called with a folio locked to prevent writeback ++ * extent allocation racing at the edge of the range we are currently punching. ++ * The locked folio may or may not cover the range being punched, so it is not ++ * safe for the punch() callback to lock folios itself. 
++ * ++ * Lock order is: ++ * ++ * inode->i_rwsem (shared or exclusive) ++ * inode->i_mapping->invalidate_lock (exclusive) ++ * folio_lock() ++ * ->punch ++ * internal filesystem allocation lock + */ + int iomap_file_buffered_write_punch_delalloc(struct inode *inode, + struct iomap *iomap, loff_t pos, loff_t length, +@@ -848,7 +1024,6 @@ int iomap_file_buffered_write_punch_dela + loff_t start_byte; + loff_t end_byte; + int blocksize = i_blocksize(inode); +- int error = 0; + + if (iomap->type != IOMAP_DELALLOC) + return 0; +@@ -872,18 +1047,8 @@ int iomap_file_buffered_write_punch_dela + if (start_byte >= end_byte) + return 0; + +- /* +- * Lock the mapping to avoid races with page faults re-instantiating +- * folios and dirtying them via ->page_mkwrite between the page cache +- * truncation and the delalloc extent removal. Failing to do this can +- * leave dirty pages with no space reservation in the cache. +- */ +- filemap_invalidate_lock(inode->i_mapping); +- truncate_pagecache_range(inode, start_byte, end_byte - 1); +- error = punch(inode, start_byte, end_byte - start_byte); +- filemap_invalidate_unlock(inode->i_mapping); +- +- return error; ++ return iomap_write_delalloc_release(inode, start_byte, end_byte, ++ punch); + } + EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); + diff --git a/queue-6.1/iomap-write-iomap-validity-checks.patch b/queue-6.1/iomap-write-iomap-validity-checks.patch new file mode 100644 index 00000000000..63324d61f16 --- /dev/null +++ b/queue-6.1/iomap-write-iomap-validity-checks.patch @@ -0,0 +1,263 @@ +From stable+bounces-42896-greg=kroah.com@vger.kernel.org Wed May 1 20:41:39 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:55 -0700 +Subject: iomap: write iomap validity checks +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , Christoph Hellwig , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-7-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit d7b64041164ca177170191d2ad775da074ab2926 ] + +A recent multithreaded write data corruption has been uncovered in +the iomap write code. The core of the problem is partial folio +writes can be flushed to disk while a new racing write can map it +and fill the rest of the page: + +writeback new write + +allocate blocks + blocks are unwritten +submit IO +..... + map blocks + iomap indicates UNWRITTEN range + loop { + lock folio + copyin data +..... +IO completes + runs unwritten extent conv + blocks are marked written + + get next folio + } + +Now add memory pressure such that memory reclaim evicts the +partially written folio that has already been written to disk. + +When the new write finally gets to the last partial page of the new +write, it does not find it in cache, so it instantiates a new page, +sees the iomap is unwritten, and zeros the part of the page that +it does not have data from. This overwrites the data on disk that +was originally written. + +The full description of the corruption mechanism can be found here: + +https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ + +To solve this problem, we need to check whether the iomap is still +valid after we lock each folio during the write. We have to do it +after we lock the page so that we don't end up with state changes +occurring while we wait for the folio to be locked. 
+ +Hence we need a mechanism to be able to check that the cached iomap +is still valid (similar to what we already do in buffered +writeback), and we need a way for ->begin_write to back out and +tell the high level iomap iterator that we need to remap the +remaining write range. + +The iomap needs to grow some storage for the validity cookie that +the filesystem provides to travel with the iomap. XFS, in +particular, also needs to know some more information about what the +iomap maps (attribute extents rather than file data extents) to for +the validity cookie to cover all the types of iomaps we might need +to validate. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/iomap/buffered-io.c | 29 ++++++++++++++++++++++++++++- + fs/iomap/iter.c | 19 ++++++++++++++++++- + include/linux/iomap.h | 43 +++++++++++++++++++++++++++++++++++-------- + 3 files changed, 81 insertions(+), 10 deletions(-) + +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -579,7 +579,7 @@ static int iomap_write_begin_inline(cons + return iomap_read_inline_data(iter, folio); + } + +-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ++static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, + size_t len, struct folio **foliop) + { + const struct iomap_page_ops *page_ops = iter->iomap.page_ops; +@@ -613,6 +613,27 @@ static int iomap_write_begin(const struc + status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM; + goto out_no_page; + } ++ ++ /* ++ * Now we have a locked folio, before we do anything with it we need to ++ * check that the iomap we have cached is not stale. The inode extent ++ * mapping can change due to concurrent IO in flight (e.g. ++ * IOMAP_UNWRITTEN state can change and memory reclaim could have ++ * reclaimed a previously partially written page at this index after IO ++ * completion before this write reaches this file offset) and hence we ++ * could do the wrong thing here (zero a page range incorrectly or fail ++ * to zero) and corrupt data. ++ */ ++ if (page_ops && page_ops->iomap_valid) { ++ bool iomap_valid = page_ops->iomap_valid(iter->inode, ++ &iter->iomap); ++ if (!iomap_valid) { ++ iter->iomap.flags |= IOMAP_F_STALE; ++ status = 0; ++ goto out_unlock; ++ } ++ } ++ + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + +@@ -768,6 +789,8 @@ again: + status = iomap_write_begin(iter, pos, bytes, &folio); + if (unlikely(status)) + break; ++ if (iter->iomap.flags & IOMAP_F_STALE) ++ break; + + page = folio_file_page(folio, pos >> PAGE_SHIFT); + if (mapping_writably_mapped(mapping)) +@@ -1076,6 +1099,8 @@ static loff_t iomap_unshare_iter(struct + status = iomap_write_begin(iter, pos, bytes, &folio); + if (unlikely(status)) + return status; ++ if (iter->iomap.flags & IOMAP_F_STALE) ++ break; + + status = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(status == 0)) +@@ -1131,6 +1156,8 @@ static loff_t iomap_zero_iter(struct iom + status = iomap_write_begin(iter, pos, bytes, &folio); + if (status) + return status; ++ if (iter->iomap.flags & IOMAP_F_STALE) ++ break; + + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) +--- a/fs/iomap/iter.c ++++ b/fs/iomap/iter.c +@@ -7,12 +7,28 @@ + #include + #include "trace.h" + ++/* ++ * Advance to the next range we need to map. 
++ * ++ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully ++ * processed - it was aborted because the extent the iomap spanned may have been ++ * changed during the operation. In this case, the iteration behaviour is to ++ * remap the unprocessed range of the iter, and that means we may need to remap ++ * even when we've made no progress (i.e. iter->processed = 0). Hence the ++ * "finished iterating" case needs to distinguish between ++ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we ++ * need to remap the entire remaining range. ++ */ + static inline int iomap_iter_advance(struct iomap_iter *iter) + { ++ bool stale = iter->iomap.flags & IOMAP_F_STALE; ++ + /* handle the previous iteration (if any) */ + if (iter->iomap.length) { +- if (iter->processed <= 0) ++ if (iter->processed < 0) + return iter->processed; ++ if (!iter->processed && !stale) ++ return 0; + if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) + return -EIO; + iter->pos += iter->processed; +@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struc + WARN_ON_ONCE(iter->iomap.offset > iter->pos); + WARN_ON_ONCE(iter->iomap.length == 0); + WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); ++ WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); + if (iter->srcmap.type != IOMAP_HOLE) +--- a/include/linux/iomap.h ++++ b/include/linux/iomap.h +@@ -49,26 +49,35 @@ struct vm_fault; + * + * IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of + * buffer heads for this mapping. ++ * ++ * IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent ++ * rather than a file data extent. + */ +-#define IOMAP_F_NEW 0x01 +-#define IOMAP_F_DIRTY 0x02 +-#define IOMAP_F_SHARED 0x04 +-#define IOMAP_F_MERGED 0x08 +-#define IOMAP_F_BUFFER_HEAD 0x10 +-#define IOMAP_F_ZONE_APPEND 0x20 ++#define IOMAP_F_NEW (1U << 0) ++#define IOMAP_F_DIRTY (1U << 1) ++#define IOMAP_F_SHARED (1U << 2) ++#define IOMAP_F_MERGED (1U << 3) ++#define IOMAP_F_BUFFER_HEAD (1U << 4) ++#define IOMAP_F_ZONE_APPEND (1U << 5) ++#define IOMAP_F_XATTR (1U << 6) + + /* + * Flags set by the core iomap code during operations: + * + * IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size + * has changed as the result of this write operation. ++ * ++ * IOMAP_F_STALE indicates that the iomap is not valid any longer and the file ++ * range it covers needs to be remapped by the high level before the operation ++ * can proceed. + */ +-#define IOMAP_F_SIZE_CHANGED 0x100 ++#define IOMAP_F_SIZE_CHANGED (1U << 8) ++#define IOMAP_F_STALE (1U << 9) + + /* + * Flags from 0x1000 up are for file system specific usage: + */ +-#define IOMAP_F_PRIVATE 0x1000 ++#define IOMAP_F_PRIVATE (1U << 12) + + + /* +@@ -89,6 +98,7 @@ struct iomap { + void *inline_data; + void *private; /* filesystem private */ + const struct iomap_page_ops *page_ops; ++ u64 validity_cookie; /* used with .iomap_valid() */ + }; + + static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos) +@@ -128,6 +138,23 @@ struct iomap_page_ops { + int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len); + void (*page_done)(struct inode *inode, loff_t pos, unsigned copied, + struct page *page); ++ ++ /* ++ * Check that the cached iomap still maps correctly to the filesystem's ++ * internal extent map. 
FS internal extent maps can change while iomap ++ * is iterating a cached iomap, so this hook allows iomap to detect that ++ * the iomap needs to be refreshed during a long running write ++ * operation. ++ * ++ * The filesystem can store internal state (e.g. a sequence number) in ++ * iomap->validity_cookie when the iomap is first mapped to be able to ++ * detect changes between mapping time and whenever .iomap_valid() is ++ * called. ++ * ++ * This is called with the folio over the specified file position held ++ * locked by the iomap code. ++ */ ++ bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap); + }; + + /* diff --git a/queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch b/queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch new file mode 100644 index 00000000000..ddd0917991f --- /dev/null +++ b/queue-6.1/keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch @@ -0,0 +1,76 @@ +From ffcaa2172cc1a85ddb8b783de96d38ca8855e248 Mon Sep 17 00:00:00 2001 +From: Jarkko Sakkinen +Date: Mon, 20 May 2024 02:31:53 +0300 +Subject: KEYS: trusted: Fix memory leak in tpm2_key_encode() + +From: Jarkko Sakkinen + +commit ffcaa2172cc1a85ddb8b783de96d38ca8855e248 upstream. + +'scratch' is never freed. Fix this by calling kfree() in the success, and +in the error case. + +Cc: stable@vger.kernel.org # +v5.13 +Fixes: f2219745250f ("security: keys: trusted: use ASN.1 TPM2 key format for the blobs") +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman +--- + security/keys/trusted-keys/trusted_tpm2.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +--- a/security/keys/trusted-keys/trusted_tpm2.c ++++ b/security/keys/trusted-keys/trusted_tpm2.c +@@ -38,6 +38,7 @@ static int tpm2_key_encode(struct truste + u8 *end_work = scratch + SCRATCH_SIZE; + u8 *priv, *pub; + u16 priv_len, pub_len; ++ int ret; + + priv_len = get_unaligned_be16(src) + 2; + priv = src; +@@ -57,8 +58,10 @@ static int tpm2_key_encode(struct truste + unsigned char bool[3], *w = bool; + /* tag 0 is emptyAuth */ + w = asn1_encode_boolean(w, w + sizeof(bool), true); +- if (WARN(IS_ERR(w), "BUG: Boolean failed to encode")) +- return PTR_ERR(w); ++ if (WARN(IS_ERR(w), "BUG: Boolean failed to encode")) { ++ ret = PTR_ERR(w); ++ goto err; ++ } + work = asn1_encode_tag(work, end_work, 0, bool, w - bool); + } + +@@ -69,8 +72,10 @@ static int tpm2_key_encode(struct truste + * trigger, so if it does there's something nefarious going on + */ + if (WARN(work - scratch + pub_len + priv_len + 14 > SCRATCH_SIZE, +- "BUG: scratch buffer is too small")) +- return -EINVAL; ++ "BUG: scratch buffer is too small")) { ++ ret = -EINVAL; ++ goto err; ++ } + + work = asn1_encode_integer(work, end_work, options->keyhandle); + work = asn1_encode_octet_string(work, end_work, pub, pub_len); +@@ -79,10 +84,17 @@ static int tpm2_key_encode(struct truste + work1 = payload->blob; + work1 = asn1_encode_sequence(work1, work1 + sizeof(payload->blob), + scratch, work - scratch); +- if (WARN(IS_ERR(work1), "BUG: ASN.1 encoder failed")) +- return PTR_ERR(work1); ++ if (WARN(IS_ERR(work1), "BUG: ASN.1 encoder failed")) { ++ ret = PTR_ERR(work1); ++ goto err; ++ } + ++ kfree(scratch); + return work1 - payload->blob; ++ ++err: ++ kfree(scratch); ++ return ret; + } + + struct tpm2_key_context { diff --git a/queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch b/queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch new file mode 100644 index 00000000000..7f752510dfc --- 
/dev/null +++ b/queue-6.1/mmc-core-add-hs400-tuning-in-hs400es-initialization.patch @@ -0,0 +1,40 @@ +From 77e01b49e35f24ebd1659096d5fc5c3b75975545 Mon Sep 17 00:00:00 2001 +From: Mengqi Zhang +Date: Mon, 25 Dec 2023 17:38:40 +0800 +Subject: mmc: core: Add HS400 tuning in HS400es initialization + +From: Mengqi Zhang + +commit 77e01b49e35f24ebd1659096d5fc5c3b75975545 upstream. + +During the initialization to HS400es stage, add a HS400 tuning flow as an +optional process. For Mediatek IP, the HS400es mode requires a specific +tuning to ensure the correct HS400 timing setting. + +Signed-off-by: Mengqi Zhang +Link: https://lore.kernel.org/r/20231225093839.22931-2-mengqi.zhang@mediatek.com +Signed-off-by: Ulf Hansson +Cc: "Lin Gui (桂林)" +Signed-off-by: Greg Kroah-Hartman +--- + drivers/mmc/core/mmc.c | 9 +++++++-- + 1 file changed, 7 insertions(+), 2 deletions(-) + +--- a/drivers/mmc/core/mmc.c ++++ b/drivers/mmc/core/mmc.c +@@ -1819,8 +1819,13 @@ static int mmc_init_card(struct mmc_host + + if (err) + goto free_card; +- +- } else if (!mmc_card_hs400es(card)) { ++ } else if (mmc_card_hs400es(card)) { ++ if (host->ops->execute_hs400_tuning) { ++ err = host->ops->execute_hs400_tuning(host, card); ++ if (err) ++ goto free_card; ++ } ++ } else { + /* Select the desired bus width optionally */ + err = mmc_select_bus_width(card); + if (err > 0 && mmc_card_hs(card)) { diff --git a/queue-6.1/series b/queue-6.1/series index 18d55dd23b0..1538bc3e5dd 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -5,3 +5,29 @@ ice-remove-unnecessary-duplicate-checks-for-vf-vsi-id.patch pinctrl-core-handle-radix_tree_insert-errors-in-pinctrl_register_one_pin.patch mfd-stpmic1-fix-swapped-mask-unmask-in-irq-chip.patch nfsd-don-t-allow-nfsd-threads-to-be-signalled.patch +keys-trusted-fix-memory-leak-in-tpm2_key_encode.patch +mmc-core-add-hs400-tuning-in-hs400es-initialization.patch +xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch +xfs-punching-delalloc-extents-on-write-failure-is-racy.patch +xfs-use-byte-ranges-for-write-cleanup-ranges.patch +xfs-iomap-move-delalloc-punching-to-iomap.patch +iomap-buffered-write-failure-should-not-truncate-the-page-cache.patch +xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch +iomap-write-iomap-validity-checks.patch +xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch +xfs-drop-write-error-injection-is-unfixable-remove-it.patch +xfs-fix-off-by-one-block-in-xfs_discard_folio.patch +xfs-fix-incorrect-error-out-in-xfs_remove.patch +xfs-fix-sb-write-verify-for-lazysbcount.patch +xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch +xfs-invalidate-block-device-page-cache-during-unmount.patch +xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch +xfs-wait-iclog-complete-before-tearing-down-ail.patch +xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch +xfs-hoist-refcount-record-merge-predicates.patch +xfs-estimate-post-merge-refcounts-correctly.patch +xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch +xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch +xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch +xfs-get-root-inode-correctly-at-bulkstat.patch +xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch diff --git a/queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch b/queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch new file mode 100644 index 00000000000..5375a0b18f9 --- /dev/null +++ 
b/queue-6.1/xfs-allow-inode-inactivation-during-a-ro-mount-log-recovery.patch @@ -0,0 +1,64 @@ +From stable+bounces-42910-greg=kroah.com@vger.kernel.org Wed May 1 20:42:14 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:09 -0700 +Subject: xfs: allow inode inactivation during a ro mount log recovery +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Dave Chinner , Leah Rumancik +Message-ID: <20240501184112.3799035-21-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 76e589013fec672c3587d6314f2d1f0aeddc26d9 ] + +In the next patch, we're going to prohibit log recovery if the primary +superblock contains an unrecognized rocompat feature bit even on +readonly mounts. This requires removing all the code in the log +mounting process that temporarily disables the readonly state. + +Unfortunately, inode inactivation disables itself on readonly mounts. +Clearing the iunlinked lists after log recovery needs inactivation to +run to free the unreferenced inodes, which (AFAICT) is the only reason +why log mounting plays games with the readonly state in the first place. + +Therefore, change the inactivation predicates to allow inactivation +during log recovery of a readonly mount. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -1652,8 +1652,11 @@ xfs_inode_needs_inactive( + if (VFS_I(ip)->i_mode == 0) + return false; + +- /* If this is a read-only mount, don't do this (would generate I/O) */ +- if (xfs_is_readonly(mp)) ++ /* ++ * If this is a read-only mount, don't do this (would generate I/O) ++ * unless we're in log recovery and cleaning the iunlinked list. ++ */ ++ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) + return false; + + /* If the log isn't running, push inodes straight to reclaim. */ +@@ -1713,8 +1716,11 @@ xfs_inactive( + mp = ip->i_mount; + ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); + +- /* If this is a read-only mount, don't do this (would generate I/O) */ +- if (xfs_is_readonly(mp)) ++ /* ++ * If this is a read-only mount, don't do this (would generate I/O) ++ * unless we're in log recovery and cleaning the iunlinked list. ++ */ ++ if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) + goto out; + + /* Metadata inodes require explicit resource cleanup. */ diff --git a/queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch b/queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch new file mode 100644 index 00000000000..da77037cdf2 --- /dev/null +++ b/queue-6.1/xfs-attach-dquots-to-inode-before-reading-data-cow-fork-mappings.patch @@ -0,0 +1,158 @@ +From stable+bounces-42904-greg=kroah.com@vger.kernel.org Wed May 1 20:42:01 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:03 -0700 +Subject: xfs: attach dquots to inode before reading data/cow fork mappings +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Dave Chinner , Leah Rumancik +Message-ID: <20240501184112.3799035-15-leah.rumancik@gmail.com> + +From: "Darrick J. 
Wong" + +[ Upstream commit 4c6dbfd2756bd83a0085ed804e2bb7be9cc16bc5 ] + +I've been running near-continuous integration testing of online fsck, +and I've noticed that once a day, one of the ARM VMs will fail the test +with out of order records in the data fork. + +xfs/804 races fsstress with online scrub (aka scan but do not change +anything), so I think this might be a bug in the core xfs code. This +also only seems to trigger if one runs the test for more than ~6 minutes +via TIME_FACTOR=13 or something. +https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfstests-dev.git/tree/tests/xfs/804?h=djwong-wtf + +I added a debugging patch to the kernel to check the data fork extents +after taking the ILOCK, before dropping ILOCK, and before and after each +bmapping operation. So far I've narrowed it down to the delalloc code +inserting a record in the wrong place in the iext tree: + +xfs_bmap_add_extent_hole_delay, near line 2691: + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iunlock_check_datafork(ip); <-- ok here + xfs_iext_insert(ip, icur, new, state); + xfs_iunlock_check_datafork(ip); <-- bad here + break; + } + +I recorded the state of the data fork mappings and iext cursor state +when a corrupt data fork is detected immediately after the +xfs_bmap_add_extent_hole_delay call in xfs_bmapi_reserve_delalloc: + +ino 0x140bb3 func xfs_bmapi_reserve_delalloc line 4164 data fork: + ino 0x140bb3 nr 0x0 nr_real 0x0 offset 0xb9 blockcount 0x1f startblock 0x935de2 state 1 + ino 0x140bb3 nr 0x1 nr_real 0x1 offset 0xe6 blockcount 0xa startblock 0xffffffffe0007 state 0 + ino 0x140bb3 nr 0x2 nr_real 0x1 offset 0xd8 blockcount 0xe startblock 0x935e01 state 0 + +Here we see that a delalloc extent was inserted into the wrong position +in the iext leaf, same as all the other times. The extra trace data I +collected are as follows: + +ino 0x140bb3 fork 0 oldoff 0xe6 oldlen 0x4 oldprealloc 0x6 isize 0xe6000 + ino 0x140bb3 oldgotoff 0xea oldgotstart 0xfffffffffffffffe oldgotcount 0x0 oldgotstate 0 + ino 0x140bb3 crapgotoff 0x0 crapgotstart 0x0 crapgotcount 0x0 crapgotstate 0 + ino 0x140bb3 freshgotoff 0xd8 freshgotstart 0x935e01 freshgotcount 0xe freshgotstate 0 + ino 0x140bb3 nowgotoff 0xe6 nowgotstart 0xffffffffe0007 nowgotcount 0xa nowgotstate 0 + ino 0x140bb3 oldicurpos 1 oldleafnr 2 oldleaf 0xfffffc00f0609a00 + ino 0x140bb3 crapicurpos 2 crapleafnr 2 crapleaf 0xfffffc00f0609a00 + ino 0x140bb3 freshicurpos 1 freshleafnr 2 freshleaf 0xfffffc00f0609a00 + ino 0x140bb3 newicurpos 1 newleafnr 3 newleaf 0xfffffc00f0609a00 + +The first line shows that xfs_bmapi_reserve_delalloc was called with +whichfork=XFS_DATA_FORK, off=0xe6, len=0x4, prealloc=6. + +The second line ("oldgot") shows the contents of @got at the beginning +of the call, which are the results of the first iext lookup in +xfs_buffered_write_iomap_begin. + +Line 3 ("crapgot") is the result of duplicating the cursor at the start +of the body of xfs_bmapi_reserve_delalloc and performing a fresh lookup +at @off. + +Line 4 ("freshgot") is the result of a new xfs_iext_get_extent right +before the call to xfs_bmap_add_extent_hole_delay. Totally garbage. + +Line 5 ("nowgot") is contents of @got after the +xfs_bmap_add_extent_hole_delay call. + +Line 6 is the contents of @icur at the beginning fo the call. Lines 7-9 +are the contents of the iext cursors at the point where the block +mappings were sampled. 
+ +I think @oldgot is a HOLESTARTBLOCK extent because the first lookup +didn't find anything, so we filled in imap with "fake hole until the +end". At the time of the first lookup, I suspect that there's only one +32-block unwritten extent in the mapping (hence oldicurpos==1) but by +the time we get to recording crapgot, crapicurpos==2. + +Dave then added: + +Ok, that's much simpler to reason about, and implies the smoke is +coming from xfs_buffered_write_iomap_begin() or +xfs_bmapi_reserve_delalloc(). I suspect the former - it does a lot +of stuff with the ILOCK_EXCL held..... + +.... including calling xfs_qm_dqattach_locked(). + +xfs_buffered_write_iomap_begin + ILOCK_EXCL + look up icur + xfs_qm_dqattach_locked + xfs_qm_dqattach_one + xfs_qm_dqget_inode + dquot cache miss + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + xfs_ilock(ip, XFS_ILOCK_EXCL); + .... + xfs_bmapi_reserve_delalloc(icur) + +Yup, that's what is letting the magic smoke out - +xfs_qm_dqattach_locked() can cycle the ILOCK. If that happens, we +can pass a stale icur to xfs_bmapi_reserve_delalloc() and it all +goes downhill from there. + +Back to Darrick now: + +So. Fix this by moving the dqattach_locked call up before we take the +ILOCK, like all the other callers in that file. + +Fixes: a526c85c2236 ("xfs: move xfs_file_iomap_begin_delay around") # goes further back than this +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -968,6 +968,10 @@ xfs_buffered_write_iomap_begin( + + ASSERT(!XFS_IS_REALTIME_INODE(ip)); + ++ error = xfs_qm_dqattach(ip); ++ if (error) ++ return error; ++ + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; +@@ -1071,10 +1075,6 @@ xfs_buffered_write_iomap_begin( + allocfork = XFS_COW_FORK; + } + +- error = xfs_qm_dqattach_locked(ip, false); +- if (error) +- goto out_unlock; +- + if (eof && offset + count > XFS_ISIZE(ip)) { + /* + * Determine the initial size of the preallocation. diff --git a/queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch b/queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch new file mode 100644 index 00000000000..9c6ed27a246 --- /dev/null +++ b/queue-6.1/xfs-drop-write-error-injection-is-unfixable-remove-it.patch @@ -0,0 +1,212 @@ +From stable+bounces-42897-greg=kroah.com@vger.kernel.org Wed May 1 20:41:44 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:57 -0700 +Subject: xfs: drop write error injection is unfixable, remove it +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-9-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 6e8af15ccdc4e138a5b529c1901a0013e1dcaa09 ] + +With the changes to scan the page cache for dirty data to avoid data +corruptions from partial write cleanup racing with other page cache +operations, the drop writes error injection no longer works the same +way it used to and causes xfs/196 to fail. This is because xfs/196 +writes to the file and populates the page cache before it turns on +the error injection and starts failing -overwrites-. 
+ +The result is that the original drop-writes code failed writes only +-after- overwriting the data in the cache, followed by invalidates +the cached data, then punching out the delalloc extent from under +that data. + +On the surface, this looks fine. The problem is that page cache +invalidation *doesn't guarantee that it removes anything from the +page cache* and it doesn't change the dirty state of the folio. When +block size == page size and we do page aligned IO (as xfs/196 does) +everything happens to align perfectly and page cache invalidation +removes the single page folios that span the written data. Hence the +followup delalloc punch pass does not find cached data over that +range and it can punch the extent out. + +IOWs, xfs/196 "works" for block size == page size with the new +code. I say "works", because it actually only works for the case +where IO is page aligned, and no data was read from disk before +writes occur. Because the moment we actually read data first, the +readahead code allocates multipage folios and suddenly the +invalidate code goes back to zeroing subfolio ranges without +changing dirty state. + +Hence, with multipage folios in play, block size == page size is +functionally identical to block size < page size behaviour, and +drop-writes is manifestly broken w.r.t to this case. Invalidation of +a subfolio range doesn't result in the folio being removed from the +cache, just the range gets zeroed. Hence after we've sequentially +walked over a folio that we've dirtied (via write data) and then +invalidated, we end up with a dirty folio full of zeroed data. + +And because the new code skips punching ranges that have dirty +folios covering them, we end up leaving the delalloc range intact +after failing all the writes. Hence failed writes now end up +writing zeroes to disk in the cases where invalidation zeroes folios +rather than removing them from cache. + +This is a fundamental change of behaviour that is needed to avoid +the data corruption vectors that exist in the old write fail path, +and it renders the drop-writes injection non-functional and +unworkable as it stands. + +As it is, I think the error injection is also now unnecessary, as +partial writes that need delalloc extent are going to be a lot more +common with stale iomap detection in place. Hence this patch removes +the drop-writes error injection completely. xfs/196 can remain for +testing kernels that don't have this data corruption fix, but those +that do will report: + +xfs/196 3s ... [not run] XFS error injection drop_writes unknown on this kernel. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_errortag.h | 12 +++++------- + fs/xfs/xfs_error.c | 27 ++++++++++++++++++++------- + fs/xfs/xfs_iomap.c | 9 --------- + 3 files changed, 25 insertions(+), 23 deletions(-) + +--- a/fs/xfs/libxfs/xfs_errortag.h ++++ b/fs/xfs/libxfs/xfs_errortag.h +@@ -40,13 +40,12 @@ + #define XFS_ERRTAG_REFCOUNT_FINISH_ONE 25 + #define XFS_ERRTAG_BMAP_FINISH_ONE 26 + #define XFS_ERRTAG_AG_RESV_CRITICAL 27 ++ + /* +- * DEBUG mode instrumentation to test and/or trigger delayed allocation +- * block killing in the event of failed writes. When enabled, all +- * buffered writes are silenty dropped and handled as if they failed. +- * All delalloc blocks in the range of the write (including pre-existing +- * delalloc blocks!) are tossed as part of the write failure error +- * handling sequence. 
++ * Drop-writes support removed because write error handling cannot trash ++ * pre-existing delalloc extents in any useful way anymore. We retain the ++ * definition so that we can reject it as an invalid value in ++ * xfs_errortag_valid(). + */ + #define XFS_ERRTAG_DROP_WRITES 28 + #define XFS_ERRTAG_LOG_BAD_CRC 29 +@@ -95,7 +94,6 @@ + #define XFS_RANDOM_REFCOUNT_FINISH_ONE 1 + #define XFS_RANDOM_BMAP_FINISH_ONE 1 + #define XFS_RANDOM_AG_RESV_CRITICAL 4 +-#define XFS_RANDOM_DROP_WRITES 1 + #define XFS_RANDOM_LOG_BAD_CRC 1 + #define XFS_RANDOM_LOG_ITEM_PIN 1 + #define XFS_RANDOM_BUF_LRU_REF 2 +--- a/fs/xfs/xfs_error.c ++++ b/fs/xfs/xfs_error.c +@@ -46,7 +46,7 @@ static unsigned int xfs_errortag_random_ + XFS_RANDOM_REFCOUNT_FINISH_ONE, + XFS_RANDOM_BMAP_FINISH_ONE, + XFS_RANDOM_AG_RESV_CRITICAL, +- XFS_RANDOM_DROP_WRITES, ++ 0, /* XFS_RANDOM_DROP_WRITES has been removed */ + XFS_RANDOM_LOG_BAD_CRC, + XFS_RANDOM_LOG_ITEM_PIN, + XFS_RANDOM_BUF_LRU_REF, +@@ -162,7 +162,6 @@ XFS_ERRORTAG_ATTR_RW(refcount_continue_u + XFS_ERRORTAG_ATTR_RW(refcount_finish_one, XFS_ERRTAG_REFCOUNT_FINISH_ONE); + XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); + XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); +-XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); + XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); + XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); + XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); +@@ -206,7 +205,6 @@ static struct attribute *xfs_errortag_at + XFS_ERRORTAG_ATTR_LIST(refcount_finish_one), + XFS_ERRORTAG_ATTR_LIST(bmap_finish_one), + XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), +- XFS_ERRORTAG_ATTR_LIST(drop_writes), + XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + XFS_ERRORTAG_ATTR_LIST(log_item_pin), + XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), +@@ -256,6 +254,19 @@ xfs_errortag_del( + kmem_free(mp->m_errortag); + } + ++static bool ++xfs_errortag_valid( ++ unsigned int error_tag) ++{ ++ if (error_tag >= XFS_ERRTAG_MAX) ++ return false; ++ ++ /* Error out removed injection types */ ++ if (error_tag == XFS_ERRTAG_DROP_WRITES) ++ return false; ++ return true; ++} ++ + bool + xfs_errortag_test( + struct xfs_mount *mp, +@@ -277,7 +288,9 @@ xfs_errortag_test( + if (!mp->m_errortag) + return false; + +- ASSERT(error_tag < XFS_ERRTAG_MAX); ++ if (!xfs_errortag_valid(error_tag)) ++ return false; ++ + randfactor = mp->m_errortag[error_tag]; + if (!randfactor || prandom_u32_max(randfactor)) + return false; +@@ -293,7 +306,7 @@ xfs_errortag_get( + struct xfs_mount *mp, + unsigned int error_tag) + { +- if (error_tag >= XFS_ERRTAG_MAX) ++ if (!xfs_errortag_valid(error_tag)) + return -EINVAL; + + return mp->m_errortag[error_tag]; +@@ -305,7 +318,7 @@ xfs_errortag_set( + unsigned int error_tag, + unsigned int tag_value) + { +- if (error_tag >= XFS_ERRTAG_MAX) ++ if (!xfs_errortag_valid(error_tag)) + return -EINVAL; + + mp->m_errortag[error_tag] = tag_value; +@@ -319,7 +332,7 @@ xfs_errortag_add( + { + BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX); + +- if (error_tag >= XFS_ERRTAG_MAX) ++ if (!xfs_errortag_valid(error_tag)) + return -EINVAL; + + return xfs_errortag_set(mp, error_tag, +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1190,15 +1190,6 @@ xfs_buffered_write_iomap_end( + struct xfs_mount *mp = XFS_M(inode->i_sb); + int error; + +- /* +- * Behave as if the write failed if drop writes is enabled. Set the NEW +- * flag to force delalloc cleanup. 
+- */ +- if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_DROP_WRITES)) { +- iomap->flags |= IOMAP_F_NEW; +- written = 0; +- } +- + error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, + length, written, &xfs_buffered_write_delalloc_punch); + if (error && !xfs_is_shutdown(mp)) { diff --git a/queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch b/queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch new file mode 100644 index 00000000000..8562e2f1284 --- /dev/null +++ b/queue-6.1/xfs-estimate-post-merge-refcounts-correctly.patch @@ -0,0 +1,114 @@ +From stable+bounces-42908-greg=kroah.com@vger.kernel.org Wed May 1 20:42:09 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:07 -0700 +Subject: xfs: estimate post-merge refcounts correctly +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Dave Chinner , Xiao Yang , Leah Rumancik +Message-ID: <20240501184112.3799035-19-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit b25d1984aa884fc91a73a5a407b9ac976d441e9b ] + +Upon enabling fsdax + reflink for XFS, xfs/179 began to report refcount +metadata corruptions after being run. Specifically, xfs_repair noticed +single-block refcount records that could be combined but had not been. + +The root cause of this is improper MAXREFCOUNT edge case handling in +xfs_refcount_merge_extents. When we're trying to find candidates for a +refcount btree record merge, we compute the refcount attribute of the +merged record, but we fail to account for the fact that once a record +hits rc_refcount == MAXREFCOUNT, it is pinned that way forever. Hence +the computed refcount is wrong, and we fail to merge the extents. + +Fix this by adjusting the merge predicates to compute the adjusted +refcount correctly. + +Fixes: 3172725814f9 ("xfs: adjust refcount of an extent of blocks in refcount btree") +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Xiao Yang +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_refcount.c | 25 +++++++++++++++++++++---- + 1 file changed, 21 insertions(+), 4 deletions(-) + +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -820,6 +820,17 @@ xfs_refc_valid( + return rc->rc_startblock != NULLAGBLOCK; + } + ++static inline xfs_nlink_t ++xfs_refc_merge_refcount( ++ const struct xfs_refcount_irec *irec, ++ enum xfs_refc_adjust_op adjust) ++{ ++ /* Once a record hits MAXREFCOUNT, it is pinned there forever */ ++ if (irec->rc_refcount == MAXREFCOUNT) ++ return MAXREFCOUNT; ++ return irec->rc_refcount + adjust; ++} ++ + static inline bool + xfs_refc_want_merge_center( + const struct xfs_refcount_irec *left, +@@ -831,6 +842,7 @@ xfs_refc_want_merge_center( + unsigned long long *ulenp) + { + unsigned long long ulen = left->rc_blockcount; ++ xfs_nlink_t new_refcount; + + /* + * To merge with a center record, both shoulder records must be +@@ -846,9 +858,10 @@ xfs_refc_want_merge_center( + return false; + + /* The shoulder record refcounts must match the new refcount. 
*/ +- if (left->rc_refcount != cleft->rc_refcount + adjust) ++ new_refcount = xfs_refc_merge_refcount(cleft, adjust); ++ if (left->rc_refcount != new_refcount) + return false; +- if (right->rc_refcount != cleft->rc_refcount + adjust) ++ if (right->rc_refcount != new_refcount) + return false; + + /* +@@ -871,6 +884,7 @@ xfs_refc_want_merge_left( + enum xfs_refc_adjust_op adjust) + { + unsigned long long ulen = left->rc_blockcount; ++ xfs_nlink_t new_refcount; + + /* + * For a left merge, the left shoulder record must be adjacent to the +@@ -881,7 +895,8 @@ xfs_refc_want_merge_left( + return false; + + /* Left shoulder record refcount must match the new refcount. */ +- if (left->rc_refcount != cleft->rc_refcount + adjust) ++ new_refcount = xfs_refc_merge_refcount(cleft, adjust); ++ if (left->rc_refcount != new_refcount) + return false; + + /* +@@ -903,6 +918,7 @@ xfs_refc_want_merge_right( + enum xfs_refc_adjust_op adjust) + { + unsigned long long ulen = right->rc_blockcount; ++ xfs_nlink_t new_refcount; + + /* + * For a right merge, the right shoulder record must be adjacent to the +@@ -913,7 +929,8 @@ xfs_refc_want_merge_right( + return false; + + /* Right shoulder record refcount must match the new refcount. */ +- if (right->rc_refcount != cright->rc_refcount + adjust) ++ new_refcount = xfs_refc_merge_refcount(cright, adjust); ++ if (right->rc_refcount != new_refcount) + return false; + + /* diff --git a/queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch b/queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch new file mode 100644 index 00000000000..ab484f545e6 --- /dev/null +++ b/queue-6.1/xfs-fix-incorrect-error-out-in-xfs_remove.patch @@ -0,0 +1,36 @@ +From stable+bounces-42900-greg=kroah.com@vger.kernel.org Wed May 1 20:41:50 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:59 -0700 +Subject: xfs: fix incorrect error-out in xfs_remove +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Andrey Albershteyn , Leah Rumancik +Message-ID: <20240501184112.3799035-11-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 2653d53345bda90604f673bb211dd060a5a5c232 ] + +Clean up resources if resetting the dotdot entry doesn't succeed. +Observed through code inspection. + +Fixes: 5838d0356bb3 ("xfs: reset child dir '..' entry when unlinking child") +Signed-off-by: Darrick J. Wong +Reviewed-by: Andrey Albershteyn +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_inode.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/xfs_inode.c ++++ b/fs/xfs/xfs_inode.c +@@ -2479,7 +2479,7 @@ xfs_remove( + error = xfs_dir_replace(tp, ip, &xfs_name_dotdot, + tp->t_mountp->m_sb.sb_rootino, 0); + if (error) +- return error; ++ goto out_trans_cancel; + } + } else { + /* diff --git a/queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch b/queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch new file mode 100644 index 00000000000..db2681d519c --- /dev/null +++ b/queue-6.1/xfs-fix-incorrect-i_nlink-caused-by-inode-racing.patch @@ -0,0 +1,81 @@ +From stable+bounces-42902-greg=kroah.com@vger.kernel.org Wed May 1 20:41:56 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:01 -0700 +Subject: xfs: fix incorrect i_nlink caused by inode racing +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Long Li , "Darrick J . 
Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-13-leah.rumancik@gmail.com> + +From: Long Li + +[ Upstream commit 28b4b0596343d19d140da059eee0e5c2b5328731 ] + +The following error occurred during the fsstress test: + +XFS: Assertion failed: VFS_I(ip)->i_nlink >= 2, file: fs/xfs/xfs_inode.c, line: 2452 + +The problem was that inode race condition causes incorrect i_nlink to be +written to disk, and then it is read into memory. Consider the following +call graph, inodes that are marked as both XFS_IFLUSHING and +XFS_IRECLAIMABLE, i_nlink will be reset to 1 and then restored to original +value in xfs_reinit_inode(). Therefore, the i_nlink of directory on disk +may be set to 1. + + xfsaild + xfs_inode_item_push + xfs_iflush_cluster + xfs_iflush + xfs_inode_to_disk + + xfs_iget + xfs_iget_cache_hit + xfs_iget_recycle + xfs_reinit_inode + inode_init_always + +xfs_reinit_inode() needs to hold the ILOCK_EXCL as it is changing internal +inode state and can race with other RCU protected inode lookups. On the +read side, xfs_iflush_cluster() grabs the ILOCK_SHARED while under rcu + +ip->i_flags_lock, and so xfs_iflush/xfs_inode_to_disk() are protected from +racing inode updates (during transactions) by that lock. + +Fixes: ff7bebeb91f8 ("xfs: refactor the inode recycling code") # goes further back than this +Signed-off-by: Long Li +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_icache.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/fs/xfs/xfs_icache.c ++++ b/fs/xfs/xfs_icache.c +@@ -342,6 +342,9 @@ xfs_iget_recycle( + + trace_xfs_iget_recycle(ip); + ++ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) ++ return -EAGAIN; ++ + /* + * We need to make it look like the inode is being reclaimed to prevent + * the actual reclaim workers from stomping over us while we recycle +@@ -355,6 +358,7 @@ xfs_iget_recycle( + + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + error = xfs_reinit_inode(mp, inode); ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep +@@ -523,6 +527,8 @@ xfs_iget_cache_hit( + if (ip->i_flags & XFS_IRECLAIMABLE) { + /* Drops i_flags_lock and RCU read lock. */ + error = xfs_iget_recycle(pag, ip); ++ if (error == -EAGAIN) ++ goto out_skip; + if (error) + return error; + } else { diff --git a/queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch b/queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch new file mode 100644 index 00000000000..077e24f8087 --- /dev/null +++ b/queue-6.1/xfs-fix-log-recovery-when-unknown-rocompat-bits-are-set.patch @@ -0,0 +1,95 @@ +From stable+bounces-42911-greg=kroah.com@vger.kernel.org Wed May 1 20:42:18 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:10 -0700 +Subject: xfs: fix log recovery when unknown rocompat bits are set +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Dave Chinner , Leah Rumancik +Message-ID: <20240501184112.3799035-22-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 74ad4693b6473950e971b3dc525b5ee7570e05d0 ] + +Log recovery has always run on read only mounts, even where the primary +superblock advertises unknown rocompat bits. Due to a misunderstanding +between Eric and Darrick back in 2018, we accidentally changed the +superblock write verifier to shutdown the fs over that exact scenario. 
+As a result, the log cleaning that occurs at the end of the mounting +process fails if there are unknown rocompat bits set. + +As we now allow writing of the superblock if there are unknown rocompat +bits set on a RO mount, we no longer want to turn off RO state to allow +log recovery to succeed on a RO mount. Hence we also remove all the +(now unnecessary) RO state toggling from the log recovery path. + +Fixes: 9e037cb7972f ("xfs: check for unknown v5 feature bits in superblock write verifier" +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_sb.c | 3 ++- + fs/xfs/xfs_log.c | 17 ----------------- + 2 files changed, 2 insertions(+), 18 deletions(-) + +--- a/fs/xfs/libxfs/xfs_sb.c ++++ b/fs/xfs/libxfs/xfs_sb.c +@@ -266,7 +266,8 @@ xfs_validate_sb_write( + return -EFSCORRUPTED; + } + +- if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { ++ if (!xfs_is_readonly(mp) && ++ xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + xfs_alert(mp, + "Corruption detected in superblock read-only compatible features (0x%x)!", + (sbp->sb_features_ro_compat & +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -730,15 +730,7 @@ xfs_log_mount( + * just worked. + */ + if (!xfs_has_norecovery(mp)) { +- /* +- * log recovery ignores readonly state and so we need to clear +- * mount-based read only state so it can write to disk. +- */ +- bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, +- &mp->m_opstate); + error = xlog_recover(log); +- if (readonly) +- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + if (error) { + xfs_warn(mp, "log mount/recovery failed: error %d", + error); +@@ -787,7 +779,6 @@ xfs_log_mount_finish( + struct xfs_mount *mp) + { + struct xlog *log = mp->m_log; +- bool readonly; + int error = 0; + + if (xfs_has_norecovery(mp)) { +@@ -796,12 +787,6 @@ xfs_log_mount_finish( + } + + /* +- * log recovery ignores readonly state and so we need to clear +- * mount-based read only state so it can write to disk. +- */ +- readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); +- +- /* + * During the second phase of log recovery, we need iget and + * iput to behave like they do for an active filesystem. + * xfs_fs_drop_inode needs to be able to prevent the deletion +@@ -850,8 +835,6 @@ xfs_log_mount_finish( + xfs_buftarg_drain(mp->m_ddev_targp); + + clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); +- if (readonly) +- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); + + /* Make sure the log is dead if we're returning failure. */ + ASSERT(!error || xlog_is_shutdown(log)); diff --git a/queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch b/queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch new file mode 100644 index 00000000000..f015f03db06 --- /dev/null +++ b/queue-6.1/xfs-fix-off-by-one-block-in-xfs_discard_folio.patch @@ -0,0 +1,83 @@ +From stable+bounces-42898-greg=kroah.com@vger.kernel.org Wed May 1 20:41:46 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:58 -0700 +Subject: xfs: fix off-by-one-block in xfs_discard_folio() +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , Pengfei Xu , Brian Foster , "Darrick J . 
Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-10-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 8ac5b996bf5199f15b7687ceae989f8b2a410dda ] + +The recent writeback corruption fixes changed the code in +xfs_discard_folio() to calculate a byte range to for punching +delalloc extents. A mistake was made in using round_up(pos) for the +end offset, because when pos points at the first byte of a block, it +does not get rounded up to point to the end byte of the block. hence +the punch range is short, and this leads to unexpected behaviour in +certain cases in xfs_bmap_punch_delalloc_range. + +e.g. pos = 0 means we call xfs_bmap_punch_delalloc_range(0,0), so +there is no previous extent and it rounds up the punch to the end of +the delalloc extent it found at offset 0, not the end of the range +given to xfs_bmap_punch_delalloc_range(). + +Fix this by handling the zero block offset case correctly. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=217030 +Link: https://lore.kernel.org/linux-xfs/Y+vOfaxIWX1c%2Fyy9@bfoster/ +Fixes: 7348b322332d ("xfs: xfs_bmap_punch_delalloc_range() should take a byte range") +Reported-by: Pengfei Xu +Found-by: Brian Foster +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_aops.c | 21 ++++++++++++++------- + 1 file changed, 14 insertions(+), 7 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -439,15 +439,17 @@ xfs_prepare_ioend( + } + + /* +- * If the page has delalloc blocks on it, we need to punch them out before we +- * invalidate the page. If we don't, we leave a stale delalloc mapping on the +- * inode that can trip up a later direct I/O read operation on the same region. ++ * If the folio has delalloc blocks on it, the caller is asking us to punch them ++ * out. If we don't, we can leave a stale delalloc mapping covered by a clean ++ * page that needs to be dirtied again before the delalloc mapping can be ++ * converted. This stale delalloc mapping can trip up a later direct I/O read ++ * operation on the same region. + * +- * We prevent this by truncating away the delalloc regions on the page. Because ++ * We prevent this by truncating away the delalloc regions on the folio. Because + * they are delalloc, we can do this without needing a transaction. Indeed - if + * we get ENOSPC errors, we have to be able to do this truncation without a +- * transaction as there is no space left for block reservation (typically why we +- * see a ENOSPC in writeback). ++ * transaction as there is no space left for block reservation (typically why ++ * we see a ENOSPC in writeback). + */ + static void + xfs_discard_folio( +@@ -465,8 +467,13 @@ xfs_discard_folio( + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); + ++ /* ++ * The end of the punch range is always the offset of the the first ++ * byte of the next folio. Hence the end offset is only dependent on the ++ * folio itself and not the start offset that is passed in. 
++ */ + error = xfs_bmap_punch_delalloc_range(ip, pos, +- round_up(pos, folio_size(folio))); ++ folio_pos(folio) + folio_size(folio)); + + if (error && !xfs_is_shutdown(mp)) + xfs_alert(mp, "page discard unable to remove delalloc mapping."); diff --git a/queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch b/queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch new file mode 100644 index 00000000000..d9d55903f5a --- /dev/null +++ b/queue-6.1/xfs-fix-sb-write-verify-for-lazysbcount.patch @@ -0,0 +1,152 @@ +From stable+bounces-42901-greg=kroah.com@vger.kernel.org Wed May 1 20:41:52 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:00 -0700 +Subject: xfs: fix sb write verify for lazysbcount +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Long Li , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-12-leah.rumancik@gmail.com> + +From: Long Li + +[ Upstream commit 59f6ab40fd8735c9a1a15401610a31cc06a0bbd6 ] + +When lazysbcount is enabled, fsstress and loop mount/unmount test report +the following problems: + +XFS (loop0): SB summary counter sanity check failed +XFS (loop0): Metadata corruption detected at xfs_sb_write_verify+0x13b/0x460, + xfs_sb block 0x0 +XFS (loop0): Unmount and run xfs_repair +XFS (loop0): First 128 bytes of corrupted metadata buffer: +00000000: 58 46 53 42 00 00 10 00 00 00 00 00 00 28 00 00 XFSB.........(.. +00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ +00000020: 69 fb 7c cd 5f dc 44 af 85 74 e0 cc d4 e3 34 5a i.|._.D..t....4Z +00000030: 00 00 00 00 00 20 00 06 00 00 00 00 00 00 00 80 ..... .......... +00000040: 00 00 00 00 00 00 00 81 00 00 00 00 00 00 00 82 ................ +00000050: 00 00 00 01 00 0a 00 00 00 00 00 04 00 00 00 00 ................ +00000060: 00 00 0a 00 b4 b5 02 00 02 00 00 08 00 00 00 00 ................ +00000070: 00 00 00 00 00 00 00 00 0c 09 09 03 14 00 00 19 ................ +XFS (loop0): Corruption of in-memory data (0x8) detected at _xfs_buf_ioapply + +0xe1e/0x10e0 (fs/xfs/xfs_buf.c:1580). Shutting down filesystem. +XFS (loop0): Please unmount the filesystem and rectify the problem(s) +XFS (loop0): log mount/recovery failed: error -117 +XFS (loop0): log mount failed + +This corruption will shutdown the file system and the file system will +no longer be mountable. The following script can reproduce the problem, +but it may take a long time. + + #!/bin/bash + + device=/dev/sda + testdir=/mnt/test + round=0 + + function fail() + { + echo "$*" + exit 1 + } + + mkdir -p $testdir + while [ $round -lt 10000 ] + do + echo "******* round $round ********" + mkfs.xfs -f $device + mount $device $testdir || fail "mount failed!" + fsstress -d $testdir -l 0 -n 10000 -p 4 >/dev/null & + sleep 4 + killall -w fsstress + umount $testdir + xfs_repair -e $device > /dev/null + if [ $? -eq 2 ];then + echo "ERR CODE 2: Dirty log exception during repair." + exit 1 + fi + round=$(($round+1)) + done + +With lazysbcount is enabled, There is no additional lock protection for +reading m_ifree and m_icount in xfs_log_sb(), if other cpu modifies the +m_ifree, this will make the m_ifree greater than m_icount. 
For example, +consider the following sequence and ifreedelta is postive: + + CPU0 CPU1 + xfs_log_sb xfs_trans_unreserve_and_mod_sb + ---------- ------------------------------ + percpu_counter_sum(&mp->m_icount) + percpu_counter_add_batch(&mp->m_icount, + idelta, XFS_ICOUNT_BATCH) + percpu_counter_add(&mp->m_ifree, ifreedelta); + percpu_counter_sum(&mp->m_ifree) + +After this, incorrect inode count (sb_ifree > sb_icount) will be writen to +the log. In the subsequent writing of sb, incorrect inode count (sb_ifree > +sb_icount) will fail to pass the boundary check in xfs_validate_sb_write() +that cause the file system shutdown. + +When lazysbcount is enabled, we don't need to guarantee that Lazy sb +counters are completely correct, but we do need to guarantee that sb_ifree +<= sb_icount. On the other hand, the constraint that m_ifree <= m_icount +must be satisfied any time that there /cannot/ be other threads allocating +or freeing inode chunks. If the constraint is violated under these +circumstances, sb_i{count,free} (the ondisk superblock inode counters) +maybe incorrect and need to be marked sick at unmount, the count will +be rebuilt on the next mount. + +Fixes: 8756a5af1819 ("libxfs: add more bounds checking to sb sanity checks") +Signed-off-by: Long Li +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_sb.c | 4 +++- + fs/xfs/xfs_mount.c | 15 +++++++++++++++ + 2 files changed, 18 insertions(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_sb.c ++++ b/fs/xfs/libxfs/xfs_sb.c +@@ -973,7 +973,9 @@ xfs_log_sb( + */ + if (xfs_has_lazysbcount(mp)) { + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); +- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); ++ mp->m_sb.sb_ifree = min_t(uint64_t, ++ percpu_counter_sum(&mp->m_ifree), ++ mp->m_sb.sb_icount); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + } + +--- a/fs/xfs/xfs_mount.c ++++ b/fs/xfs/xfs_mount.c +@@ -538,6 +538,20 @@ xfs_check_summary_counts( + return 0; + } + ++static void ++xfs_unmount_check( ++ struct xfs_mount *mp) ++{ ++ if (xfs_is_shutdown(mp)) ++ return; ++ ++ if (percpu_counter_sum(&mp->m_ifree) > ++ percpu_counter_sum(&mp->m_icount)) { ++ xfs_alert(mp, "ifree/icount mismatch at unmount"); ++ xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS); ++ } ++} ++ + /* + * Flush and reclaim dirty inodes in preparation for unmount. Inodes and + * internal inode structures can be sitting in the CIL and AIL at this point, +@@ -1077,6 +1091,7 @@ xfs_unmountfs( + if (error) + xfs_warn(mp, "Unable to free reserved block pool. " + "Freespace may not be correct on next mount."); ++ xfs_unmount_check(mp); + + xfs_log_unmount(mp); + xfs_da_unmount(mp); diff --git a/queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch b/queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch new file mode 100644 index 00000000000..922b8ba5bde --- /dev/null +++ b/queue-6.1/xfs-fix-super-block-buf-log-item-uaf-during-force-shutdown.patch @@ -0,0 +1,147 @@ +From stable+bounces-42906-greg=kroah.com@vger.kernel.org Wed May 1 20:42:06 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:05 -0700 +Subject: xfs: fix super block buf log item UAF during force shutdown +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Guo Xuenan , "Darrick J . 
Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-17-leah.rumancik@gmail.com> + +From: Guo Xuenan + +[ Upstream commit 575689fc0ffa6c4bb4e72fd18e31a6525a6124e0 ] + +xfs log io error will trigger xlog shut down, and end_io worker call +xlog_state_shutdown_callbacks to unpin and release the buf log item. +The race condition is that when there are some thread doing transaction +commit and happened not to be intercepted by xlog_is_shutdown, then, +these log item will be insert into CIL, when unpin and release these +buf log item, UAF will occur. BTW, add delay before `xlog_cil_commit` +can increase recurrence probability. + +The following call graph actually encountered this bad situation. +fsstress io end worker kworker/0:1H-216 + xlog_ioend_work + ->xlog_force_shutdown + ->xlog_state_shutdown_callbacks + ->xlog_cil_process_committed + ->xlog_cil_committed + ->xfs_trans_committed_bulk +->xfs_trans_apply_sb_deltas ->li_ops->iop_unpin(lip, 1); + ->xfs_trans_getsb + ->_xfs_trans_bjoin + ->xfs_buf_item_init + ->if (bip) { return 0;} //relog +->xlog_cil_commit + ->xlog_cil_insert_items //insert into CIL + ->xfs_buf_ioend_fail(bp); + ->xfs_buf_ioend + ->xfs_buf_item_done + ->xfs_buf_item_relse + ->xfs_buf_item_free + +when cil push worker gather percpu cil and insert super block buf log item +into ctx->log_items then uaf occurs. + +================================================================== +BUG: KASAN: use-after-free in xlog_cil_push_work+0x1c8f/0x22f0 +Write of size 8 at addr ffff88801800f3f0 by task kworker/u4:4/105 + +CPU: 0 PID: 105 Comm: kworker/u4:4 Tainted: G W +6.1.0-rc1-00001-g274115149b42 #136 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +1.13.0-1ubuntu1.1 04/01/2014 +Workqueue: xfs-cil/sda xlog_cil_push_work +Call Trace: + + dump_stack_lvl+0x4d/0x66 + print_report+0x171/0x4a6 + kasan_report+0xb3/0x130 + xlog_cil_push_work+0x1c8f/0x22f0 + process_one_work+0x6f9/0xf70 + worker_thread+0x578/0xf30 + kthread+0x28c/0x330 + ret_from_fork+0x1f/0x30 + + +Allocated by task 2145: + kasan_save_stack+0x1e/0x40 + kasan_set_track+0x21/0x30 + __kasan_slab_alloc+0x54/0x60 + kmem_cache_alloc+0x14a/0x510 + xfs_buf_item_init+0x160/0x6d0 + _xfs_trans_bjoin+0x7f/0x2e0 + xfs_trans_getsb+0xb6/0x3f0 + xfs_trans_apply_sb_deltas+0x1f/0x8c0 + __xfs_trans_commit+0xa25/0xe10 + xfs_symlink+0xe23/0x1660 + xfs_vn_symlink+0x157/0x280 + vfs_symlink+0x491/0x790 + do_symlinkat+0x128/0x220 + __x64_sys_symlink+0x7a/0x90 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +Freed by task 216: + kasan_save_stack+0x1e/0x40 + kasan_set_track+0x21/0x30 + kasan_save_free_info+0x2a/0x40 + __kasan_slab_free+0x105/0x1a0 + kmem_cache_free+0xb6/0x460 + xfs_buf_ioend+0x1e9/0x11f0 + xfs_buf_item_unpin+0x3d6/0x840 + xfs_trans_committed_bulk+0x4c2/0x7c0 + xlog_cil_committed+0xab6/0xfb0 + xlog_cil_process_committed+0x117/0x1e0 + xlog_state_shutdown_callbacks+0x208/0x440 + xlog_force_shutdown+0x1b3/0x3a0 + xlog_ioend_work+0xef/0x1d0 + process_one_work+0x6f9/0xf70 + worker_thread+0x578/0xf30 + kthread+0x28c/0x330 + ret_from_fork+0x1f/0x30 + +The buggy address belongs to the object at ffff88801800f388 + which belongs to the cache xfs_buf_item of size 272 +The buggy address is located 104 bytes inside of + 272-byte region [ffff88801800f388, ffff88801800f498) + +The buggy address belongs to the physical page: +page:ffffea0000600380 refcount:1 mapcount:0 mapping:0000000000000000 +index:0xffff88801800f208 pfn:0x1800e +head:ffffea0000600380 order:1 compound_mapcount:0 compound_pincount:0 +flags: 
0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) +raw: 001fffff80010200 ffffea0000699788 ffff88801319db50 ffff88800fb50640 +raw: ffff88801800f208 000000000015000a 00000001ffffffff 0000000000000000 +page dumped because: kasan: bad access detected + +Memory state around the buggy address: + ffff88801800f280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff88801800f300: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc +>ffff88801800f380: fc fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff88801800f400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ffff88801800f480: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc +================================================================== +Disabling lock debugging due to kernel taint + +Signed-off-by: Guo Xuenan +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf_item.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/xfs/xfs_buf_item.c ++++ b/fs/xfs/xfs_buf_item.c +@@ -1018,6 +1018,8 @@ xfs_buf_item_relse( + trace_xfs_buf_item_relse(bp, _RET_IP_); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); + ++ if (atomic_read(&bip->bli_refcount)) ++ return; + bp->b_log_item = NULL; + xfs_buf_rele(bp); + xfs_buf_item_free(bip); diff --git a/queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch b/queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch new file mode 100644 index 00000000000..0b40f761ffe --- /dev/null +++ b/queue-6.1/xfs-get-root-inode-correctly-at-bulkstat.patch @@ -0,0 +1,49 @@ +From stable+bounces-42912-greg=kroah.com@vger.kernel.org Wed May 1 20:42:18 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:11 -0700 +Subject: xfs: get root inode correctly at bulkstat +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Hironori Shiina , Hironori Shiina , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-23-leah.rumancik@gmail.com> + +From: Hironori Shiina + +[ Upstream commit 817644fa4525258992f17fecf4f1d6cdd2e1b731 ] + +The root inode number should be set to `breq->startino` for getting stat +information of the root when XFS_BULK_IREQ_SPECIAL_ROOT is used. +Otherwise, the inode search is started from 1 +(XFS_BULK_IREQ_SPECIAL_ROOT) and the inode with the lowest number in a +filesystem is returned. + +Fixes: bf3cb3944792 ("xfs: allow single bulkstat of special inodes") +Signed-off-by: Hironori Shiina +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_ioctl.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/xfs/xfs_ioctl.c ++++ b/fs/xfs/xfs_ioctl.c +@@ -754,7 +754,7 @@ xfs_bulkstat_fmt( + static int + xfs_bulk_ireq_setup( + struct xfs_mount *mp, +- struct xfs_bulk_ireq *hdr, ++ const struct xfs_bulk_ireq *hdr, + struct xfs_ibulk *breq, + void __user *ubuffer) + { +@@ -780,7 +780,7 @@ xfs_bulk_ireq_setup( + + switch (hdr->ino) { + case XFS_BULK_IREQ_SPECIAL_ROOT: +- hdr->ino = mp->m_sb.sb_rootino; ++ breq->startino = mp->m_sb.sb_rootino; + break; + default: + return -EINVAL; diff --git a/queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch b/queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch new file mode 100644 index 00000000000..74372197b3f --- /dev/null +++ b/queue-6.1/xfs-hoist-refcount-record-merge-predicates.patch @@ -0,0 +1,188 @@ +From stable+bounces-42907-greg=kroah.com@vger.kernel.org Wed May 1 20:42:07 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:06 -0700 +Subject: xfs: hoist refcount record merge predicates +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Dave Chinner , Xiao Yang , Leah Rumancik +Message-ID: <20240501184112.3799035-18-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 9d720a5a658f5135861773f26e927449bef93d61 ] + +Hoist these multiline conditionals into separate static inline helpers +to improve readability and set the stage for corruption fixes that will +be introduced in the next patch. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Reviewed-by: Xiao Yang +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_refcount.c | 129 +++++++++++++++++++++++++++++++++++++------ + 1 file changed, 113 insertions(+), 16 deletions(-) + +--- a/fs/xfs/libxfs/xfs_refcount.c ++++ b/fs/xfs/libxfs/xfs_refcount.c +@@ -815,11 +815,119 @@ out_error: + /* Is this extent valid? */ + static inline bool + xfs_refc_valid( +- struct xfs_refcount_irec *rc) ++ const struct xfs_refcount_irec *rc) + { + return rc->rc_startblock != NULLAGBLOCK; + } + ++static inline bool ++xfs_refc_want_merge_center( ++ const struct xfs_refcount_irec *left, ++ const struct xfs_refcount_irec *cleft, ++ const struct xfs_refcount_irec *cright, ++ const struct xfs_refcount_irec *right, ++ bool cleft_is_cright, ++ enum xfs_refc_adjust_op adjust, ++ unsigned long long *ulenp) ++{ ++ unsigned long long ulen = left->rc_blockcount; ++ ++ /* ++ * To merge with a center record, both shoulder records must be ++ * adjacent to the record we want to adjust. This is only true if ++ * find_left and find_right made all four records valid. ++ */ ++ if (!xfs_refc_valid(left) || !xfs_refc_valid(right) || ++ !xfs_refc_valid(cleft) || !xfs_refc_valid(cright)) ++ return false; ++ ++ /* There must only be one record for the entire range. */ ++ if (!cleft_is_cright) ++ return false; ++ ++ /* The shoulder record refcounts must match the new refcount. */ ++ if (left->rc_refcount != cleft->rc_refcount + adjust) ++ return false; ++ if (right->rc_refcount != cleft->rc_refcount + adjust) ++ return false; ++ ++ /* ++ * The new record cannot exceed the max length. ulen is a ULL as the ++ * individual record block counts can be up to (u32 - 1) in length ++ * hence we need to catch u32 addition overflows here. 
++ */ ++ ulen += cleft->rc_blockcount + right->rc_blockcount; ++ if (ulen >= MAXREFCEXTLEN) ++ return false; ++ ++ *ulenp = ulen; ++ return true; ++} ++ ++static inline bool ++xfs_refc_want_merge_left( ++ const struct xfs_refcount_irec *left, ++ const struct xfs_refcount_irec *cleft, ++ enum xfs_refc_adjust_op adjust) ++{ ++ unsigned long long ulen = left->rc_blockcount; ++ ++ /* ++ * For a left merge, the left shoulder record must be adjacent to the ++ * start of the range. If this is true, find_left made left and cleft ++ * contain valid contents. ++ */ ++ if (!xfs_refc_valid(left) || !xfs_refc_valid(cleft)) ++ return false; ++ ++ /* Left shoulder record refcount must match the new refcount. */ ++ if (left->rc_refcount != cleft->rc_refcount + adjust) ++ return false; ++ ++ /* ++ * The new record cannot exceed the max length. ulen is a ULL as the ++ * individual record block counts can be up to (u32 - 1) in length ++ * hence we need to catch u32 addition overflows here. ++ */ ++ ulen += cleft->rc_blockcount; ++ if (ulen >= MAXREFCEXTLEN) ++ return false; ++ ++ return true; ++} ++ ++static inline bool ++xfs_refc_want_merge_right( ++ const struct xfs_refcount_irec *cright, ++ const struct xfs_refcount_irec *right, ++ enum xfs_refc_adjust_op adjust) ++{ ++ unsigned long long ulen = right->rc_blockcount; ++ ++ /* ++ * For a right merge, the right shoulder record must be adjacent to the ++ * end of the range. If this is true, find_right made cright and right ++ * contain valid contents. ++ */ ++ if (!xfs_refc_valid(right) || !xfs_refc_valid(cright)) ++ return false; ++ ++ /* Right shoulder record refcount must match the new refcount. */ ++ if (right->rc_refcount != cright->rc_refcount + adjust) ++ return false; ++ ++ /* ++ * The new record cannot exceed the max length. ulen is a ULL as the ++ * individual record block counts can be up to (u32 - 1) in length ++ * hence we need to catch u32 addition overflows here. ++ */ ++ ulen += cright->rc_blockcount; ++ if (ulen >= MAXREFCEXTLEN) ++ return false; ++ ++ return true; ++} ++ + /* + * Try to merge with any extents on the boundaries of the adjustment range. + */ +@@ -861,23 +969,15 @@ xfs_refcount_merge_extents( + (cleft.rc_blockcount == cright.rc_blockcount); + + /* Try to merge left, cleft, and right. cleft must == cright. */ +- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount + +- right.rc_blockcount; +- if (xfs_refc_valid(&left) && xfs_refc_valid(&right) && +- xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal && +- left.rc_refcount == cleft.rc_refcount + adjust && +- right.rc_refcount == cleft.rc_refcount + adjust && +- ulen < MAXREFCEXTLEN) { ++ if (xfs_refc_want_merge_center(&left, &cleft, &cright, &right, cequal, ++ adjust, &ulen)) { + *shape_changed = true; + return xfs_refcount_merge_center_extents(cur, &left, &cleft, + &right, ulen, aglen); + } + + /* Try to merge left and cleft. */ +- ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount; +- if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) && +- left.rc_refcount == cleft.rc_refcount + adjust && +- ulen < MAXREFCEXTLEN) { ++ if (xfs_refc_want_merge_left(&left, &cleft, adjust)) { + *shape_changed = true; + error = xfs_refcount_merge_left_extent(cur, &left, &cleft, + agbno, aglen); +@@ -893,10 +993,7 @@ xfs_refcount_merge_extents( + } + + /* Try to merge cright and right. 
*/ +- ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount; +- if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) && +- right.rc_refcount == cright.rc_refcount + adjust && +- ulen < MAXREFCEXTLEN) { ++ if (xfs_refc_want_merge_right(&cright, &right, adjust)) { + *shape_changed = true; + return xfs_refcount_merge_right_extent(cur, &right, &cright, + aglen); diff --git a/queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch b/queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch new file mode 100644 index 00000000000..32c666b9c00 --- /dev/null +++ b/queue-6.1/xfs-invalidate-block-device-page-cache-during-unmount.patch @@ -0,0 +1,71 @@ +From stable+bounces-42903-greg=kroah.com@vger.kernel.org Wed May 1 20:41:59 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:02 -0700 +Subject: xfs: invalidate block device page cache during unmount +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Gao Xiang , Dave Chinner , Leah Rumancik +Message-ID: <20240501184112.3799035-14-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit 032e160305f6872e590c77f11896fb28365c6d6c ] + +Every now and then I see fstests failures on aarch64 (64k pages) that +trigger on the following sequence: + +mkfs.xfs $dev +mount $dev $mnt +touch $mnt/a +umount $mnt +xfs_db -c 'path /a' -c 'print' $dev + +99% of the time this succeeds, but every now and then xfs_db cannot find +/a and fails. This turns out to be a race involving udev/blkid, the +page cache for the block device, and the xfs_db process. + +udev is triggered whenever anyone closes a block device or unmounts it. +The default udev rules invoke blkid to read the fs super and create +symlinks to the bdev under /dev/disk. For this, it uses buffered reads +through the page cache. + +xfs_db also uses buffered reads to examine metadata. There is no +coordination between xfs_db and udev, which means that they can run +concurrently. Note there is no coordination between the kernel and +blkid either. + +On a system with 64k pages, the page cache can cache the superblock and +the root inode (and hence the root dir) with the same 64k page. If +udev spawns blkid after the mkfs and the system is busy enough that it +is still running when xfs_db starts up, they'll both read from the same +page in the pagecache. + +The unmount writes updated inode metadata to disk directly. The XFS +buffer cache does not use the bdev pagecache, nor does it invalidate the +pagecache on umount. If the above scenario occurs, the pagecache no +longer reflects what's on disk, xfs_db reads the stale metadata, and +fails to find /a. Most of the time this succeeds because closing a bdev +invalidates the page cache, but when processes race, everyone loses. + +Fix the problem by invalidating the bdev pagecache after flushing the +bdev, so that xfs_db will see up to date metadata. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Gao Xiang +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_buf.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/xfs/xfs_buf.c ++++ b/fs/xfs/xfs_buf.c +@@ -1945,6 +1945,7 @@ xfs_free_buftarg( + list_lru_destroy(&btp->bt_lru); + + blkdev_issue_flush(btp->bt_bdev); ++ invalidate_bdev(btp->bt_bdev); + fs_put_dax(btp->bt_daxdev, btp->bt_mount); + + kmem_free(btp); diff --git a/queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch b/queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch new file mode 100644 index 00000000000..5fc94e64cb4 --- /dev/null +++ b/queue-6.1/xfs-invalidate-xfs_bufs-when-allocating-cow-extents.patch @@ -0,0 +1,55 @@ +From stable+bounces-42909-greg=kroah.com@vger.kernel.org Wed May 1 20:42:12 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:08 -0700 +Subject: xfs: invalidate xfs_bufs when allocating cow extents +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, "Darrick J. Wong" , Dave Chinner , Leah Rumancik +Message-ID: <20240501184112.3799035-20-leah.rumancik@gmail.com> + +From: "Darrick J. Wong" + +[ Upstream commit ddfdd530e43fcb3f7a0a69966e5f6c33497b4ae3 ] + +While investigating test failures in xfs/17[1-3] in alwayscow mode, I +noticed through code inspection that xfs_bmap_alloc_userdata isn't +setting XFS_ALLOC_USERDATA when allocating extents for a file's CoW +fork. COW staging extents should be flagged as USERDATA, since user +data are persisted to these blocks before being remapped into a file. + +This mis-classification has a few impacts on the behavior of the system. +First, the filestreams allocator is supposed to keep allocating from a +chosen AG until it runs out of space in that AG. However, it only does +that for USERDATA allocations, which means that COW allocations aren't +tied to the filestreams AG. Fortunately, few people use filestreams, so +nobody's noticed. + +A more serious problem is that xfs_alloc_ag_vextent_small looks for a +buffer to invalidate *if* the USERDATA flag is set and the AG is so full +that the allocation had to come from the AGFL because the cntbt is +empty. The consequences of not invalidating the buffer are severe -- +if the AIL incorrectly checkpoints a buffer that is now being used to +store user data, that action will clobber the user's written data. + +Fix filestreams and yet another data corruption vector by flagging COW +allocations as USERDATA. + +Signed-off-by: Darrick J. Wong +Reviewed-by: Dave Chinner +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4058,7 +4058,7 @@ xfs_bmap_alloc_userdata( + * the busy list. 
+ */ + bma->datatype = XFS_ALLOC_NOBUSY; +- if (whichfork == XFS_DATA_FORK) { ++ if (whichfork == XFS_DATA_FORK || whichfork == XFS_COW_FORK) { + bma->datatype |= XFS_ALLOC_USERDATA; + if (bma->offset == 0) + bma->datatype |= XFS_ALLOC_INITIAL_USER_DATA; diff --git a/queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch b/queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch new file mode 100644 index 00000000000..8f55ff6c8f6 --- /dev/null +++ b/queue-6.1/xfs-iomap-move-delalloc-punching-to-iomap.patch @@ -0,0 +1,188 @@ +From stable+bounces-42893-greg=kroah.com@vger.kernel.org Wed May 1 20:41:32 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:52 -0700 +Subject: xfs,iomap: move delalloc punching to iomap +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-4-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 9c7babf94a0d686b552e53aded8d4703d1b8b92b ] + +Because that's what Christoph wants for this error handling path +only XFS uses. + +It requires a new iomap export for handling errors over delalloc +ranges. This is basically the XFS code as is stands, but even though +Christoph wants this as iomap funcitonality, we still have +to call it from the filesystem specific ->iomap_end callback, and +call into the iomap code with yet another filesystem specific +callback to punch the delalloc extent within the defined ranges. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/iomap/buffered-io.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++ + fs/xfs/xfs_iomap.c | 47 ++++++-------------------------------- + include/linux/iomap.h | 4 +++ + 3 files changed, 72 insertions(+), 39 deletions(-) + +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -827,6 +827,66 @@ iomap_file_buffered_write(struct kiocb * + } + EXPORT_SYMBOL_GPL(iomap_file_buffered_write); + ++/* ++ * When a short write occurs, the filesystem may need to remove reserved space ++ * that was allocated in ->iomap_begin from it's ->iomap_end method. For ++ * filesystems that use delayed allocation, we need to punch out delalloc ++ * extents from the range that are not dirty in the page cache. As the write can ++ * race with page faults, there can be dirty pages over the delalloc extent ++ * outside the range of a short write but still within the delalloc extent ++ * allocated for this iomap. ++ * ++ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to ++ * simplify range iterations, but converts them back to {offset,len} tuples for ++ * the punch callback. ++ */ ++int iomap_file_buffered_write_punch_delalloc(struct inode *inode, ++ struct iomap *iomap, loff_t pos, loff_t length, ++ ssize_t written, ++ int (*punch)(struct inode *inode, loff_t pos, loff_t length)) ++{ ++ loff_t start_byte; ++ loff_t end_byte; ++ int blocksize = i_blocksize(inode); ++ int error = 0; ++ ++ if (iomap->type != IOMAP_DELALLOC) ++ return 0; ++ ++ /* If we didn't reserve the blocks, we're not allowed to punch them. */ ++ if (!(iomap->flags & IOMAP_F_NEW)) ++ return 0; ++ ++ /* ++ * start_byte refers to the first unused block after a short write. If ++ * nothing was written, round offset down to point at the first block in ++ * the range. 
++ */ ++ if (unlikely(!written)) ++ start_byte = round_down(pos, blocksize); ++ else ++ start_byte = round_up(pos + written, blocksize); ++ end_byte = round_up(pos + length, blocksize); ++ ++ /* Nothing to do if we've written the entire delalloc extent */ ++ if (start_byte >= end_byte) ++ return 0; ++ ++ /* ++ * Lock the mapping to avoid races with page faults re-instantiating ++ * folios and dirtying them via ->page_mkwrite between the page cache ++ * truncation and the delalloc extent removal. Failing to do this can ++ * leave dirty pages with no space reservation in the cache. ++ */ ++ filemap_invalidate_lock(inode->i_mapping); ++ truncate_pagecache_range(inode, start_byte, end_byte - 1); ++ error = punch(inode, start_byte, end_byte - start_byte); ++ filemap_invalidate_unlock(inode->i_mapping); ++ ++ return error; ++} ++EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); ++ + static loff_t iomap_unshare_iter(struct iomap_iter *iter) + { + struct iomap *iomap = &iter->iomap; +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1123,12 +1123,12 @@ out_unlock: + static int + xfs_buffered_write_delalloc_punch( + struct inode *inode, +- loff_t start_byte, +- loff_t end_byte) ++ loff_t offset, ++ loff_t length) + { + struct xfs_mount *mp = XFS_M(inode->i_sb); +- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); +- xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); ++ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); ++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); + + return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, + end_fsb - start_fsb); +@@ -1143,13 +1143,9 @@ xfs_buffered_write_iomap_end( + unsigned flags, + struct iomap *iomap) + { +- struct xfs_mount *mp = XFS_M(inode->i_sb); +- loff_t start_byte; +- loff_t end_byte; +- int error = 0; + +- if (iomap->type != IOMAP_DELALLOC) +- return 0; ++ struct xfs_mount *mp = XFS_M(inode->i_sb); ++ int error; + + /* + * Behave as if the write failed if drop writes is enabled. Set the NEW +@@ -1160,35 +1156,8 @@ xfs_buffered_write_iomap_end( + written = 0; + } + +- /* If we didn't reserve the blocks, we're not allowed to punch them. */ +- if (!(iomap->flags & IOMAP_F_NEW)) +- return 0; +- +- /* +- * start_fsb refers to the first unused block after a short write. If +- * nothing was written, round offset down to point at the first block in +- * the range. +- */ +- if (unlikely(!written)) +- start_byte = round_down(offset, mp->m_sb.sb_blocksize); +- else +- start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); +- end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); +- +- /* Nothing to do if we've written the entire delalloc extent */ +- if (start_byte >= end_byte) +- return 0; +- +- /* +- * Lock the mapping to avoid races with page faults re-instantiating +- * folios and dirtying them via ->page_mkwrite between the page cache +- * truncation and the delalloc extent removal. Failing to do this can +- * leave dirty pages with no space reservation in the cache. 
+- */ +- filemap_invalidate_lock(inode->i_mapping); +- truncate_pagecache_range(inode, start_byte, end_byte - 1); +- error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); +- filemap_invalidate_unlock(inode->i_mapping); ++ error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, ++ length, written, &xfs_buffered_write_delalloc_punch); + if (error && !xfs_is_shutdown(mp)) { + xfs_alert(mp, "%s: unable to clean up ino 0x%llx", + __func__, XFS_I(inode)->i_ino); +--- a/include/linux/iomap.h ++++ b/include/linux/iomap.h +@@ -226,6 +226,10 @@ static inline const struct iomap *iomap_ + + ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from, + const struct iomap_ops *ops); ++int iomap_file_buffered_write_punch_delalloc(struct inode *inode, ++ struct iomap *iomap, loff_t pos, loff_t length, ssize_t written, ++ int (*punch)(struct inode *inode, loff_t pos, loff_t length)); ++ + int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops); + void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops); + bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count); diff --git a/queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch b/queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch new file mode 100644 index 00000000000..164ca8ddcb8 --- /dev/null +++ b/queue-6.1/xfs-punching-delalloc-extents-on-write-failure-is-racy.patch @@ -0,0 +1,117 @@ +From stable+bounces-42891-greg=kroah.com@vger.kernel.org Wed May 1 20:41:26 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:50 -0700 +Subject: xfs: punching delalloc extents on write failure is racy +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , Christoph Hellwig , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-2-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 198dd8aedee6a7d2de0dfa739f9a008a938f6848 ] + +xfs_buffered_write_iomap_end() has a comment about the safety of +punching delalloc extents based holding the IOLOCK_EXCL. This +comment is wrong, and punching delalloc extents is not race free. + +When we punch out a delalloc extent after a write failure in +xfs_buffered_write_iomap_end(), we punch out the page cache with +truncate_pagecache_range() before we punch out the delalloc extents. +At this point, we only hold the IOLOCK_EXCL, so there is nothing +stopping mmap() write faults racing with this cleanup operation, +reinstantiating a folio over the range we are about to punch and +hence requiring the delalloc extent to be kept. + +If this race condition is hit, we can end up with a dirty page in +the page cache that has no delalloc extent or space reservation +backing it. This leads to bad things happening at writeback time. + +To avoid this race condition, we need the page cache truncation to +be atomic w.r.t. the extent manipulation. We can do this by holding +the mapping->invalidate_lock exclusively across this operation - +this will prevent new pages from being inserted into the page cache +whilst we are removing the pages and the backing extent and space +reservation. + +Taking the mapping->invalidate_lock exclusively in the buffered +write IO path is safe - it naturally nests inside the IOLOCK (see +truncate and fallocate paths). 
iomap_zero_range() can be called from +under the mapping->invalidate_lock (from the truncate path via +either xfs_zero_eof() or xfs_truncate_page(), but iomap_zero_iter() +will not instantiate new delalloc pages (because it skips holes) and +hence will not ever need to punch out delalloc extents on failure. + +Fix the locking issue, and clean up the code logic a little to avoid +unnecessary work if we didn't allocate the delalloc extent or wrote +the entire region we allocated. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 41 +++++++++++++++++++++++------------------ + 1 file changed, 23 insertions(+), 18 deletions(-) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1147,6 +1147,10 @@ xfs_buffered_write_iomap_end( + written = 0; + } + ++ /* If we didn't reserve the blocks, we're not allowed to punch them. */ ++ if (!(iomap->flags & IOMAP_F_NEW)) ++ return 0; ++ + /* + * start_fsb refers to the first unused block after a short write. If + * nothing was written, round offset down to point at the first block in +@@ -1158,27 +1162,28 @@ xfs_buffered_write_iomap_end( + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + ++ /* Nothing to do if we've written the entire delalloc extent */ ++ if (start_fsb >= end_fsb) ++ return 0; ++ + /* +- * Trim delalloc blocks if they were allocated by this write and we +- * didn't manage to write the whole range. +- * +- * We don't need to care about racing delalloc as we hold i_mutex +- * across the reserve/allocate/unreserve calls. If there are delalloc +- * blocks in the range, they are ours. ++ * Lock the mapping to avoid races with page faults re-instantiating ++ * folios and dirtying them via ->page_mkwrite between the page cache ++ * truncation and the delalloc extent removal. Failing to do this can ++ * leave dirty pages with no space reservation in the cache. 
+ */ +- if ((iomap->flags & IOMAP_F_NEW) && start_fsb < end_fsb) { +- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), +- XFS_FSB_TO_B(mp, end_fsb) - 1); +- +- error = xfs_bmap_punch_delalloc_range(ip, start_fsb, +- end_fsb - start_fsb); +- if (error && !xfs_is_shutdown(mp)) { +- xfs_alert(mp, "%s: unable to clean up ino %lld", +- __func__, ip->i_ino); +- return error; +- } +- } ++ filemap_invalidate_lock(inode->i_mapping); ++ truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), ++ XFS_FSB_TO_B(mp, end_fsb) - 1); + ++ error = xfs_bmap_punch_delalloc_range(ip, start_fsb, ++ end_fsb - start_fsb); ++ filemap_invalidate_unlock(inode->i_mapping); ++ if (error && !xfs_is_shutdown(mp)) { ++ xfs_alert(mp, "%s: unable to clean up ino %lld", ++ __func__, ip->i_ino); ++ return error; ++ } + return 0; + } + diff --git a/queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch b/queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch new file mode 100644 index 00000000000..eb1c41c392f --- /dev/null +++ b/queue-6.1/xfs-short-circuit-xfs_growfs_data_private-if-delta-is-zero.patch @@ -0,0 +1,51 @@ +From stable+bounces-42913-greg=kroah.com@vger.kernel.org Wed May 1 20:42:22 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:12 -0700 +Subject: xfs: short circuit xfs_growfs_data_private() if delta is zero +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Eric Sandeen , "Darrick J. Wong" , Chandan Babu R , Leah Rumancik +Message-ID: <20240501184112.3799035-24-leah.rumancik@gmail.com> + +From: Eric Sandeen + +[ Upstream commit 84712492e6dab803bf595fb8494d11098b74a652 ] + +Although xfs_growfs_data() doesn't call xfs_growfs_data_private() +if in->newblocks == mp->m_sb.sb_dblocks, xfs_growfs_data_private() +further massages the new block count so that we don't i.e. try +to create a too-small new AG. + +This may lead to a delta of "0" in xfs_growfs_data_private(), so +we end up in the shrink case and emit the EXPERIMENTAL warning +even if we're not changing anything at all. + +Fix this by returning straightaway if the block delta is zero. + +(nb: in older kernels, the result of entering the shrink case +with delta == 0 may actually let an -ENOSPC escape to userspace, +which is confusing for users.) + +Fixes: fb2fc1720185 ("xfs: support shrinking unused space in the last AG") +Signed-off-by: Eric Sandeen +Reviewed-by: "Darrick J. Wong" +Signed-off-by: Chandan Babu R +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. 
Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_fsops.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/fs/xfs/xfs_fsops.c ++++ b/fs/xfs/xfs_fsops.c +@@ -129,6 +129,10 @@ xfs_growfs_data_private( + if (delta < 0 && nagcount < 2) + return -EINVAL; + ++ /* No work to do */ ++ if (delta == 0) ++ return 0; ++ + oagcount = mp->m_sb.sb_agcount; + /* allocate the new per-ag structures */ + if (nagcount > oagcount) { diff --git a/queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch b/queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch new file mode 100644 index 00000000000..2da1ed768cc --- /dev/null +++ b/queue-6.1/xfs-use-byte-ranges-for-write-cleanup-ranges.patch @@ -0,0 +1,112 @@ +From stable+bounces-42892-greg=kroah.com@vger.kernel.org Wed May 1 20:41:29 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:51 -0700 +Subject: xfs: use byte ranges for write cleanup ranges +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-3-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit b71f889c18ada210a97aa3eb5e00c0de552234c6 ] + +xfs_buffered_write_iomap_end() currently converts the byte ranges +passed to it to filesystem blocks to pass them to the bmap code to +punch out delalloc blocks, but then has to convert filesytem +blocks back to byte ranges for page cache truncate. + +We're about to make the page cache truncate go away and replace it +with a page cache walk, so having to convert everything to/from/to +filesystem blocks is messy and error-prone. It is much easier to +pass around byte ranges and convert to page indexes and/or +filesystem blocks only where those units are needed. + +In preparation for the page cache walk being added, add a helper +that converts byte ranges to filesystem blocks and calls +xfs_bmap_punch_delalloc_range() and convert +xfs_buffered_write_iomap_end() to calculate limits in byte ranges. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_iomap.c | 40 +++++++++++++++++++++++++--------------- + 1 file changed, 25 insertions(+), 15 deletions(-) + +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1121,6 +1121,20 @@ out_unlock: + } + + static int ++xfs_buffered_write_delalloc_punch( ++ struct inode *inode, ++ loff_t start_byte, ++ loff_t end_byte) ++{ ++ struct xfs_mount *mp = XFS_M(inode->i_sb); ++ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); ++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); ++ ++ return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, ++ end_fsb - start_fsb); ++} ++ ++static int + xfs_buffered_write_iomap_end( + struct inode *inode, + loff_t offset, +@@ -1129,10 +1143,9 @@ xfs_buffered_write_iomap_end( + unsigned flags, + struct iomap *iomap) + { +- struct xfs_inode *ip = XFS_I(inode); +- struct xfs_mount *mp = ip->i_mount; +- xfs_fileoff_t start_fsb; +- xfs_fileoff_t end_fsb; ++ struct xfs_mount *mp = XFS_M(inode->i_sb); ++ loff_t start_byte; ++ loff_t end_byte; + int error = 0; + + if (iomap->type != IOMAP_DELALLOC) +@@ -1157,13 +1170,13 @@ xfs_buffered_write_iomap_end( + * the range. 
+ */ + if (unlikely(!written)) +- start_fsb = XFS_B_TO_FSBT(mp, offset); ++ start_byte = round_down(offset, mp->m_sb.sb_blocksize); + else +- start_fsb = XFS_B_TO_FSB(mp, offset + written); +- end_fsb = XFS_B_TO_FSB(mp, offset + length); ++ start_byte = round_up(offset + written, mp->m_sb.sb_blocksize); ++ end_byte = round_up(offset + length, mp->m_sb.sb_blocksize); + + /* Nothing to do if we've written the entire delalloc extent */ +- if (start_fsb >= end_fsb) ++ if (start_byte >= end_byte) + return 0; + + /* +@@ -1173,15 +1186,12 @@ xfs_buffered_write_iomap_end( + * leave dirty pages with no space reservation in the cache. + */ + filemap_invalidate_lock(inode->i_mapping); +- truncate_pagecache_range(VFS_I(ip), XFS_FSB_TO_B(mp, start_fsb), +- XFS_FSB_TO_B(mp, end_fsb) - 1); +- +- error = xfs_bmap_punch_delalloc_range(ip, start_fsb, +- end_fsb - start_fsb); ++ truncate_pagecache_range(inode, start_byte, end_byte - 1); ++ error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte); + filemap_invalidate_unlock(inode->i_mapping); + if (error && !xfs_is_shutdown(mp)) { +- xfs_alert(mp, "%s: unable to clean up ino %lld", +- __func__, ip->i_ino); ++ xfs_alert(mp, "%s: unable to clean up ino 0x%llx", ++ __func__, XFS_I(inode)->i_ino); + return error; + } + return 0; diff --git a/queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch b/queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch new file mode 100644 index 00000000000..2e778b0a00d --- /dev/null +++ b/queue-6.1/xfs-use-iomap_valid-method-to-detect-stale-cached-iomaps.patch @@ -0,0 +1,387 @@ +From stable+bounces-42899-greg=kroah.com@vger.kernel.org Wed May 1 20:41:46 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:56 -0700 +Subject: xfs: use iomap_valid method to detect stale cached iomaps +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-8-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 304a68b9c63bbfc1f6e159d68e8892fc54a06067 ] + +Now that iomap supports a mechanism to validate cached iomaps for +buffered write operations, hook it up to the XFS buffered write ops +so that we can avoid data corruptions that result from stale cached +iomaps. See: + +https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ + +or the ->iomap_valid() introduction commit for exact details of the +corruption vector. + +The validity cookie we store in the iomap is based on the type of +iomap we return. It is expected that the iomap->flags we set in +xfs_bmbt_to_iomap() is not perturbed by the iomap core and are +returned to us in the iomap passed via the .iomap_valid() callback. +This ensures that the validity cookie is always checking the correct +inode fork sequence numbers to detect potential changes that affect +the extent cached by the iomap. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/libxfs/xfs_bmap.c | 6 +- + fs/xfs/xfs_aops.c | 2 + fs/xfs/xfs_iomap.c | 95 +++++++++++++++++++++++++++++++++++++---------- + fs/xfs/xfs_iomap.h | 5 +- + fs/xfs/xfs_pnfs.c | 6 +- + 5 files changed, 87 insertions(+), 27 deletions(-) + +--- a/fs/xfs/libxfs/xfs_bmap.c ++++ b/fs/xfs/libxfs/xfs_bmap.c +@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc( + * the extent. 
Just return the real extent at this offset. + */ + if (!isnullstartblock(bma.got.br_startblock)) { +- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); ++ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, ++ xfs_iomap_inode_sequence(ip, flags)); + *seq = READ_ONCE(ifp->if_seq); + goto out_trans_cancel; + } +@@ -4599,7 +4600,8 @@ xfs_bmapi_convert_delalloc( + XFS_STATS_INC(mp, xs_xstrat_quick); + + ASSERT(!isnullstartblock(bma.got.br_startblock)); +- xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags); ++ xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags, ++ xfs_iomap_inode_sequence(ip, flags)); + *seq = READ_ONCE(ifp->if_seq); + + if (whichfork == XFS_COW_FORK) +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -372,7 +372,7 @@ retry: + isnullstartblock(imap.br_startblock)) + goto allocate_blocks; + +- xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0); ++ xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, XFS_WPC(wpc)->data_seq); + trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap); + return 0; + allocate_blocks: +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -48,13 +48,45 @@ xfs_alert_fsblock_zero( + return -EFSCORRUPTED; + } + ++u64 ++xfs_iomap_inode_sequence( ++ struct xfs_inode *ip, ++ u16 iomap_flags) ++{ ++ u64 cookie = 0; ++ ++ if (iomap_flags & IOMAP_F_XATTR) ++ return READ_ONCE(ip->i_af.if_seq); ++ if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) ++ cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; ++ return cookie | READ_ONCE(ip->i_df.if_seq); ++} ++ ++/* ++ * Check that the iomap passed to us is still valid for the given offset and ++ * length. ++ */ ++static bool ++xfs_iomap_valid( ++ struct inode *inode, ++ const struct iomap *iomap) ++{ ++ return iomap->validity_cookie == ++ xfs_iomap_inode_sequence(XFS_I(inode), iomap->flags); ++} ++ ++const struct iomap_page_ops xfs_iomap_page_ops = { ++ .iomap_valid = xfs_iomap_valid, ++}; ++ + int + xfs_bmbt_to_iomap( + struct xfs_inode *ip, + struct iomap *iomap, + struct xfs_bmbt_irec *imap, + unsigned int mapping_flags, +- u16 iomap_flags) ++ u16 iomap_flags, ++ u64 sequence_cookie) + { + struct xfs_mount *mp = ip->i_mount; + struct xfs_buftarg *target = xfs_inode_buftarg(ip); +@@ -91,6 +123,9 @@ xfs_bmbt_to_iomap( + if (xfs_ipincount(ip) && + (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; ++ ++ iomap->validity_cookie = sequence_cookie; ++ iomap->page_ops = &xfs_iomap_page_ops; + return 0; + } + +@@ -195,7 +230,8 @@ xfs_iomap_write_direct( + xfs_fileoff_t offset_fsb, + xfs_fileoff_t count_fsb, + unsigned int flags, +- struct xfs_bmbt_irec *imap) ++ struct xfs_bmbt_irec *imap, ++ u64 *seq) + { + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; +@@ -285,6 +321,7 @@ xfs_iomap_write_direct( + error = xfs_alert_fsblock_zero(ip, imap); + + out_unlock: ++ *seq = xfs_iomap_inode_sequence(ip, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + +@@ -743,6 +780,7 @@ xfs_direct_write_iomap_begin( + bool shared = false; + u16 iomap_flags = 0; + unsigned int lockmode = XFS_ILOCK_SHARED; ++ u64 seq; + + ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); + +@@ -811,9 +849,10 @@ xfs_direct_write_iomap_begin( + goto out_unlock; + } + ++ seq = xfs_iomap_inode_sequence(ip, iomap_flags); + xfs_iunlock(ip, lockmode); + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); +- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags); ++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); + + allocate_blocks: + error = -EAGAIN; +@@ -839,24 +878,26 @@ 
allocate_blocks: + xfs_iunlock(ip, lockmode); + + error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, +- flags, &imap); ++ flags, &imap, &seq); + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, +- iomap_flags | IOMAP_F_NEW); ++ iomap_flags | IOMAP_F_NEW, seq); + + out_found_cow: +- xfs_iunlock(ip, lockmode); + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + if (imap.br_startblock != HOLESTARTBLOCK) { +- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); ++ seq = xfs_iomap_inode_sequence(ip, 0); ++ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); + if (error) +- return error; ++ goto out_unlock; + } +- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED); ++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); ++ xfs_iunlock(ip, lockmode); ++ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); + + out_unlock: + if (lockmode) +@@ -915,6 +956,7 @@ xfs_buffered_write_iomap_begin( + int allocfork = XFS_DATA_FORK; + int error = 0; + unsigned int lockmode = XFS_ILOCK_EXCL; ++ u64 seq; + + if (xfs_is_shutdown(mp)) + return -EIO; +@@ -1094,26 +1136,31 @@ retry: + * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch + * them out if the write happens to fail. + */ ++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); +- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW); ++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); + + found_imap: ++ seq = xfs_iomap_inode_sequence(ip, 0); + xfs_iunlock(ip, XFS_ILOCK_EXCL); +- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); ++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + + found_cow: +- xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ seq = xfs_iomap_inode_sequence(ip, 0); + if (imap.br_startoff <= offset_fsb) { +- error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0); ++ error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); + if (error) +- return error; ++ goto out_unlock; ++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, +- IOMAP_F_SHARED); ++ IOMAP_F_SHARED, seq); + } + + xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); +- return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0); ++ xfs_iunlock(ip, XFS_ILOCK_EXCL); ++ return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); + + out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); +@@ -1193,6 +1240,7 @@ xfs_read_iomap_begin( + int nimaps = 1, error = 0; + bool shared = false; + unsigned int lockmode = XFS_ILOCK_SHARED; ++ u64 seq; + + ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); + +@@ -1206,13 +1254,14 @@ xfs_read_iomap_begin( + &nimaps, 0); + if (!error && (flags & IOMAP_REPORT)) + error = xfs_reflink_trim_around_shared(ip, &imap, &shared); ++ seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); + xfs_iunlock(ip, lockmode); + + if (error) + return error; + trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); + return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, +- shared ? IOMAP_F_SHARED : 0); ++ shared ? 
IOMAP_F_SHARED : 0, seq); + } + + const struct iomap_ops xfs_read_iomap_ops = { +@@ -1237,6 +1286,7 @@ xfs_seek_iomap_begin( + struct xfs_bmbt_irec imap, cmap; + int error = 0; + unsigned lockmode; ++ u64 seq; + + if (xfs_is_shutdown(mp)) + return -EIO; +@@ -1271,8 +1321,9 @@ xfs_seek_iomap_begin( + if (data_fsb < cow_fsb + cmap.br_blockcount) + end_fsb = min(end_fsb, data_fsb); + xfs_trim_extent(&cmap, offset_fsb, end_fsb); ++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, +- IOMAP_F_SHARED); ++ IOMAP_F_SHARED, seq); + /* + * This is a COW extent, so we must probe the page cache + * because there could be dirty page cache being backed +@@ -1293,8 +1344,9 @@ xfs_seek_iomap_begin( + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + done: ++ seq = xfs_iomap_inode_sequence(ip, 0); + xfs_trim_extent(&imap, offset_fsb, end_fsb); +- error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); ++ error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); + out_unlock: + xfs_iunlock(ip, lockmode); + return error; +@@ -1320,6 +1372,7 @@ xfs_xattr_iomap_begin( + struct xfs_bmbt_irec imap; + int nimaps = 1, error = 0; + unsigned lockmode; ++ int seq; + + if (xfs_is_shutdown(mp)) + return -EIO; +@@ -1336,12 +1389,14 @@ xfs_xattr_iomap_begin( + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ATTRFORK); + out_unlock: ++ ++ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); + xfs_iunlock(ip, lockmode); + + if (error) + return error; + ASSERT(nimaps); +- return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0); ++ return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); + } + + const struct iomap_ops xfs_xattr_iomap_ops = { +--- a/fs/xfs/xfs_iomap.h ++++ b/fs/xfs/xfs_iomap.h +@@ -13,14 +13,15 @@ struct xfs_bmbt_irec; + + int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, + xfs_fileoff_t count_fsb, unsigned int flags, +- struct xfs_bmbt_irec *imap); ++ struct xfs_bmbt_irec *imap, u64 *sequence); + int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t, bool); + xfs_fileoff_t xfs_iomap_eof_align_last_fsb(struct xfs_inode *ip, + xfs_fileoff_t end_fsb); + ++u64 xfs_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags); + int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, + struct xfs_bmbt_irec *imap, unsigned int mapping_flags, +- u16 iomap_flags); ++ u16 iomap_flags, u64 sequence_cookie); + + int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, + bool *did_zero); +--- a/fs/xfs/xfs_pnfs.c ++++ b/fs/xfs/xfs_pnfs.c +@@ -125,6 +125,7 @@ xfs_fs_map_blocks( + int nimaps = 1; + uint lock_flags; + int error = 0; ++ u64 seq; + + if (xfs_is_shutdown(mp)) + return -EIO; +@@ -176,6 +177,7 @@ xfs_fs_map_blocks( + lock_flags = xfs_ilock_data_map_shared(ip); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, bmapi_flags); ++ seq = xfs_iomap_inode_sequence(ip, 0); + + ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK); + +@@ -189,7 +191,7 @@ xfs_fs_map_blocks( + xfs_iunlock(ip, lock_flags); + + error = xfs_iomap_write_direct(ip, offset_fsb, +- end_fsb - offset_fsb, 0, &imap); ++ end_fsb - offset_fsb, 0, &imap, &seq); + if (error) + goto out_unlock; + +@@ -209,7 +211,7 @@ xfs_fs_map_blocks( + } + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + +- error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0); ++ error = xfs_bmbt_to_iomap(ip, iomap, &imap, 0, 0, seq); + *device_generation = mp->m_generation; + return 
error; + out_unlock: diff --git a/queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch b/queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch new file mode 100644 index 00000000000..eefa8904233 --- /dev/null +++ b/queue-6.1/xfs-wait-iclog-complete-before-tearing-down-ail.patch @@ -0,0 +1,185 @@ +From stable+bounces-42905-greg=kroah.com@vger.kernel.org Wed May 1 20:42:02 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:41:04 -0700 +Subject: xfs: wait iclog complete before tearing down AIL +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Guo Xuenan , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-16-leah.rumancik@gmail.com> + +From: Guo Xuenan + +[ Upstream commit 1eb52a6a71981b80f9acbd915acd6a05a5037196 ] + +Fix uaf in xfs_trans_ail_delete during xlog force shutdown. +In commit cd6f79d1fb32 ("xfs: run callbacks before waking waiters in +xlog_state_shutdown_callbacks") changed the order of running callbacks +and wait for iclog completion to avoid unmount path untimely destroy AIL. +But which seems not enough to ensue this, adding mdelay in +`xfs_buf_item_unpin` can prove that. + +The reproduction is as follows. To ensure destroy AIL safely, +we should wait all xlog ioend workers done and sync the AIL. + +================================================================== +BUG: KASAN: use-after-free in xfs_trans_ail_delete+0x240/0x2a0 +Read of size 8 at addr ffff888023169400 by task kworker/1:1H/43 + +CPU: 1 PID: 43 Comm: kworker/1:1H Tainted: G W +6.1.0-rc1-00002-gc28266863c4a #137 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS +1.13.0-1ubuntu1.1 04/01/2014 +Workqueue: xfs-log/sda xlog_ioend_work +Call Trace: + + dump_stack_lvl+0x4d/0x66 + print_report+0x171/0x4a6 + kasan_report+0xb3/0x130 + xfs_trans_ail_delete+0x240/0x2a0 + xfs_buf_item_done+0x7b/0xa0 + xfs_buf_ioend+0x1e9/0x11f0 + xfs_buf_item_unpin+0x4c8/0x860 + xfs_trans_committed_bulk+0x4c2/0x7c0 + xlog_cil_committed+0xab6/0xfb0 + xlog_cil_process_committed+0x117/0x1e0 + xlog_state_shutdown_callbacks+0x208/0x440 + xlog_force_shutdown+0x1b3/0x3a0 + xlog_ioend_work+0xef/0x1d0 + process_one_work+0x6f9/0xf70 + worker_thread+0x578/0xf30 + kthread+0x28c/0x330 + ret_from_fork+0x1f/0x30 + + +Allocated by task 9606: + kasan_save_stack+0x1e/0x40 + kasan_set_track+0x21/0x30 + __kasan_kmalloc+0x7a/0x90 + __kmalloc+0x59/0x140 + kmem_alloc+0xb2/0x2f0 + xfs_trans_ail_init+0x20/0x320 + xfs_log_mount+0x37e/0x690 + xfs_mountfs+0xe36/0x1b40 + xfs_fs_fill_super+0xc5c/0x1a70 + get_tree_bdev+0x3c5/0x6c0 + vfs_get_tree+0x85/0x250 + path_mount+0xec3/0x1830 + do_mount+0xef/0x110 + __x64_sys_mount+0x150/0x1f0 + do_syscall_64+0x35/0x80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +Freed by task 9662: + kasan_save_stack+0x1e/0x40 + kasan_set_track+0x21/0x30 + kasan_save_free_info+0x2a/0x40 + __kasan_slab_free+0x105/0x1a0 + __kmem_cache_free+0x99/0x2d0 + kvfree+0x3a/0x40 + xfs_log_unmount+0x60/0xf0 + xfs_unmountfs+0xf3/0x1d0 + xfs_fs_put_super+0x78/0x300 + generic_shutdown_super+0x151/0x400 + kill_block_super+0x9a/0xe0 + deactivate_locked_super+0x82/0xe0 + deactivate_super+0x91/0xb0 + cleanup_mnt+0x32a/0x4a0 + task_work_run+0x15f/0x240 + exit_to_user_mode_prepare+0x188/0x190 + syscall_exit_to_user_mode+0x12/0x30 + do_syscall_64+0x42/0x80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +The buggy address belongs to the object at ffff888023169400 + which belongs to the cache kmalloc-128 of size 128 +The buggy address is located 0 
bytes inside of + 128-byte region [ffff888023169400, ffff888023169480) + +The buggy address belongs to the physical page: +page:ffffea00008c5a00 refcount:1 mapcount:0 mapping:0000000000000000 +index:0xffff888023168f80 pfn:0x23168 +head:ffffea00008c5a00 order:1 compound_mapcount:0 compound_pincount:0 +flags: 0x1fffff80010200(slab|head|node=0|zone=1|lastcpupid=0x1fffff) +raw: 001fffff80010200 ffffea00006b3988 ffffea0000577a88 ffff88800f842ac0 +raw: ffff888023168f80 0000000000150007 00000001ffffffff 0000000000000000 +page dumped because: kasan: bad access detected + +Memory state around the buggy address: + ffff888023169300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff888023169380: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +>ffff888023169400: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb + ^ + ffff888023169480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc + ffff888023169500: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +================================================================== +Disabling lock debugging due to kernel taint + +Fixes: cd6f79d1fb32 ("xfs: run callbacks before waking waiters in xlog_state_shutdown_callbacks") +Signed-off-by: Guo Xuenan +Reviewed-by: Darrick J. Wong +Signed-off-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_log.c | 36 +++++++++++++++++++++++++----------- + 1 file changed, 25 insertions(+), 11 deletions(-) + +--- a/fs/xfs/xfs_log.c ++++ b/fs/xfs/xfs_log.c +@@ -887,6 +887,23 @@ xlog_force_iclog( + } + + /* ++ * Cycle all the iclogbuf locks to make sure all log IO completion ++ * is done before we tear down these buffers. ++ */ ++static void ++xlog_wait_iclog_completion(struct xlog *log) ++{ ++ int i; ++ struct xlog_in_core *iclog = log->l_iclog; ++ ++ for (i = 0; i < log->l_iclog_bufs; i++) { ++ down(&iclog->ic_sema); ++ up(&iclog->ic_sema); ++ iclog = iclog->ic_next; ++ } ++} ++ ++/* + * Wait for the iclog and all prior iclogs to be written disk as required by the + * log force state machine. Waiting on ic_force_wait ensures iclog completions + * have been ordered and callbacks run before we are woken here, hence +@@ -1111,6 +1128,14 @@ xfs_log_unmount( + { + xfs_log_clean(mp); + ++ /* ++ * If shutdown has come from iclog IO context, the log ++ * cleaning will have been skipped and so we need to wait ++ * for the iclog to complete shutdown processing before we ++ * tear anything down. ++ */ ++ xlog_wait_iclog_completion(mp->m_log); ++ + xfs_buftarg_drain(mp->m_ddev_targp); + + xfs_trans_ail_destroy(mp); +@@ -2114,17 +2139,6 @@ xlog_dealloc_log( + int i; + + /* +- * Cycle all the iclogbuf locks to make sure all log IO completion +- * is done before we tear down these buffers. +- */ +- iclog = log->l_iclog; +- for (i = 0; i < log->l_iclog_bufs; i++) { +- down(&iclog->ic_sema); +- up(&iclog->ic_sema); +- iclog = iclog->ic_next; +- } +- +- /* + * Destroy the CIL after waiting for iclog IO completion because an + * iclog EIO error will try to shut down the log, which accesses the + * CIL to wake up the waiters. 
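
The semaphore cycling that xlog_wait_iclog_completion() hoists into xfs_log_unmount() above is the usual "take and immediately drop each lock" drain idiom: unmount cannot get past an iclog whose I/O completion still holds ic_sema, so once the loop finishes no completion work is left running. A minimal userspace sketch of that idiom follows; it is an illustration only, not kernel code — the names (buf_sema, completion_worker, NBUFS) are made up and POSIX semaphores/threads stand in for the kernel's struct semaphore and workqueue context.

/*
 * Toy model of the drain idiom: each worker holds a per-buffer semaphore
 * while its "completion" runs; teardown cycles every semaphore, which can
 * only succeed after the holder has posted it.
 */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define NBUFS 4

static sem_t buf_sema[NBUFS];
static pthread_barrier_t started;

static void *completion_worker(void *arg)
{
	long i = (long)arg;

	sem_wait(&buf_sema[i]);		/* completion running, buffer held */
	pthread_barrier_wait(&started);	/* toy only: tell main we hold it */
	usleep(100 * 1000);		/* pretend to run completion callbacks */
	sem_post(&buf_sema[i]);		/* completion finished */
	return NULL;
}

int main(void)
{
	pthread_t tid[NBUFS];
	long i;

	pthread_barrier_init(&started, NULL, NBUFS + 1);
	for (i = 0; i < NBUFS; i++) {
		sem_init(&buf_sema[i], 0, 1);
		pthread_create(&tid[i], NULL, completion_worker, (void *)i);
	}
	pthread_barrier_wait(&started);

	/*
	 * Teardown side: cycle every semaphore.  Each sem_wait() returns only
	 * after the corresponding completion has posted, so past this loop no
	 * completion is still in flight and teardown may proceed safely.
	 */
	for (i = 0; i < NBUFS; i++) {
		sem_wait(&buf_sema[i]);
		sem_post(&buf_sema[i]);
	}
	printf("all in-flight completions drained\n");

	for (i = 0; i < NBUFS; i++)
		pthread_join(tid[i], NULL);
	pthread_barrier_destroy(&started);
	return 0;
}

The barrier only makes the toy deterministic; in the real code the iclog I/O is already in flight before unmount reaches this point, which is exactly why the wait has to happen before the AIL is torn down.
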
diff --git a/queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch b/queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch new file mode 100644 index 00000000000..c1722703b88 --- /dev/null +++ b/queue-6.1/xfs-write-page-faults-in-iomap-are-not-buffered-writes.patch @@ -0,0 +1,134 @@ +From stable+bounces-42890-greg=kroah.com@vger.kernel.org Wed May 1 20:41:25 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:49 -0700 +Subject: xfs: write page faults in iomap are not buffered writes +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , Christoph Hellwig , "Darrick J . Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-1-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 118e021b4b66f758f8e8f21dc0e5e0a4c721e69e ] + +When we reserve a delalloc region in xfs_buffered_write_iomap_begin, +we mark the iomap as IOMAP_F_NEW so that the the write context +understands that it allocated the delalloc region. + +If we then fail that buffered write, xfs_buffered_write_iomap_end() +checks for the IOMAP_F_NEW flag and if it is set, it punches out +the unused delalloc region that was allocated for the write. + +The assumption this code makes is that all buffered write operations +that can allocate space are run under an exclusive lock (i_rwsem). +This is an invalid assumption: page faults in mmap()d regions call +through this same function pair to map the file range being faulted +and this runs only holding the inode->i_mapping->invalidate_lock in +shared mode. + +IOWs, we can have races between page faults and write() calls that +fail the nested page cache write operation that result in data loss. +That is, the failing iomap_end call will punch out the data that +the other racing iomap iteration brought into the page cache. This +can be reproduced with generic/34[46] if we arbitrarily fail page +cache copy-in operations from write() syscalls. + +Code analysis tells us that the iomap_page_mkwrite() function holds +the already instantiated and uptodate folio locked across the iomap +mapping iterations. Hence the folio cannot be removed from memory +whilst we are mapping the range it covers, and as such we do not +care if the mapping changes state underneath the iomap iteration +loop: + +1. if the folio is not already dirty, there is no writeback races + possible. +2. if we allocated the mapping (delalloc or unwritten), the folio + cannot already be dirty. See #1. +3. If the folio is already dirty, it must be up to date. As we hold + it locked, it cannot be reclaimed from memory. Hence we always + have valid data in the page cache while iterating the mapping. +4. Valid data in the page cache can exist when the underlying + mapping is DELALLOC, UNWRITTEN or WRITTEN. Having the mapping + change from DELALLOC->UNWRITTEN or UNWRITTEN->WRITTEN does not + change the data in the page - it only affects actions if we are + initialising a new page. Hence #3 applies and we don't care + about these extent map transitions racing with + iomap_page_mkwrite(). +5. iomap_page_mkwrite() checks for page invalidation races + (truncate, hole punch, etc) after it locks the folio. We also + hold the mapping->invalidation_lock here, and hence the mapping + cannot change due to extent removal operations while we are + iterating the folio. 
+ +As such, filesystems that don't use bufferheads will never fail +the iomap_folio_mkwrite_iter() operation on the current mapping, +regardless of whether the iomap should be considered stale. + +Further, the range we are asked to iterate is limited to the range +inside EOF that the folio spans. Hence, for XFS, we will only map +the exact range we are asked for, and we will only do speculative +preallocation with delalloc if we are mapping a hole at the EOF +page. The iterator will consume the entire range of the folio that +is within EOF, and anything beyond the EOF block cannot be accessed. +We never need to truncate this post-EOF speculative prealloc away in +the context of the iomap_page_mkwrite() iterator because if it +remains unused we'll remove it when the last reference to the inode +goes away. + +Hence we don't actually need an .iomap_end() cleanup/error handling +path at all for iomap_page_mkwrite() for XFS. This means we can +separate the page fault processing from the complexity of the +.iomap_end() processing in the buffered write path. This also means +that the buffered write path will also be able to take the +mapping->invalidate_lock as necessary. + +Signed-off-by: Dave Chinner +Reviewed-by: Christoph Hellwig +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_file.c | 2 +- + fs/xfs/xfs_iomap.c | 9 +++++++++ + fs/xfs/xfs_iomap.h | 1 + + 3 files changed, 11 insertions(+), 1 deletion(-) + +--- a/fs/xfs/xfs_file.c ++++ b/fs/xfs/xfs_file.c +@@ -1325,7 +1325,7 @@ __xfs_filemap_fault( + if (write_fault) { + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + ret = iomap_page_mkwrite(vmf, +- &xfs_buffered_write_iomap_ops); ++ &xfs_page_mkwrite_iomap_ops); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else { + ret = filemap_fault(vmf); +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1187,6 +1187,15 @@ const struct iomap_ops xfs_buffered_writ + .iomap_end = xfs_buffered_write_iomap_end, + }; + ++/* ++ * iomap_page_mkwrite() will never fail in a way that requires delalloc extents ++ * that it allocated to be revoked. Hence we do not need an .iomap_end method ++ * for this operation. ++ */ ++const struct iomap_ops xfs_page_mkwrite_iomap_ops = { ++ .iomap_begin = xfs_buffered_write_iomap_begin, ++}; ++ + static int + xfs_read_iomap_begin( + struct inode *inode, +--- a/fs/xfs/xfs_iomap.h ++++ b/fs/xfs/xfs_iomap.h +@@ -47,6 +47,7 @@ xfs_aligned_fsb_count( + } + + extern const struct iomap_ops xfs_buffered_write_iomap_ops; ++extern const struct iomap_ops xfs_page_mkwrite_iomap_ops; + extern const struct iomap_ops xfs_direct_write_iomap_ops; + extern const struct iomap_ops xfs_read_iomap_ops; + extern const struct iomap_ops xfs_seek_iomap_ops; diff --git a/queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch b/queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch new file mode 100644 index 00000000000..3fc293326e5 --- /dev/null +++ b/queue-6.1/xfs-xfs_bmap_punch_delalloc_range-should-take-a-byte-range.patch @@ -0,0 +1,126 @@ +From stable+bounces-42895-greg=kroah.com@vger.kernel.org Wed May 1 20:41:37 2024 +From: Leah Rumancik +Date: Wed, 1 May 2024 11:40:54 -0700 +Subject: xfs: xfs_bmap_punch_delalloc_range() should take a byte range +To: stable@vger.kernel.org +Cc: linux-xfs@vger.kernel.org, amir73il@gmail.com, chandan.babu@oracle.com, fred@cloudflare.com, Dave Chinner , "Darrick J . 
Wong" , Leah Rumancik +Message-ID: <20240501184112.3799035-6-leah.rumancik@gmail.com> + +From: Dave Chinner + +[ Upstream commit 7348b322332d8602a4133f0b861334ea021b134a ] + +All the callers of xfs_bmap_punch_delalloc_range() jump through +hoops to convert a byte range to filesystem blocks before calling +xfs_bmap_punch_delalloc_range(). Instead, pass the byte range to +xfs_bmap_punch_delalloc_range() and have it do the conversion to +filesystem blocks internally. + +Signed-off-by: Dave Chinner +Reviewed-by: Darrick J. Wong +Signed-off-by: Leah Rumancik +Acked-by: Darrick J. Wong +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/xfs_aops.c | 16 ++++++---------- + fs/xfs/xfs_bmap_util.c | 10 ++++++---- + fs/xfs/xfs_bmap_util.h | 2 +- + fs/xfs/xfs_iomap.c | 8 ++------ + 4 files changed, 15 insertions(+), 21 deletions(-) + +--- a/fs/xfs/xfs_aops.c ++++ b/fs/xfs/xfs_aops.c +@@ -114,9 +114,8 @@ xfs_end_ioend( + if (unlikely(error)) { + if (ioend->io_flags & IOMAP_F_SHARED) { + xfs_reflink_cancel_cow_range(ip, offset, size, true); +- xfs_bmap_punch_delalloc_range(ip, +- XFS_B_TO_FSBT(mp, offset), +- XFS_B_TO_FSB(mp, size)); ++ xfs_bmap_punch_delalloc_range(ip, offset, ++ offset + size); + } + goto done; + } +@@ -455,12 +454,8 @@ xfs_discard_folio( + struct folio *folio, + loff_t pos) + { +- struct inode *inode = folio->mapping->host; +- struct xfs_inode *ip = XFS_I(inode); ++ struct xfs_inode *ip = XFS_I(folio->mapping->host); + struct xfs_mount *mp = ip->i_mount; +- size_t offset = offset_in_folio(folio, pos); +- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, pos); +- xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, offset); + int error; + + if (xfs_is_shutdown(mp)) +@@ -470,8 +465,9 @@ xfs_discard_folio( + "page discard on page "PTR_FMT", inode 0x%llx, pos %llu.", + folio, ip->i_ino, pos); + +- error = xfs_bmap_punch_delalloc_range(ip, start_fsb, +- i_blocks_per_folio(inode, folio) - pageoff_fsb); ++ error = xfs_bmap_punch_delalloc_range(ip, pos, ++ round_up(pos, folio_size(folio))); ++ + if (error && !xfs_is_shutdown(mp)) + xfs_alert(mp, "page discard unable to remove delalloc mapping."); + } +--- a/fs/xfs/xfs_bmap_util.c ++++ b/fs/xfs/xfs_bmap_util.c +@@ -590,11 +590,13 @@ out_unlock_iolock: + int + xfs_bmap_punch_delalloc_range( + struct xfs_inode *ip, +- xfs_fileoff_t start_fsb, +- xfs_fileoff_t length) ++ xfs_off_t start_byte, ++ xfs_off_t end_byte) + { ++ struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = &ip->i_df; +- xfs_fileoff_t end_fsb = start_fsb + length; ++ xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, start_byte); ++ xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, end_byte); + struct xfs_bmbt_irec got, del; + struct xfs_iext_cursor icur; + int error = 0; +@@ -607,7 +609,7 @@ xfs_bmap_punch_delalloc_range( + + while (got.br_startoff + got.br_blockcount > start_fsb) { + del = got; +- xfs_trim_extent(&del, start_fsb, length); ++ xfs_trim_extent(&del, start_fsb, end_fsb - start_fsb); + + /* + * A delete can push the cursor forward. 
Step back to the +--- a/fs/xfs/xfs_bmap_util.h ++++ b/fs/xfs/xfs_bmap_util.h +@@ -31,7 +31,7 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap + #endif /* CONFIG_XFS_RT */ + + int xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, +- xfs_fileoff_t start_fsb, xfs_fileoff_t length); ++ xfs_off_t start_byte, xfs_off_t end_byte); + + struct kgetbmap { + __s64 bmv_offset; /* file offset of segment in blocks */ +--- a/fs/xfs/xfs_iomap.c ++++ b/fs/xfs/xfs_iomap.c +@@ -1126,12 +1126,8 @@ xfs_buffered_write_delalloc_punch( + loff_t offset, + loff_t length) + { +- struct xfs_mount *mp = XFS_M(inode->i_sb); +- xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, offset); +- xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); +- +- return xfs_bmap_punch_delalloc_range(XFS_I(inode), start_fsb, +- end_fsb - start_fsb); ++ return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, ++ offset + length); + } + + static int -- 2.47.3