git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.15
author     Sasha Levin <sashal@kernel.org>
           Sun, 15 Jun 2025 22:47:44 +0000 (18:47 -0400)
committer  Sasha Levin <sashal@kernel.org>
           Sun, 15 Jun 2025 22:47:44 +0000 (18:47 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
16 files changed:
queue-6.15/bio-fix-bio_first_folio-for-sparsemem-without-vmemma.patch [new file with mode: 0644]
queue-6.15/block-don-t-use-submit_bio_noacct_nocheck-in-blk_zon.patch [new file with mode: 0644]
queue-6.15/block-fix-bvec_set_folio-for-very-large-folios.patch [new file with mode: 0644]
queue-6.15/block-use-q-elevator-with-elevator_lock-held-in-elv_.patch [new file with mode: 0644]
queue-6.15/btrfs-exit-after-state-insertion-failure-at-btrfs_co.patch [new file with mode: 0644]
queue-6.15/btrfs-exit-after-state-split-error-at-set_extent_bit.patch [new file with mode: 0644]
queue-6.15/btrfs-fix-fsync-of-files-with-no-hard-links-not-pers.patch [new file with mode: 0644]
queue-6.15/fs-filesystems-fix-potential-unsigned-integer-underf.patch [new file with mode: 0644]
queue-6.15/gfs2-pass-through-holder-from-the-vfs-for-freeze-tha.patch [new file with mode: 0644]
queue-6.15/io_uring-consistently-use-rcu-semantics-with-sqpoll-.patch [new file with mode: 0644]
queue-6.15/io_uring-fix-spurious-drain-flushing.patch [new file with mode: 0644]
queue-6.15/io_uring-fix-use-after-free-of-sq-thread-in-__io_uri.patch [new file with mode: 0644]
queue-6.15/nvmet-fcloop-access-fcpreq-only-when-holding-reqlock.patch [new file with mode: 0644]
queue-6.15/perf-ensure-bpf_perf_link-path-is-properly-serialize.patch [new file with mode: 0644]
queue-6.15/series
queue-6.15/smb-client-fix-perf-regression-with-deferred-closes.patch [new file with mode: 0644]

diff --git a/queue-6.15/bio-fix-bio_first_folio-for-sparsemem-without-vmemma.patch b/queue-6.15/bio-fix-bio_first_folio-for-sparsemem-without-vmemma.patch
new file mode 100644 (file)
index 0000000..2e13cf8
--- /dev/null
@@ -0,0 +1,39 @@
+From ebfba348111722463033598894a92ac6f0cdd202 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Jun 2025 15:41:25 +0100
+Subject: bio: Fix bio_first_folio() for SPARSEMEM without VMEMMAP
+
+From: Matthew Wilcox (Oracle) <willy@infradead.org>
+
+[ Upstream commit f826ec7966a63d48e16e0868af4e038bf9a1a3ae ]
+
+It is possible for physically contiguous folios to have discontiguous
+struct pages if SPARSEMEM is enabled and SPARSEMEM_VMEMMAP is not.
+This is correctly handled by folio_page_idx(), so remove this open-coded
+implementation.
+
+Fixes: 640d1930bef4 ("block: Add bio_for_each_folio_all()")
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Link: https://lore.kernel.org/r/20250612144126.2849931-1-willy@infradead.org
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bio.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/linux/bio.h b/include/linux/bio.h
+index b786ec5bcc81d..b474a47ec7eef 100644
+--- a/include/linux/bio.h
++++ b/include/linux/bio.h
+@@ -291,7 +291,7 @@ static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio,
+       fi->folio = page_folio(bvec->bv_page);
+       fi->offset = bvec->bv_offset +
+-                      PAGE_SIZE * (bvec->bv_page - &fi->folio->page);
++                      PAGE_SIZE * folio_page_idx(fi->folio, bvec->bv_page);
+       fi->_seg_count = bvec->bv_len;
+       fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count);
+       fi->_next = folio_next(fi->folio);
+-- 
+2.39.5
+
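The failure mode fixed above is easy to reproduce in plain C: when the struct page metadata for one folio lives in two separate arrays, pointer subtraction no longer yields the page index, while a PFN-based computation like folio_page_idx() still does. A minimal userspace sketch (all types and helpers below are hypothetical stand-ins for the kernel's, assuming an LP64 build):

    #include <stdint.h>
    #include <stdio.h>

    struct page { unsigned long pfn; };     /* stand-in for struct page */

    /* Physically contiguous PFNs whose metadata is split across two
     * arrays, as with SPARSEMEM when SPARSEMEM_VMEMMAP is disabled. */
    static struct page section_a[4];
    static struct page section_b[4];

    /* folio_page_idx()-style computation: the PFN difference is always
     * correct regardless of where the metadata lives. */
    static unsigned long pfn_idx(const struct page *head, const struct page *p)
    {
            return p->pfn - head->pfn;
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++) {
                    section_a[i].pfn = 100 + i;     /* folio covers PFNs */
                    section_b[i].pfn = 104 + i;     /* 100..107          */
            }
            const struct page *head = &section_a[0];
            const struct page *pg = &section_b[1];  /* PFN 105, index 5 */

            /* The removed open-coded form, 'page - &folio->page': only
             * valid if all struct pages form one contiguous array. */
            printf("pointer idx: %lu (meaningless across arrays)\n",
                   (unsigned long)(((uintptr_t)pg - (uintptr_t)head) /
                                   sizeof(struct page)));
            printf("pfn idx    : %lu\n", pfn_idx(head, pg));   /* 5 */
            return 0;
    }
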
diff --git a/queue-6.15/block-don-t-use-submit_bio_noacct_nocheck-in-blk_zon.patch b/queue-6.15/block-don-t-use-submit_bio_noacct_nocheck-in-blk_zon.patch
new file mode 100644 (file)
index 0000000..f3288ac
--- /dev/null
@@ -0,0 +1,61 @@
+From fd690870c62846af88fc1c37bfaeee55e262a3fb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 11 Jun 2025 06:44:16 +0200
+Subject: block: don't use submit_bio_noacct_nocheck in blk_zone_wplug_bio_work
+
+From: Christoph Hellwig <hch@lst.de>
+
+[ Upstream commit cf625013d8741c01407bbb4a60c111b61b9fa69d ]
+
+Bios queued up in the zone write plug have already gone through all
+preparation in the submit_bio path, including the freeze protection.
+
+Submitting them through submit_bio_noacct_nocheck duplicates the work
+and can cause deadlocks when freezing a queue with pending bio
+write plugs.
+
+Go straight to ->submit_bio or blk_mq_submit_bio to bypass the
+superfluous extra freeze protection and checks.
+
+Fixes: 9b1ce7f0c6f8 ("block: Implement zone append emulation")
+Reported-by: Bart Van Assche <bvanassche@acm.org>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Tested-by: Damien Le Moal <dlemoal@kernel.org>
+Link: https://lore.kernel.org/r/20250611044416.2351850-1-hch@lst.de
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ block/blk-zoned.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/block/blk-zoned.c b/block/blk-zoned.c
+index 8f15d1aa6eb89..45c91016cef38 100644
+--- a/block/blk-zoned.c
++++ b/block/blk-zoned.c
+@@ -1306,7 +1306,6 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
+       spin_unlock_irqrestore(&zwplug->lock, flags);
+       bdev = bio->bi_bdev;
+-      submit_bio_noacct_nocheck(bio);
+       /*
+        * blk-mq devices will reuse the extra reference on the request queue
+@@ -1314,8 +1313,12 @@ static void blk_zone_wplug_bio_work(struct work_struct *work)
+        * path for BIO-based devices will not do that. So drop this extra
+        * reference here.
+        */
+-      if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO))
++      if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
++              bdev->bd_disk->fops->submit_bio(bio);
+               blk_queue_exit(bdev->bd_disk->queue);
++      } else {
++              blk_mq_submit_bio(bio);
++      }
+ put_zwplug:
+       /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
+-- 
+2.39.5
+
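The deadlock mechanism here is generic: the plugged bios already hold queue-freeze protection, and re-entering the full submit path tries to acquire it again, which blocks forever once a freeze is pending. A toy model of that gate (illustrative names only; the real code uses the percpu q_usage_counter, not a flag):

    #include <stdbool.h>
    #include <stdio.h>

    static int entered;            /* submitters inside the gate */
    static bool freeze_pending;    /* freezer waiting for entered == 0 */

    static bool try_enter(void)
    {
            if (freeze_pending)
                    return false;  /* real code would sleep until thaw */
            entered++;
            return true;
    }

    int main(void)
    {
            try_enter();           /* original submit_bio(): in the gate */
            freeze_pending = true; /* freeze starts, waits for entered==0 */

            /* Re-entering the checked path from the plug work item: */
            if (!try_enter())
                    printf("deadlock: freeze waits on us, we wait on thaw\n");

            /* The fix: dispatch directly, reusing the protection held. */
            printf("go straight to ->submit_bio()/blk_mq_submit_bio()\n");
            return 0;
    }
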
diff --git a/queue-6.15/block-fix-bvec_set_folio-for-very-large-folios.patch b/queue-6.15/block-fix-bvec_set_folio-for-very-large-folios.patch
new file mode 100644 (file)
index 0000000..b2088a9
--- /dev/null
@@ -0,0 +1,46 @@
+From abcf568604922548b04d4aec04a121898c367682 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Jun 2025 15:42:53 +0100
+Subject: block: Fix bvec_set_folio() for very large folios
+
+From: Matthew Wilcox (Oracle) <willy@infradead.org>
+
+[ Upstream commit 5e223e06ee7c6d8f630041a0645ac90e39a42cc6 ]
+
+Similarly to 26064d3e2b4d ("block: fix adding folio to bio"), if
+we attempt to add a folio that is larger than 4GB, we'll silently
+truncate the offset and len.  Widen the parameters to size_t, assert
+that the length is less than 4GB and set the first page that contains
+the interesting data rather than the first page of the folio.
+
+Fixes: 26db5ee15851 ("block: add a bvec_set_folio helper")
+Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
+Link: https://lore.kernel.org/r/20250612144255.2850278-1-willy@infradead.org
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bvec.h | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/include/linux/bvec.h b/include/linux/bvec.h
+index 204b22a99c4ba..0a80e1f9aa201 100644
+--- a/include/linux/bvec.h
++++ b/include/linux/bvec.h
+@@ -57,9 +57,12 @@ static inline void bvec_set_page(struct bio_vec *bv, struct page *page,
+  * @offset:   offset into the folio
+  */
+ static inline void bvec_set_folio(struct bio_vec *bv, struct folio *folio,
+-              unsigned int len, unsigned int offset)
++              size_t len, size_t offset)
+ {
+-      bvec_set_page(bv, &folio->page, len, offset);
++      unsigned long nr = offset / PAGE_SIZE;
++
++      WARN_ON_ONCE(len > UINT_MAX);
++      bvec_set_page(bv, folio_page(folio, nr), len, offset % PAGE_SIZE);
+ }
+ /**
+-- 
+2.39.5
+
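The arithmetic in this fix is worth seeing in isolation: a size_t byte offset into a huge folio splits into a page index plus an in-page offset, whereas the old unsigned int parameters silently truncated anything at or above 4GB. A standalone sketch (PAGE_SIZE and the names are local assumptions, not the kernel definitions; assumes a 64-bit size_t):

    #include <stdint.h>
    #include <stddef.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    int main(void)
    {
            /* A 6 GiB offset into a (hypothetically) very large folio. */
            size_t offset = 6UL << 30;
            size_t len = 512;

            /* Old signature took unsigned int: 6 GiB wraps to 2 GiB. */
            unsigned int truncated = (unsigned int)offset;

            unsigned long page_idx = offset / PAGE_SIZE;  /* which page  */
            size_t in_page = offset % PAGE_SIZE;          /* offset in it */

            if (len > UINT32_MAX)                 /* WARN_ON_ONCE analogue */
                    fprintf(stderr, "len too large for a bvec\n");

            printf("truncated offset: %u\n", truncated);
            printf("page index %lu, in-page offset %zu\n", page_idx, in_page);
            return 0;
    }
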
diff --git a/queue-6.15/block-use-q-elevator-with-elevator_lock-held-in-elv_.patch b/queue-6.15/block-use-q-elevator-with-elevator_lock-held-in-elv_.patch
new file mode 100644 (file)
index 0000000..ff7b430
--- /dev/null
@@ -0,0 +1,49 @@
+From e180cb3328b585872e7886cabded927fd7817cb9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 5 May 2025 22:17:42 +0800
+Subject: block: use q->elevator with ->elevator_lock held in
+ elv_iosched_show()
+
+From: Ming Lei <ming.lei@redhat.com>
+
+[ Upstream commit 94209d27d14104ed828ca88cd5403a99162fe51a ]
+
+Use q->elevator with ->elevator_lock held in elv_iosched_show(), since
+the local cached elevator reference may become stale after getting
+->elevator_lock.
+
+Reviewed-by: Hannes Reinecke <hare@suse.de>
+Reviewed-by: Nilay Shroff <nilay@linux.ibm.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Ming Lei <ming.lei@redhat.com>
+Link: https://lore.kernel.org/r/20250505141805.2751237-5-ming.lei@redhat.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ block/elevator.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+diff --git a/block/elevator.c b/block/elevator.c
+index b4d08026b02ce..dc4cadef728e5 100644
+--- a/block/elevator.c
++++ b/block/elevator.c
+@@ -744,7 +744,6 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
+ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
+ {
+       struct request_queue *q = disk->queue;
+-      struct elevator_queue *eq = q->elevator;
+       struct elevator_type *cur = NULL, *e;
+       int len = 0;
+@@ -753,7 +752,7 @@ ssize_t elv_iosched_show(struct gendisk *disk, char *name)
+               len += sprintf(name+len, "[none] ");
+       } else {
+               len += sprintf(name+len, "none ");
+-              cur = eq->type;
++              cur = q->elevator->type;
+       }
+       spin_lock(&elv_list_lock);
+-- 
+2.39.5
+
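The race being fixed follows a general rule: a pointer protected by a lock must be read after the lock is taken, not cached before. A minimal pthread analogue (illustrative names only, not the block-layer API):

    #include <pthread.h>
    #include <stdio.h>

    struct elevator { const char *name; };

    static pthread_mutex_t elevator_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct elevator *cur_elevator;   /* protected by elevator_lock */

    static void show(void)
    {
            /* WRONG: struct elevator *e = cur_elevator;  (may go stale
             * if the elevator is switched before we take the lock) */
            pthread_mutex_lock(&elevator_lock);
            struct elevator *e = cur_elevator;  /* read under the lock */
            printf("[%s]\n", e ? e->name : "none");
            pthread_mutex_unlock(&elevator_lock);
    }

    int main(void)
    {
            static struct elevator mq = { "mq-deadline" };
            pthread_mutex_lock(&elevator_lock);
            cur_elevator = &mq;                 /* switch under the lock */
            pthread_mutex_unlock(&elevator_lock);
            show();
            return 0;
    }
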
diff --git a/queue-6.15/btrfs-exit-after-state-insertion-failure-at-btrfs_co.patch b/queue-6.15/btrfs-exit-after-state-insertion-failure-at-btrfs_co.patch
new file mode 100644 (file)
index 0000000..4b3fac7
--- /dev/null
@@ -0,0 +1,43 @@
+From f6d3bebd2424dd94262bea320f0a1f1d76f9a88c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 10 Apr 2025 17:11:14 +0100
+Subject: btrfs: exit after state insertion failure at
+ btrfs_convert_extent_bit()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 3bf179e36da917c5d9bec71c714573ed1649b7c1 ]
+
+If insert_state() fails it returns an error pointer and we call
+extent_io_tree_panic() which will trigger a BUG() call. However if
+CONFIG_BUG is disabled, which is an uncommon and exotic scenario, then
+we fall through and call cache_state() which will dereference the error
+pointer, resulting in an invalid memory access.
+
+So jump to the 'out' label after calling extent_io_tree_panic(); besides
+handling the exotic scenario where CONFIG_BUG is disabled, this also
+makes the code clearer.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-io-tree.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
+index 13de6af279e52..92cfde37b1d33 100644
+--- a/fs/btrfs/extent-io-tree.c
++++ b/fs/btrfs/extent-io-tree.c
+@@ -1456,6 +1456,7 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+               if (IS_ERR(inserted_state)) {
+                       ret = PTR_ERR(inserted_state);
+                       extent_io_tree_panic(tree, prealloc, "insert", ret);
++                      goto out;
+               }
+               cache_state(inserted_state, cached_state);
+               if (inserted_state == prealloc)
+-- 
+2.39.5
+
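The pattern behind this one-line fix is self-contained: a kernel-style error pointer encodes an errno in the pointer value itself, so falling through and dereferencing it is an invalid access, and the bail-out must come right after the check. A userspace approximation of the ERR_PTR()/IS_ERR() convention (simplified, not the kernel headers):

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095

    static void *ERR_PTR(long err) { return (void *)err; }
    static long PTR_ERR(const void *p) { return (long)p; }
    static int IS_ERR(const void *p)
    {
            /* errnos live in the top 4095 values of the address space */
            return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
    }

    struct state { int bits; };

    static struct state *insert_state(int fail)
    {
            static struct state s;
            return fail ? ERR_PTR(-EEXIST) : &s;
    }

    int main(void)
    {
            struct state *st = insert_state(1);
            if (IS_ERR(st)) {
                    fprintf(stderr, "insert failed: %ld\n", PTR_ERR(st));
                    goto out;            /* the bail-out the patch adds */
            }
            st->bits = 1;                /* would fault on an ERR_PTR */
    out:
            return 0;
    }
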
diff --git a/queue-6.15/btrfs-exit-after-state-split-error-at-set_extent_bit.patch b/queue-6.15/btrfs-exit-after-state-split-error-at-set_extent_bit.patch
new file mode 100644 (file)
index 0000000..5bbf55d
--- /dev/null
@@ -0,0 +1,47 @@
+From 962d4b4b7999df4f2be1d7e244b7bb74a6a5263a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 16 Apr 2025 16:00:28 +0100
+Subject: btrfs: exit after state split error at set_extent_bit()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 41d69d4d78d8b179bf3bcdfc56d28a12b3a608d2 ]
+
+If split_state() returns an error we call extent_io_tree_panic() which
+will trigger a BUG() call. However if CONFIG_BUG is disabled, which is an
+uncommon and exotic scenario, then we fall through and hit a use-after-free
+when calling set_state_bits() since the extent state record which the
+local variable 'prealloc' points to was freed by split_state().
+
+So jump to the label 'out' after calling extent_io_tree_panic() and set
+the 'prealloc' pointer to NULL since split_state() has already freed it
+when it hit an error.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/extent-io-tree.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
+index 92cfde37b1d33..b5b44ea91f999 100644
+--- a/fs/btrfs/extent-io-tree.c
++++ b/fs/btrfs/extent-io-tree.c
+@@ -1252,8 +1252,11 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+               if (!prealloc)
+                       goto search_again;
+               ret = split_state(tree, state, prealloc, end + 1);
+-              if (ret)
++              if (ret) {
+                       extent_io_tree_panic(tree, state, "split", ret);
++                      prealloc = NULL;
++                      goto out;
++              }
+               set_state_bits(tree, prealloc, bits, changeset);
+               cache_state(prealloc, cached_state);
+-- 
+2.39.5
+
diff --git a/queue-6.15/btrfs-fix-fsync-of-files-with-no-hard-links-not-pers.patch b/queue-6.15/btrfs-fix-fsync-of-files-with-no-hard-links-not-pers.patch
new file mode 100644 (file)
index 0000000..a841b09
--- /dev/null
@@ -0,0 +1,148 @@
+From 319be88c8a49e4bbc8b28e98de43a6b1e267e999 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 20 Mar 2025 16:05:50 +0000
+Subject: btrfs: fix fsync of files with no hard links not persisting deletion
+
+From: Filipe Manana <fdmanana@suse.com>
+
+[ Upstream commit 5e85262e542d6da8898bb8563a724ad98f6fc936 ]
+
+If we fsync a file (or directory) that has no more hard links (because,
+while a process had a file descriptor open on it, the file's last hard
+link was removed and then the process did an fsync against the file
+descriptor), then after a power failure or crash the file still exists
+after the log is replayed.
+
+This behaviour is incorrect since once an inode has no more hard links
+it's not accessible anymore and we insert an orphan item into its
+subvolume's tree so that the deletion of all its items is not missed in
+case of a power failure or crash.
+
+So after log replay the file shouldn't exist anymore, which is also the
+behaviour on ext4, xfs, f2fs and other filesystems.
+
+Fix this by not ignoring inodes with zero hard links at
+btrfs_log_inode_parent() and by committing an inode's delayed inode when
+we are not doing a fast fsync (either BTRFS_INODE_COPY_EVERYTHING or
+BTRFS_INODE_NEEDS_FULL_SYNC is set in the inode's runtime flags). This
+last step is necessary because when removing the last hard link we don't
+delete the corresponding ref (or extref) item; instead we record the
+change in the inode's delayed inode with the BTRFS_DELAYED_NODE_DEL_IREF
+flag, so that when the delayed inode is committed we delete the ref/extref
+item from the inode's subvolume tree - otherwise the logging code will log
+the last hard link and therefore upon log replay the inode is not deleted.
+
+The base code for a fstests test case that reproduces this bug is the
+following:
+
+   . ./common/dmflakey
+
+   _require_scratch
+   _require_dm_target flakey
+   _require_mknod
+
+   _scratch_mkfs >>$seqres.full 2>&1 || _fail "mkfs failed"
+   _require_metadata_journaling $SCRATCH_DEV
+   _init_flakey
+   _mount_flakey
+
+   touch $SCRATCH_MNT/foo
+
+   # Commit the current transaction and persist the file.
+   _scratch_sync
+
+   # A fifo to communicate with a background xfs_io process that will
+   # fsync the file after we deleted its hard link while it's open by
+   # xfs_io.
+   mkfifo $SCRATCH_MNT/fifo
+
+   tail -f $SCRATCH_MNT/fifo | \
+        $XFS_IO_PROG $SCRATCH_MNT/foo >>$seqres.full &
+   XFS_IO_PID=$!
+
+   # Give some time for the xfs_io process to open a file descriptor for
+   # the file.
+   sleep 1
+
+   # Now while the file is open by the xfs_io process, delete its only
+   # hard link.
+   rm -f $SCRATCH_MNT/foo
+
+   # Now that it has no more hard links, make the xfs_io process fsync it.
+   echo "fsync" > $SCRATCH_MNT/fifo
+
+   # Terminate the xfs_io process so that we can unmount.
+   echo "quit" > $SCRATCH_MNT/fifo
+   wait $XFS_IO_PID
+   unset XFS_IO_PID
+
+   # Simulate a power failure and then mount again the filesystem to
+   # replay the journal/log.
+   _flakey_drop_and_remount
+
+   # We don't expect the file to exist anymore, since it was fsynced when
+   # it had no more hard links.
+   [ -f $SCRATCH_MNT/foo ] && echo "file foo still exists"
+
+   _unmount_flakey
+
+   # success, all done
+   echo "Silence is golden"
+   status=0
+   exit
+
+A test case for fstests will be submitted soon.
+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/tree-log.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
+index 90dc094cfa5e5..f5af11565b876 100644
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -6583,6 +6583,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
+               btrfs_log_get_delayed_items(inode, &delayed_ins_list,
+                                           &delayed_del_list);
++      /*
++       * If we are fsyncing a file with 0 hard links, then commit the delayed
++       * inode because the last inode ref (or extref) item may still be in the
++       * subvolume tree and if we log it the file will still exist after a log
++       * replay. So commit the delayed inode to delete that last ref and we
++       * skip logging it.
++       */
++      if (inode->vfs_inode.i_nlink == 0) {
++              ret = btrfs_commit_inode_delayed_inode(inode);
++              if (ret)
++                      goto out_unlock;
++      }
++
+       ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+                                     path, dst_path, logged_isize,
+                                     inode_only, ctx,
+@@ -7051,14 +7064,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+       if (btrfs_root_generation(&root->root_item) == trans->transid)
+               return BTRFS_LOG_FORCE_COMMIT;
+-      /*
+-       * Skip already logged inodes or inodes corresponding to tmpfiles
+-       * (since logging them is pointless, a link count of 0 means they
+-       * will never be accessible).
+-       */
+-      if ((btrfs_inode_in_log(inode, trans->transid) &&
+-           list_empty(&ctx->ordered_extents)) ||
+-          inode->vfs_inode.i_nlink == 0)
++      /* Skip already logged inodes and without new extents. */
++      if (btrfs_inode_in_log(inode, trans->transid) &&
++          list_empty(&ctx->ordered_extents))
+               return BTRFS_NO_LOG_SYNC;
+       ret = start_log_trans(trans, root, ctx);
+-- 
+2.39.5
+
diff --git a/queue-6.15/fs-filesystems-fix-potential-unsigned-integer-underf.patch b/queue-6.15/fs-filesystems-fix-potential-unsigned-integer-underf.patch
new file mode 100644 (file)
index 0000000..a46f480
--- /dev/null
@@ -0,0 +1,55 @@
+From fee4a2d557a6ff56b1a1935873e0680eec47f787 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 10 Apr 2025 19:45:27 +0800
+Subject: fs/filesystems: Fix potential unsigned integer underflow in fs_name()
+
+From: Zijun Hu <quic_zijuhu@quicinc.com>
+
+[ Upstream commit 1363c134ade81e425873b410566e957fecebb261 ]
+
+fs_name() has @index as unsigned int, so there is an underflow risk in
+the '@index--' operation.
+
+Fix this by breaking out of the for loop when '@index == 0', which is
+also more appropriate than '@index <= 0' for an unsigned integer
+comparison.
+
+Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
+Link: https://lore.kernel.org/20250410-fix_fs-v1-1-7c14ccc8ebaa@quicinc.com
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/filesystems.c | 14 +++++++++-----
+ 1 file changed, 9 insertions(+), 5 deletions(-)
+
+diff --git a/fs/filesystems.c b/fs/filesystems.c
+index 58b9067b2391c..95e5256821a53 100644
+--- a/fs/filesystems.c
++++ b/fs/filesystems.c
+@@ -156,15 +156,19 @@ static int fs_index(const char __user * __name)
+ static int fs_name(unsigned int index, char __user * buf)
+ {
+       struct file_system_type * tmp;
+-      int len, res;
++      int len, res = -EINVAL;
+       read_lock(&file_systems_lock);
+-      for (tmp = file_systems; tmp; tmp = tmp->next, index--)
+-              if (index <= 0 && try_module_get(tmp->owner))
++      for (tmp = file_systems; tmp; tmp = tmp->next, index--) {
++              if (index == 0) {
++                      if (try_module_get(tmp->owner))
++                              res = 0;
+                       break;
++              }
++      }
+       read_unlock(&file_systems_lock);
+-      if (!tmp)
+-              return -EINVAL;
++      if (res)
++              return res;
+       /* OK, we got the reference, so we can safely block */
+       len = strlen(tmp->name) + 1;
+-- 
+2.39.5
+
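The unsigned pitfall being removed is easy to demonstrate: for an unsigned int, 'index <= 0' is the same test as 'index == 0', and decrementing past zero wraps around instead of going negative. A short self-contained demo:

    #include <limits.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int index = 0;

            printf("index <= 0 : %d\n", index <= 0);  /* 1, but only at 0 */
            index--;                                  /* wraps, never < 0 */
            printf("after --   : %u (UINT_MAX=%u)\n", index, UINT_MAX);
            printf("index <= 0 : %d\n", index <= 0);  /* 0: check is dead */
            return 0;
    }
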
diff --git a/queue-6.15/gfs2-pass-through-holder-from-the-vfs-for-freeze-tha.patch b/queue-6.15/gfs2-pass-through-holder-from-the-vfs-for-freeze-tha.patch
new file mode 100644 (file)
index 0000000..e552c19
--- /dev/null
@@ -0,0 +1,97 @@
+From e0e37229330bdec04abc31325bfa2996ccfdc77d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 4 Apr 2025 21:02:28 +0200
+Subject: gfs2: pass through holder from the VFS for freeze/thaw
+
+From: Christian Brauner <brauner@kernel.org>
+
+[ Upstream commit 62a2175ddf7e72941868f164b7c1f92e00f213bd ]
+
+The filesystem's freeze/thaw functions can be called from contexts where
+the holder isn't userspace but the kernel, e.g., during systemd
+suspend/hibernate. So pass through the freeze/thaw flags from the VFS
+instead of hard-coding them.
+
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/gfs2/super.c | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
+index 54f1efd47a3e5..0bd7827e6371e 100644
+--- a/fs/gfs2/super.c
++++ b/fs/gfs2/super.c
+@@ -674,7 +674,7 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
+       return sdp->sd_log_error;
+ }
+-static int gfs2_do_thaw(struct gfs2_sbd *sdp)
++static int gfs2_do_thaw(struct gfs2_sbd *sdp, enum freeze_holder who)
+ {
+       struct super_block *sb = sdp->sd_vfs;
+       int error;
+@@ -682,7 +682,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp)
+       error = gfs2_freeze_lock_shared(sdp);
+       if (error)
+               goto fail;
+-      error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
++      error = thaw_super(sb, who);
+       if (!error)
+               return 0;
+@@ -710,7 +710,7 @@ void gfs2_freeze_func(struct work_struct *work)
+       gfs2_freeze_unlock(sdp);
+       set_bit(SDF_FROZEN, &sdp->sd_flags);
+-      error = gfs2_do_thaw(sdp);
++      error = gfs2_do_thaw(sdp, FREEZE_HOLDER_USERSPACE);
+       if (error)
+               goto out;
+@@ -728,6 +728,7 @@ void gfs2_freeze_func(struct work_struct *work)
+ /**
+  * gfs2_freeze_super - prevent further writes to the filesystem
+  * @sb: the VFS structure for the filesystem
++ * @who: freeze flags
+  *
+  */
+@@ -744,7 +745,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
+       }
+       for (;;) {
+-              error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
++              error = freeze_super(sb, who);
+               if (error) {
+                       fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
+                               error);
+@@ -758,7 +759,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
+                       break;
+               }
+-              error = gfs2_do_thaw(sdp);
++              error = gfs2_do_thaw(sdp, who);
+               if (error)
+                       goto out;
+@@ -796,6 +797,7 @@ static int gfs2_freeze_fs(struct super_block *sb)
+ /**
+  * gfs2_thaw_super - reallow writes to the filesystem
+  * @sb: the VFS structure for the filesystem
++ * @who: freeze flags
+  *
+  */
+@@ -814,7 +816,7 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
+       atomic_inc(&sb->s_active);
+       gfs2_freeze_unlock(sdp);
+-      error = gfs2_do_thaw(sdp);
++      error = gfs2_do_thaw(sdp, who);
+       if (!error) {
+               clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+-- 
+2.39.5
+
diff --git a/queue-6.15/io_uring-consistently-use-rcu-semantics-with-sqpoll-.patch b/queue-6.15/io_uring-consistently-use-rcu-semantics-with-sqpoll-.patch
new file mode 100644 (file)
index 0000000..a60c911
--- /dev/null
@@ -0,0 +1,183 @@
+From 4a341f091b2fddccccf86db2b6c6e165c030cdf9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 11 Jun 2025 13:53:43 -0700
+Subject: io_uring: consistently use rcu semantics with sqpoll thread
+
+From: Keith Busch <kbusch@kernel.org>
+
+[ Upstream commit c538f400fae22725580842deb2bef546701b64bd ]
+
+The sqpoll thread is dereferenced with rcu read protection in one place,
+so it needs to be annotated as an __rcu type, and we should consistently
+use rcu helpers for access and assignment to make sparse happy.
+
+Since most of the accesses occur under the sqd->lock, we can use
+rcu_dereference_protected() without declaring an rcu read section.
+Provide a simple helper to get the thread from a locked context.
+
+Fixes: ac0b8b327a5677d ("io_uring: fix use-after-free of sq->thread in __io_uring_show_fdinfo()")
+Signed-off-by: Keith Busch <kbusch@kernel.org>
+Link: https://lore.kernel.org/r/20250611205343.1821117-1-kbusch@meta.com
+[axboe: fold in fix for register.c]
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ io_uring/io_uring.c |  4 ++--
+ io_uring/register.c |  7 +++++--
+ io_uring/sqpoll.c   | 34 ++++++++++++++++++++++++----------
+ io_uring/sqpoll.h   |  8 +++++++-
+ 4 files changed, 38 insertions(+), 15 deletions(-)
+
+diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
+index 9266d4f2016ad..e5466f6568269 100644
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -2913,7 +2913,7 @@ static __cold void io_ring_exit_work(struct work_struct *work)
+                       struct task_struct *tsk;
+                       io_sq_thread_park(sqd);
+-                      tsk = sqd->thread;
++                      tsk = sqpoll_task_locked(sqd);
+                       if (tsk && tsk->io_uring && tsk->io_uring->io_wq)
+                               io_wq_cancel_cb(tsk->io_uring->io_wq,
+                                               io_cancel_ctx_cb, ctx, true);
+@@ -3150,7 +3150,7 @@ __cold void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd)
+       s64 inflight;
+       DEFINE_WAIT(wait);
+-      WARN_ON_ONCE(sqd && sqd->thread != current);
++      WARN_ON_ONCE(sqd && sqpoll_task_locked(sqd) != current);
+       if (!current->io_uring)
+               return;
+diff --git a/io_uring/register.c b/io_uring/register.c
+index cc23a4c205cd4..a59589249fce7 100644
+--- a/io_uring/register.c
++++ b/io_uring/register.c
+@@ -273,6 +273,8 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+       if (ctx->flags & IORING_SETUP_SQPOLL) {
+               sqd = ctx->sq_data;
+               if (sqd) {
++                      struct task_struct *tsk;
++
+                       /*
+                        * Observe the correct sqd->lock -> ctx->uring_lock
+                        * ordering. Fine to drop uring_lock here, we hold
+@@ -282,8 +284,9 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
+                       mutex_unlock(&ctx->uring_lock);
+                       mutex_lock(&sqd->lock);
+                       mutex_lock(&ctx->uring_lock);
+-                      if (sqd->thread)
+-                              tctx = sqd->thread->io_uring;
++                      tsk = sqpoll_task_locked(sqd);
++                      if (tsk)
++                              tctx = tsk->io_uring;
+               }
+       } else {
+               tctx = current->io_uring;
+diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
+index 0625a421626f4..268d2fbe6160c 100644
+--- a/io_uring/sqpoll.c
++++ b/io_uring/sqpoll.c
+@@ -30,7 +30,7 @@ enum {
+ void io_sq_thread_unpark(struct io_sq_data *sqd)
+       __releases(&sqd->lock)
+ {
+-      WARN_ON_ONCE(sqd->thread == current);
++      WARN_ON_ONCE(sqpoll_task_locked(sqd) == current);
+       /*
+        * Do the dance but not conditional clear_bit() because it'd race with
+@@ -46,24 +46,32 @@ void io_sq_thread_unpark(struct io_sq_data *sqd)
+ void io_sq_thread_park(struct io_sq_data *sqd)
+       __acquires(&sqd->lock)
+ {
+-      WARN_ON_ONCE(data_race(sqd->thread) == current);
++      struct task_struct *tsk;
+       atomic_inc(&sqd->park_pending);
+       set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state);
+       mutex_lock(&sqd->lock);
+-      if (sqd->thread)
+-              wake_up_process(sqd->thread);
++
++      tsk = sqpoll_task_locked(sqd);
++      if (tsk) {
++              WARN_ON_ONCE(tsk == current);
++              wake_up_process(tsk);
++      }
+ }
+ void io_sq_thread_stop(struct io_sq_data *sqd)
+ {
+-      WARN_ON_ONCE(sqd->thread == current);
++      struct task_struct *tsk;
++
+       WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state));
+       set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state);
+       mutex_lock(&sqd->lock);
+-      if (sqd->thread)
+-              wake_up_process(sqd->thread);
++      tsk = sqpoll_task_locked(sqd);
++      if (tsk) {
++              WARN_ON_ONCE(tsk == current);
++              wake_up_process(tsk);
++      }
+       mutex_unlock(&sqd->lock);
+       wait_for_completion(&sqd->exited);
+ }
+@@ -486,7 +494,10 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+                       goto err_sqpoll;
+               }
+-              sqd->thread = tsk;
++              mutex_lock(&sqd->lock);
++              rcu_assign_pointer(sqd->thread, tsk);
++              mutex_unlock(&sqd->lock);
++
+               task_to_put = get_task_struct(tsk);
+               ret = io_uring_alloc_task_context(tsk, ctx);
+               wake_up_new_task(tsk);
+@@ -514,10 +525,13 @@ __cold int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx,
+       int ret = -EINVAL;
+       if (sqd) {
++              struct task_struct *tsk;
++
+               io_sq_thread_park(sqd);
+               /* Don't set affinity for a dying thread */
+-              if (sqd->thread)
+-                      ret = io_wq_cpu_affinity(sqd->thread->io_uring, mask);
++              tsk = sqpoll_task_locked(sqd);
++              if (tsk)
++                      ret = io_wq_cpu_affinity(tsk->io_uring, mask);
+               io_sq_thread_unpark(sqd);
+       }
+diff --git a/io_uring/sqpoll.h b/io_uring/sqpoll.h
+index 4171666b1cf4c..b83dcdec9765f 100644
+--- a/io_uring/sqpoll.h
++++ b/io_uring/sqpoll.h
+@@ -8,7 +8,7 @@ struct io_sq_data {
+       /* ctx's that are using this sqd */
+       struct list_head        ctx_list;
+-      struct task_struct      *thread;
++      struct task_struct __rcu *thread;
+       struct wait_queue_head  wait;
+       unsigned                sq_thread_idle;
+@@ -29,3 +29,9 @@ void io_sq_thread_unpark(struct io_sq_data *sqd);
+ void io_put_sq_data(struct io_sq_data *sqd);
+ void io_sqpoll_wait_sq(struct io_ring_ctx *ctx);
+ int io_sqpoll_wq_cpu_affinity(struct io_ring_ctx *ctx, cpumask_var_t mask);
++
++static inline struct task_struct *sqpoll_task_locked(struct io_sq_data *sqd)
++{
++      return rcu_dereference_protected(sqd->thread,
++                                       lockdep_is_held(&sqd->lock));
++}
+-- 
+2.39.5
+
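The sqpoll_task_locked() helper added here encodes a locking contract in one place: the pointer may be read without an RCU read section only while sqd->lock is held. A userspace sketch of the same accessor pattern, with a mutex and an owner field standing in for ->lock plus lockdep_is_held() (all names hypothetical; the owner tracking is a deliberate simplification):

    #include <assert.h>
    #include <pthread.h>
    #include <stdio.h>

    struct task { const char *comm; };

    struct sq_data {
            pthread_mutex_t lock;
            pthread_t owner;         /* debug aid: who holds 'lock' */
            struct task *thread;     /* __rcu-annotated in the kernel */
    };

    static void sq_lock(struct sq_data *sqd)
    {
            pthread_mutex_lock(&sqd->lock);
            sqd->owner = pthread_self();
    }

    static void sq_unlock(struct sq_data *sqd)
    {
            sqd->owner = 0;
            pthread_mutex_unlock(&sqd->lock);
    }

    /* Analogue of sqpoll_task_locked(): read ->thread only with the
     * lock held; the assert stands in for rcu_dereference_protected()'s
     * lockdep_is_held() check. */
    static struct task *sq_task_locked(struct sq_data *sqd)
    {
            assert(pthread_equal(sqd->owner, pthread_self()));
            return sqd->thread;
    }

    int main(void)
    {
            struct task t = { "iou-sqp" };
            struct sq_data sqd = { PTHREAD_MUTEX_INITIALIZER, 0, NULL };

            sq_lock(&sqd);
            sqd.thread = &t;                 /* rcu_assign_pointer() */
            printf("%s\n", sq_task_locked(&sqd)->comm);
            sq_unlock(&sqd);
            return 0;
    }
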
diff --git a/queue-6.15/io_uring-fix-spurious-drain-flushing.patch b/queue-6.15/io_uring-fix-spurious-drain-flushing.patch
new file mode 100644 (file)
index 0000000..9b2a68a
--- /dev/null
@@ -0,0 +1,67 @@
+From 1833532341f4a4ebb43130aad4385336074463dd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 9 May 2025 12:12:48 +0100
+Subject: io_uring: fix spurious drain flushing
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+[ Upstream commit fde04c7e2775feb0746301e0ef86a04d3598c3fe ]
+
+io_queue_deferred() is not tolerant of spurious calls, i.e. calls made
+before the requests it is waiting on complete. You can have an inflight
+drain-marked request and another
+request that came after and got queued into the drain list. Now, if
+io_queue_deferred() is called before the first request completes, it'll
+check the 2nd req with req_need_defer(), find that there is no drain
+flag set, and queue it for execution.
+
+To make io_queue_deferred() work, it should at least check sequences for
+the first request, and then we also need to check whether there is
+another drain request creating another bubble.
+
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/972bde11b7d4ef25b3f5e3fd34f80e4d2aa345b8.1746788718.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ io_uring/io_uring.c | 14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
+index edda31a15c6e6..9266d4f2016ad 100644
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -537,18 +537,30 @@ void io_req_queue_iowq(struct io_kiocb *req)
+       io_req_task_work_add(req);
+ }
++static bool io_drain_defer_seq(struct io_kiocb *req, u32 seq)
++{
++      struct io_ring_ctx *ctx = req->ctx;
++
++      return seq + READ_ONCE(ctx->cq_extra) != ctx->cached_cq_tail;
++}
++
+ static __cold noinline void io_queue_deferred(struct io_ring_ctx *ctx)
+ {
++      bool drain_seen = false, first = true;
++
+       spin_lock(&ctx->completion_lock);
+       while (!list_empty(&ctx->defer_list)) {
+               struct io_defer_entry *de = list_first_entry(&ctx->defer_list,
+                                               struct io_defer_entry, list);
+-              if (req_need_defer(de->req, de->seq))
++              drain_seen |= de->req->flags & REQ_F_IO_DRAIN;
++              if ((drain_seen || first) && io_drain_defer_seq(de->req, de->seq))
+                       break;
++
+               list_del_init(&de->list);
+               io_req_task_queue(de->req);
+               kfree(de);
++              first = false;
+       }
+       spin_unlock(&ctx->completion_lock);
+ }
+-- 
+2.39.5
+
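The fixed walk can be exercised standalone: flush deferred entries in order, stop at the first one whose sequence has not yet been reached, and keep honouring any later drain-marked entry. A simplified model (the sequence check is reduced to a plain completion counter, unlike the kernel's cq_extra arithmetic):

    #include <stdbool.h>
    #include <stdio.h>

    struct defer_entry {
            bool drain;          /* REQ_F_IO_DRAIN analogue */
            unsigned seq;        /* completions required before it runs */
    };

    static unsigned completed;   /* ctx->cached_cq_tail analogue */

    static bool seq_pending(const struct defer_entry *e)
    {
            return e->seq > completed;   /* drain bubble still intact */
    }

    static void queue_deferred(struct defer_entry *list, int n)
    {
            bool drain_seen = false, first = true;

            for (int i = 0; i < n; i++) {
                    struct defer_entry *de = &list[i];

                    drain_seen |= de->drain;
                    if ((drain_seen || first) && seq_pending(de))
                            break;       /* no spurious flush past here */
                    printf("run entry %d\n", i);
                    first = false;
            }
    }

    int main(void)
    {
            struct defer_entry list[] = {
                    { true, 2 },         /* drain: needs 2 completions */
                    { false, 0 },        /* queued behind the drain    */
            };

            completed = 1;
            queue_deferred(list, 2);     /* prints nothing: bubble holds */
            completed = 2;
            queue_deferred(list, 2);     /* now both entries run */
            return 0;
    }
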
diff --git a/queue-6.15/io_uring-fix-use-after-free-of-sq-thread-in-__io_uri.patch b/queue-6.15/io_uring-fix-use-after-free-of-sq-thread-in-__io_uri.patch
new file mode 100644 (file)
index 0000000..3cff88f
--- /dev/null
@@ -0,0 +1,208 @@
+From f103a13980783653fab3712b7268cbb729ea74ae Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 10 Jun 2025 10:18:01 -0700
+Subject: io_uring: fix use-after-free of sq->thread in
+ __io_uring_show_fdinfo()
+
+From: Penglei Jiang <superman.xpt@gmail.com>
+
+[ Upstream commit ac0b8b327a5677dc6fecdf353d808161525b1ff0 ]
+
+syzbot reports:
+
+BUG: KASAN: slab-use-after-free in getrusage+0x1109/0x1a60
+Read of size 8 at addr ffff88810de2d2c8 by task a.out/304
+
+CPU: 0 UID: 0 PID: 304 Comm: a.out Not tainted 6.16.0-rc1 #1 PREEMPT(voluntary)
+Hardware name: QEMU Ubuntu 24.04 PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x53/0x70
+ print_report+0xd0/0x670
+ ? __pfx__raw_spin_lock_irqsave+0x10/0x10
+ ? getrusage+0x1109/0x1a60
+ kasan_report+0xce/0x100
+ ? getrusage+0x1109/0x1a60
+ getrusage+0x1109/0x1a60
+ ? __pfx_getrusage+0x10/0x10
+ __io_uring_show_fdinfo+0x9fe/0x1790
+ ? ksys_read+0xf7/0x1c0
+ ? do_syscall_64+0xa4/0x260
+ ? vsnprintf+0x591/0x1100
+ ? __pfx___io_uring_show_fdinfo+0x10/0x10
+ ? __pfx_vsnprintf+0x10/0x10
+ ? mutex_trylock+0xcf/0x130
+ ? __pfx_mutex_trylock+0x10/0x10
+ ? __pfx_show_fd_locks+0x10/0x10
+ ? io_uring_show_fdinfo+0x57/0x80
+ io_uring_show_fdinfo+0x57/0x80
+ seq_show+0x38c/0x690
+ seq_read_iter+0x3f7/0x1180
+ ? inode_set_ctime_current+0x160/0x4b0
+ seq_read+0x271/0x3e0
+ ? __pfx_seq_read+0x10/0x10
+ ? __pfx__raw_spin_lock+0x10/0x10
+ ? __mark_inode_dirty+0x402/0x810
+ ? selinux_file_permission+0x368/0x500
+ ? file_update_time+0x10f/0x160
+ vfs_read+0x177/0xa40
+ ? __pfx___handle_mm_fault+0x10/0x10
+ ? __pfx_vfs_read+0x10/0x10
+ ? mutex_lock+0x81/0xe0
+ ? __pfx_mutex_lock+0x10/0x10
+ ? fdget_pos+0x24d/0x4b0
+ ksys_read+0xf7/0x1c0
+ ? __pfx_ksys_read+0x10/0x10
+ ? do_user_addr_fault+0x43b/0x9c0
+ do_syscall_64+0xa4/0x260
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7f0f74170fc9
+Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 8
+RSP: 002b:00007fffece049e8 EFLAGS: 00000206 ORIG_RAX: 0000000000000000
+RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f0f74170fc9
+RDX: 0000000000001000 RSI: 00007fffece049f0 RDI: 0000000000000004
+RBP: 00007fffece05ad0 R08: 0000000000000000 R09: 00007fffece04d90
+R10: 0000000000000000 R11: 0000000000000206 R12: 00005651720a1100
+R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+ </TASK>
+
+Allocated by task 298:
+ kasan_save_stack+0x33/0x60
+ kasan_save_track+0x14/0x30
+ __kasan_slab_alloc+0x6e/0x70
+ kmem_cache_alloc_node_noprof+0xe8/0x330
+ copy_process+0x376/0x5e00
+ create_io_thread+0xab/0xf0
+ io_sq_offload_create+0x9ed/0xf20
+ io_uring_setup+0x12b0/0x1cc0
+ do_syscall_64+0xa4/0x260
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+Freed by task 22:
+ kasan_save_stack+0x33/0x60
+ kasan_save_track+0x14/0x30
+ kasan_save_free_info+0x3b/0x60
+ __kasan_slab_free+0x37/0x50
+ kmem_cache_free+0xc4/0x360
+ rcu_core+0x5ff/0x19f0
+ handle_softirqs+0x18c/0x530
+ run_ksoftirqd+0x20/0x30
+ smpboot_thread_fn+0x287/0x6c0
+ kthread+0x30d/0x630
+ ret_from_fork+0xef/0x1a0
+ ret_from_fork_asm+0x1a/0x30
+
+Last potentially related work creation:
+ kasan_save_stack+0x33/0x60
+ kasan_record_aux_stack+0x8c/0xa0
+ __call_rcu_common.constprop.0+0x68/0x940
+ __schedule+0xff2/0x2930
+ __cond_resched+0x4c/0x80
+ mutex_lock+0x5c/0xe0
+ io_uring_del_tctx_node+0xe1/0x2b0
+ io_uring_clean_tctx+0xb7/0x160
+ io_uring_cancel_generic+0x34e/0x760
+ do_exit+0x240/0x2350
+ do_group_exit+0xab/0x220
+ __x64_sys_exit_group+0x39/0x40
+ x64_sys_call+0x1243/0x1840
+ do_syscall_64+0xa4/0x260
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+The buggy address belongs to the object at ffff88810de2cb00
+ which belongs to the cache task_struct of size 3712
+The buggy address is located 1992 bytes inside of
+ freed 3712-byte region [ffff88810de2cb00, ffff88810de2d980)
+
+which is caused by the task_struct pointed to by sq->thread being
+released while it is being used in the function
+__io_uring_show_fdinfo(). Holding ctx->uring_lock does not prevent the
+release or exit of sq->thread.
+
+Fix this by assigning and looking up ->thread under RCU, and grabbing a
+reference to the task_struct. This ensures that it cannot get released
+while fdinfo is using it.
+
+Reported-by: syzbot+531502bbbe51d2f769f4@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/all/682b06a5.a70a0220.3849cf.00b3.GAE@google.com
+Fixes: 3fcb9d17206e ("io_uring/sqpoll: statistics of the true utilization of sq threads")
+Signed-off-by: Penglei Jiang <superman.xpt@gmail.com>
+Link: https://lore.kernel.org/r/20250610171801.70960-1-superman.xpt@gmail.com
+[axboe: massage commit message]
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ io_uring/fdinfo.c | 12 ++++++++++--
+ io_uring/sqpoll.c |  9 ++++-----
+ 2 files changed, 14 insertions(+), 7 deletions(-)
+
+diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
+index e0d6a59a89fa1..f948917f7f707 100644
+--- a/io_uring/fdinfo.c
++++ b/io_uring/fdinfo.c
+@@ -172,18 +172,26 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
+       if (ctx->flags & IORING_SETUP_SQPOLL) {
+               struct io_sq_data *sq = ctx->sq_data;
++              struct task_struct *tsk;
++              rcu_read_lock();
++              tsk = rcu_dereference(sq->thread);
+               /*
+                * sq->thread might be NULL if we raced with the sqpoll
+                * thread termination.
+                */
+-              if (sq->thread) {
++              if (tsk) {
++                      get_task_struct(tsk);
++                      rcu_read_unlock();
++                      getrusage(tsk, RUSAGE_SELF, &sq_usage);
++                      put_task_struct(tsk);
+                       sq_pid = sq->task_pid;
+                       sq_cpu = sq->sq_cpu;
+-                      getrusage(sq->thread, RUSAGE_SELF, &sq_usage);
+                       sq_total_time = (sq_usage.ru_stime.tv_sec * 1000000
+                                        + sq_usage.ru_stime.tv_usec);
+                       sq_work_time = sq->work_time;
++              } else {
++                      rcu_read_unlock();
+               }
+       }
+diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
+index 03c699493b5ab..0625a421626f4 100644
+--- a/io_uring/sqpoll.c
++++ b/io_uring/sqpoll.c
+@@ -270,7 +270,8 @@ static int io_sq_thread(void *data)
+       /* offload context creation failed, just exit */
+       if (!current->io_uring) {
+               mutex_lock(&sqd->lock);
+-              sqd->thread = NULL;
++              rcu_assign_pointer(sqd->thread, NULL);
++              put_task_struct(current);
+               mutex_unlock(&sqd->lock);
+               goto err_out;
+       }
+@@ -379,7 +380,8 @@ static int io_sq_thread(void *data)
+               io_sq_tw(&retry_list, UINT_MAX);
+       io_uring_cancel_generic(true, sqd);
+-      sqd->thread = NULL;
++      rcu_assign_pointer(sqd->thread, NULL);
++      put_task_struct(current);
+       list_for_each_entry(ctx, &sqd->ctx_list, sqd_list)
+               atomic_or(IORING_SQ_NEED_WAKEUP, &ctx->rings->sq_flags);
+       io_run_task_work();
+@@ -495,9 +497,6 @@ __cold int io_sq_offload_create(struct io_ring_ctx *ctx,
+               ret = -EINVAL;
+               goto err;
+       }
+-
+-      if (task_to_put)
+-              put_task_struct(task_to_put);
+       return 0;
+ err_sqpoll:
+       complete(&ctx->sq_data->exited);
+-- 
+2.39.5
+
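The shape of this fix is a common one: look the object up under a cheap read-side lock, pin it with a reference, drop the lock, then do the slow work. A userspace analogue with a mutex standing in for rcu_read_lock() and an atomic refcount for get/put_task_struct() (hypothetical stand-ins, not kernel primitives):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct task {
            atomic_int usage;
            const char *comm;
    };

    static pthread_mutex_t lookup_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct task *sq_thread;        /* may become NULL at any time */

    static void put_task(struct task *t)
    {
            if (atomic_fetch_sub(&t->usage, 1) == 1)
                    free(t);
    }

    static void show_fdinfo(void)
    {
            pthread_mutex_lock(&lookup_lock);       /* rcu_read_lock()   */
            struct task *t = sq_thread;
            if (t)
                    atomic_fetch_add(&t->usage, 1); /* get_task_struct() */
            pthread_mutex_unlock(&lookup_lock);     /* rcu_read_unlock() */

            if (!t)
                    return;
            /* Slow, possibly blocking work (getrusage() in the kernel)
             * now runs on a pinned object that cannot be freed under us. */
            printf("sq thread: %s\n", t->comm);
            put_task(t);                            /* put_task_struct() */
    }

    int main(void)
    {
            struct task *t = malloc(sizeof(*t));
            atomic_init(&t->usage, 1);
            t->comm = "iou-sqp-304";
            sq_thread = t;
            show_fdinfo();
            sq_thread = NULL;
            put_task(t);                  /* drop the last reference */
            return 0;
    }
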
diff --git a/queue-6.15/nvmet-fcloop-access-fcpreq-only-when-holding-reqlock.patch b/queue-6.15/nvmet-fcloop-access-fcpreq-only-when-holding-reqlock.patch
new file mode 100644 (file)
index 0000000..02677ee
--- /dev/null
@@ -0,0 +1,95 @@
+From e4ec2575f45f061c8965290edc07e0c3a6379666 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 7 May 2025 14:23:03 +0200
+Subject: nvmet-fcloop: access fcpreq only when holding reqlock
+
+From: Daniel Wagner <wagi@kernel.org>
+
+[ Upstream commit 47a827cd7929d0550c3496d70b417fcb5649b27b ]
+
+The abort handling logic expects that the state and the fcpreq are only
+accessed when holding the reqlock lock.
+
+While at it, only handle the aborts in the abort handler.
+
+Signed-off-by: Daniel Wagner <wagi@kernel.org>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/nvme/target/fcloop.c | 31 ++++++++++++++++---------------
+ 1 file changed, 16 insertions(+), 15 deletions(-)
+
+diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
+index 641201e62c1ba..20becea1ad968 100644
+--- a/drivers/nvme/target/fcloop.c
++++ b/drivers/nvme/target/fcloop.c
+@@ -618,12 +618,13 @@ fcloop_fcp_recv_work(struct work_struct *work)
+ {
+       struct fcloop_fcpreq *tfcp_req =
+               container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+-      struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
++      struct nvmefc_fcp_req *fcpreq;
+       unsigned long flags;
+       int ret = 0;
+       bool aborted = false;
+       spin_lock_irqsave(&tfcp_req->reqlock, flags);
++      fcpreq = tfcp_req->fcpreq;
+       switch (tfcp_req->inistate) {
+       case INI_IO_START:
+               tfcp_req->inistate = INI_IO_ACTIVE;
+@@ -638,16 +639,19 @@ fcloop_fcp_recv_work(struct work_struct *work)
+       }
+       spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
+-      if (unlikely(aborted))
+-              ret = -ECANCELED;
+-      else {
+-              if (likely(!check_for_drop(tfcp_req)))
+-                      ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+-                              &tfcp_req->tgt_fcp_req,
+-                              fcpreq->cmdaddr, fcpreq->cmdlen);
+-              else
+-                      pr_info("%s: dropped command ********\n", __func__);
++      if (unlikely(aborted)) {
++              /* the abort handler will call fcloop_call_host_done */
++              return;
++      }
++
++      if (unlikely(check_for_drop(tfcp_req))) {
++              pr_info("%s: dropped command ********\n", __func__);
++              return;
+       }
++
++      ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
++                                 &tfcp_req->tgt_fcp_req,
++                                 fcpreq->cmdaddr, fcpreq->cmdlen);
+       if (ret)
+               fcloop_call_host_done(fcpreq, tfcp_req, ret);
+ }
+@@ -662,9 +666,10 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
+       unsigned long flags;
+       spin_lock_irqsave(&tfcp_req->reqlock, flags);
+-      fcpreq = tfcp_req->fcpreq;
+       switch (tfcp_req->inistate) {
+       case INI_IO_ABORTED:
++              fcpreq = tfcp_req->fcpreq;
++              tfcp_req->fcpreq = NULL;
+               break;
+       case INI_IO_COMPLETED:
+               completed = true;
+@@ -686,10 +691,6 @@ fcloop_fcp_abort_recv_work(struct work_struct *work)
+               nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
+                                       &tfcp_req->tgt_fcp_req);
+-      spin_lock_irqsave(&tfcp_req->reqlock, flags);
+-      tfcp_req->fcpreq = NULL;
+-      spin_unlock_irqrestore(&tfcp_req->reqlock, flags);
+-
+       fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
+       /* call_host_done releases reference for abort downcall */
+ }
+-- 
+2.39.5
+
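The abort-side change is an ownership transfer: the shared fcpreq pointer is snapshotted and cleared in a single locked step, so exactly one path completes the request and any other reader finds NULL. A compact sketch of that idiom (illustrative names only):

    #include <pthread.h>
    #include <stddef.h>
    #include <stdio.h>

    struct req { int id; };

    static pthread_mutex_t reqlock = PTHREAD_MUTEX_INITIALIZER;
    static struct req *fcpreq;            /* protected by reqlock */

    /* Claim the request: snapshot and clear under the lock. */
    static struct req *claim_req(void)
    {
            pthread_mutex_lock(&reqlock);
            struct req *r = fcpreq;
            fcpreq = NULL;
            pthread_mutex_unlock(&reqlock);
            return r;
    }

    int main(void)
    {
            static struct req r = { 42 };
            fcpreq = &r;

            struct req *a = claim_req();  /* abort path wins the lock */
            struct req *b = claim_req();  /* other path finds NULL    */
            printf("abort got %d, other got %s\n",
                   a ? a->id : -1, b ? "req" : "NULL");
            return 0;
    }
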
diff --git a/queue-6.15/perf-ensure-bpf_perf_link-path-is-properly-serialize.patch b/queue-6.15/perf-ensure-bpf_perf_link-path-is-properly-serialize.patch
new file mode 100644 (file)
index 0000000..f0b6687
--- /dev/null
@@ -0,0 +1,98 @@
+From 99ba849522c7c0c22648a8c04e71e79e3a5611af Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 17 Jan 2025 10:54:50 +0100
+Subject: perf: Ensure bpf_perf_link path is properly serialized
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 7ed9138a72829d2035ecbd8dbd35b1bc3c137c40 ]
+
+Ravi reported that the bpf_perf_link_attach() usage of
+perf_event_set_bpf_prog() is not serialized by ctx->mutex, unlike the
+PERF_EVENT_IOC_SET_BPF case.
+
+Reported-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Reviewed-by: Ravi Bangoria <ravi.bangoria@amd.com>
+Link: https://lkml.kernel.org/r/20250307193305.486326750@infradead.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/events/core.c | 34 ++++++++++++++++++++++++++++++----
+ 1 file changed, 30 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/events/core.c b/kernel/events/core.c
+index 881d768e45564..e97bc9220fd1a 100644
+--- a/kernel/events/core.c
++++ b/kernel/events/core.c
+@@ -6239,6 +6239,9 @@ static int perf_event_set_output(struct perf_event *event,
+ static int perf_event_set_filter(struct perf_event *event, void __user *arg);
+ static int perf_copy_attr(struct perf_event_attr __user *uattr,
+                         struct perf_event_attr *attr);
++static int __perf_event_set_bpf_prog(struct perf_event *event,
++                                   struct bpf_prog *prog,
++                                   u64 bpf_cookie);
+ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
+ {
+@@ -6301,7 +6304,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon
+               if (IS_ERR(prog))
+                       return PTR_ERR(prog);
+-              err = perf_event_set_bpf_prog(event, prog, 0);
++              err = __perf_event_set_bpf_prog(event, prog, 0);
+               if (err) {
+                       bpf_prog_put(prog);
+                       return err;
+@@ -11069,8 +11072,9 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
+       return false;
+ }
+-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+-                          u64 bpf_cookie)
++static int __perf_event_set_bpf_prog(struct perf_event *event,
++                                   struct bpf_prog *prog,
++                                   u64 bpf_cookie)
+ {
+       bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
+@@ -11108,6 +11112,20 @@ int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
+       return perf_event_attach_bpf_prog(event, prog, bpf_cookie);
+ }
++int perf_event_set_bpf_prog(struct perf_event *event,
++                          struct bpf_prog *prog,
++                          u64 bpf_cookie)
++{
++      struct perf_event_context *ctx;
++      int ret;
++
++      ctx = perf_event_ctx_lock(event);
++      ret = __perf_event_set_bpf_prog(event, prog, bpf_cookie);
++      perf_event_ctx_unlock(event, ctx);
++
++      return ret;
++}
++
+ void perf_event_free_bpf_prog(struct perf_event *event)
+ {
+       if (!event->prog)
+@@ -11130,7 +11148,15 @@ static void perf_event_free_filter(struct perf_event *event)
+ {
+ }
+-int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
++static int __perf_event_set_bpf_prog(struct perf_event *event,
++                                   struct bpf_prog *prog,
++                                   u64 bpf_cookie)
++{
++      return -ENOENT;
++}
++
++int perf_event_set_bpf_prog(struct perf_event *event,
++                          struct bpf_prog *prog,
+                           u64 bpf_cookie)
+ {
+       return -ENOENT;
+-- 
+2.39.5
+
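The serialization fix has a classic shape: a double-underscore helper that assumes the context lock is held, plus a public wrapper that takes it, so the ioctl path (already locked) and the bpf link path both end up serialized. A minimal userspace rendering with a mutex in place of perf_event_ctx_lock()/unlock() (names are stand-ins):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t ctx_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* Caller must hold ctx_mutex (like __perf_event_set_bpf_prog()). */
    static int __set_bpf_prog(int prog_fd)
    {
            printf("attach prog %d under ctx lock\n", prog_fd);
            return 0;
    }

    /* Public entry point: serializes callers that don't already hold
     * the lock (like the bpf_perf_link attach path). */
    static int set_bpf_prog(int prog_fd)
    {
            pthread_mutex_lock(&ctx_mutex);
            int ret = __set_bpf_prog(prog_fd);
            pthread_mutex_unlock(&ctx_mutex);
            return ret;
    }

    int main(void)
    {
            /* ioctl-style path: lock already held, use the __ helper. */
            pthread_mutex_lock(&ctx_mutex);
            __set_bpf_prog(3);
            pthread_mutex_unlock(&ctx_mutex);

            /* link-style path: go through the wrapper. */
            return set_bpf_prog(4);
    }
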
diff --git a/queue-6.15/series b/queue-6.15/series
index be9206b5e586efd305ba4ae79f44a098a884c22e..d39ec192676395b27145b6c44e560b96ee3b1066 100644 (file)
@@ -722,3 +722,18 @@ net_sched-ets-fix-a-race-in-ets_qdisc_change.patch
 net-drv-netdevsim-don-t-napi_complete-from-netpoll.patch
 net-ethtool-don-t-check-if-rss-context-exists-in-cas.patch
 drm-xe-lrc-use-a-temporary-buffer-for-wa-bb.patch
+btrfs-exit-after-state-insertion-failure-at-btrfs_co.patch
+fs-filesystems-fix-potential-unsigned-integer-underf.patch
+btrfs-fix-fsync-of-files-with-no-hard-links-not-pers.patch
+gfs2-pass-through-holder-from-the-vfs-for-freeze-tha.patch
+btrfs-exit-after-state-split-error-at-set_extent_bit.patch
+nvmet-fcloop-access-fcpreq-only-when-holding-reqlock.patch
+io_uring-fix-spurious-drain-flushing.patch
+perf-ensure-bpf_perf_link-path-is-properly-serialize.patch
+block-use-q-elevator-with-elevator_lock-held-in-elv_.patch
+io_uring-fix-use-after-free-of-sq-thread-in-__io_uri.patch
+block-don-t-use-submit_bio_noacct_nocheck-in-blk_zon.patch
+io_uring-consistently-use-rcu-semantics-with-sqpoll-.patch
+smb-client-fix-perf-regression-with-deferred-closes.patch
+bio-fix-bio_first_folio-for-sparsemem-without-vmemma.patch
+block-fix-bvec_set_folio-for-very-large-folios.patch
diff --git a/queue-6.15/smb-client-fix-perf-regression-with-deferred-closes.patch b/queue-6.15/smb-client-fix-perf-regression-with-deferred-closes.patch
new file mode 100644 (file)
index 0000000..99f93a4
--- /dev/null
@@ -0,0 +1,123 @@
+From 6902e05c4892fe7c859a417ed5023bc17a7f7b4f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 12 Jun 2025 12:45:04 -0300
+Subject: smb: client: fix perf regression with deferred closes
+
+From: Paulo Alcantara <pc@manguebit.org>
+
+[ Upstream commit b64af6bcd3b0f3fc633d6a70adb0991737abfef4 ]
+
+A customer reported that one of their applications started failing to
+open files with STATUS_INSUFFICIENT_RESOURCES because the NetApp server
+hit the maximum number of opens to the same file that it allows for a
+single client connection.
+
+It turned out the client was failing to reuse open handles with
+deferred closes because matching ->f_flags directly, without masking
+off the O_CREAT|O_EXCL|O_TRUNC bits first, broke the comparison, and
+the client then ended up with thousands of deferred closes to the same
+file.  Those bits are already satisfied on the original open, so there
+is no need to check them against existing open handles.
+
+Reproducer:
+
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <unistd.h>
+ #include <fcntl.h>
+ #include <pthread.h>
+
+ #define NR_THREADS      4
+ #define NR_ITERATIONS   2500
+ #define TEST_FILE       "/mnt/1/test/dir/foo"
+
+ static char buf[64];
+
+ static void *worker(void *arg)
+ {
+         int i, j;
+         int fd;
+
+         for (i = 0; i < NR_ITERATIONS; i++) {
+                 fd = open(TEST_FILE, O_WRONLY|O_CREAT|O_APPEND, 0666);
+                 for (j = 0; j < 16; j++)
+                         write(fd, buf, sizeof(buf));
+                 close(fd);
+         }
+ }
+
+ int main(int argc, char *argv[])
+ {
+         pthread_t t[NR_THREADS];
+         int fd;
+         int i;
+
+         fd = open(TEST_FILE, O_WRONLY|O_CREAT|O_TRUNC, 0666);
+         close(fd);
+         memset(buf, 'a', sizeof(buf));
+         for (i = 0; i < NR_THREADS; i++)
+                 pthread_create(&t[i], NULL, worker, NULL);
+         for (i = 0; i < NR_THREADS; i++)
+                 pthread_join(t[i], NULL);
+         return 0;
+ }
+
+Before patch:
+
+$ mount.cifs //srv/share /mnt/1 -o ...
+$ mkdir -p /mnt/1/test/dir
+$ gcc repro.c && ./a.out
+...
+number of opens: 1391
+
+After patch:
+
+$ mount.cifs //srv/share /mnt/1 -o ...
+$ mkdir -p /mnt/1/test/dir
+$ gcc repro.c && ./a.out
+...
+number of opens: 1
+
+Cc: linux-cifs@vger.kernel.org
+Cc: David Howells <dhowells@redhat.com>
+Cc: Jay Shin <jaeshin@redhat.com>
+Cc: Pierguido Lambri <plambri@redhat.com>
+Fixes: b8ea3b1ff544 ("smb: enable reuse of deferred file handles for write operations")
+Acked-by: Shyam Prasad N <sprasad@microsoft.com>
+Signed-off-by: Paulo Alcantara (Red Hat) <pc@manguebit.org>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/smb/client/file.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
+index d2df10b8e6fd8..9835672267d27 100644
+--- a/fs/smb/client/file.c
++++ b/fs/smb/client/file.c
+@@ -999,15 +999,18 @@ int cifs_open(struct inode *inode, struct file *file)
+               rc = cifs_get_readable_path(tcon, full_path, &cfile);
+       }
+       if (rc == 0) {
+-              if (file->f_flags == cfile->f_flags) {
++              unsigned int oflags = file->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
++              unsigned int cflags = cfile->f_flags & ~(O_CREAT|O_EXCL|O_TRUNC);
++
++              if (cifs_convert_flags(oflags, 0) == cifs_convert_flags(cflags, 0) &&
++                  (oflags & (O_SYNC|O_DIRECT)) == (cflags & (O_SYNC|O_DIRECT))) {
+                       file->private_data = cfile;
+                       spin_lock(&CIFS_I(inode)->deferred_lock);
+                       cifs_del_deferred_close(cfile);
+                       spin_unlock(&CIFS_I(inode)->deferred_lock);
+                       goto use_cache;
+-              } else {
+-                      _cifsFileInfo_put(cfile, true, false);
+               }
++              _cifsFileInfo_put(cfile, true, false);
+       } else {
+               /* hard link on the defeered close file */
+               rc = cifs_get_hardlink_path(tcon, inode, file);
+-- 
+2.39.5
+
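The matching rule after this fix can be tried directly: mask off O_CREAT|O_EXCL|O_TRUNC before comparing, since those are already satisfied by the original open, and require the access mode plus O_SYNC|O_DIRECT to agree. A small sketch approximating the comparison (cifs_convert_flags() is reduced to the POSIX access mode here; that simplification is an assumption):

    #define _GNU_SOURCE          /* for O_DIRECT on Linux */
    #include <fcntl.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool handles_match(unsigned int new_flags, unsigned int cached_flags)
    {
            unsigned int mask = ~(unsigned int)(O_CREAT | O_EXCL | O_TRUNC);
            unsigned int o = new_flags & mask;
            unsigned int c = cached_flags & mask;

            return (o & O_ACCMODE) == (c & O_ACCMODE) &&
                   (o & (O_SYNC | O_DIRECT)) == (c & (O_SYNC | O_DIRECT));
    }

    int main(void)
    {
            /* In the reproducer, the reopen differs from the cached
             * handle only in bits that don't change the server handle:
             * exact ->f_flags equality fails, the masked match works. */
            unsigned int cached = O_WRONLY | O_CREAT | O_TRUNC;
            unsigned int reopen = O_WRONLY | O_CREAT | O_APPEND;

            printf("exact f_flags match: %s\n",
                   cached == reopen ? "yes" : "no");       /* old check */
            printf("masked match       : %s\n",
                   handles_match(reopen, cached) ? "yes" : "no");
            return 0;
    }
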