]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.16-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 18 Aug 2025 10:51:42 +0000 (12:51 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 18 Aug 2025 10:51:42 +0000 (12:51 +0200)
added patches:
btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch
btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch
btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch
btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch
btrfs-don-t-skip-accounting-in-early-enotty-return-in-btrfs_uring_encoded_read.patch
btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch
btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch
btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch
btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch
btrfs-fix-ssd_spread-overallocation.patch
btrfs-fix-wrong-length-parameter-for-btrfs_cleanup_ordered_extents.patch
btrfs-populate-otime-when-logging-an-inode-item.patch
btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch
btrfs-qgroup-fix-race-between-quota-disable-and-quota-rescan-ioctl.patch
btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch
btrfs-send-use-fallocate-for-hole-punching-with-send-stream-v2.patch
btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch
btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch
btrfs-zoned-requeue-to-unused-block-group-list-if-zone-finish-failed.patch
btrfs-zoned-reserve-data_reloc-block-group-on-mount.patch
btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch
cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch
comedi-fix-race-between-polling-and-detaching.patch
thunderbolt-fix-copy-paste-error-in-match_service_id.patch
usb-typec-fusb302-cache-pd-rx-state.patch
xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch

27 files changed:
queue-6.16/btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch [new file with mode: 0644]
queue-6.16/btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch [new file with mode: 0644]
queue-6.16/btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch [new file with mode: 0644]
queue-6.16/btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch [new file with mode: 0644]
queue-6.16/btrfs-don-t-skip-accounting-in-early-enotty-return-in-btrfs_uring_encoded_read.patch [new file with mode: 0644]
queue-6.16/btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch [new file with mode: 0644]
queue-6.16/btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch [new file with mode: 0644]
queue-6.16/btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch [new file with mode: 0644]
queue-6.16/btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch [new file with mode: 0644]
queue-6.16/btrfs-fix-ssd_spread-overallocation.patch [new file with mode: 0644]
queue-6.16/btrfs-fix-wrong-length-parameter-for-btrfs_cleanup_ordered_extents.patch [new file with mode: 0644]
queue-6.16/btrfs-populate-otime-when-logging-an-inode-item.patch [new file with mode: 0644]
queue-6.16/btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch [new file with mode: 0644]
queue-6.16/btrfs-qgroup-fix-race-between-quota-disable-and-quota-rescan-ioctl.patch [new file with mode: 0644]
queue-6.16/btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch [new file with mode: 0644]
queue-6.16/btrfs-send-use-fallocate-for-hole-punching-with-send-stream-v2.patch [new file with mode: 0644]
queue-6.16/btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch [new file with mode: 0644]
queue-6.16/btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch [new file with mode: 0644]
queue-6.16/btrfs-zoned-requeue-to-unused-block-group-list-if-zone-finish-failed.patch [new file with mode: 0644]
queue-6.16/btrfs-zoned-reserve-data_reloc-block-group-on-mount.patch [new file with mode: 0644]
queue-6.16/btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch [new file with mode: 0644]
queue-6.16/cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch [new file with mode: 0644]
queue-6.16/comedi-fix-race-between-polling-and-detaching.patch [new file with mode: 0644]
queue-6.16/series
queue-6.16/thunderbolt-fix-copy-paste-error-in-match_service_id.patch [new file with mode: 0644]
queue-6.16/usb-typec-fusb302-cache-pd-rx-state.patch [new file with mode: 0644]
queue-6.16/xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch [new file with mode: 0644]

diff --git a/queue-6.16/btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch b/queue-6.16/btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch
new file mode 100644 (file)
index 0000000..938a681
--- /dev/null
@@ -0,0 +1,44 @@
+From 2a5898c4aac67494c2f0f7fe38373c95c371c930 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 21 May 2025 17:41:18 +0100
+Subject: btrfs: abort transaction during log replay if walk_log_tree() failed
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 2a5898c4aac67494c2f0f7fe38373c95c371c930 upstream.
+
+If we failed walking a log tree during replay, we have a missing
+transaction abort to prevent committing a transaction where we didn't
+fully replay all the changes from a log tree and therefore can leave the
+respective subvolume tree in some inconsistent state. So add the missing
+transaction abort.
+
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -7279,11 +7279,14 @@ again:
+               wc.replay_dest->log_root = log;
+               ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+-              if (ret)
++              if (ret) {
+                       /* The loop needs to continue due to the root refs */
+                       btrfs_abort_transaction(trans, ret);
+-              else
++              } else {
+                       ret = walk_log_tree(trans, log, &wc);
++                      if (ret)
++                              btrfs_abort_transaction(trans, ret);
++              }
+               if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
+                       ret = fixup_inode_link_counts(trans, wc.replay_dest,
diff --git a/queue-6.16/btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch b/queue-6.16/btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch
new file mode 100644 (file)
index 0000000..577e0b2
--- /dev/null
@@ -0,0 +1,36 @@
+From c0d013495a80cbb53e2288af7ae0ec4170aafd7c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 30 Jun 2025 10:50:46 +0100
+Subject: btrfs: clear dirty status from extent buffer on error at insert_new_root()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit c0d013495a80cbb53e2288af7ae0ec4170aafd7c upstream.
+
+If we failed to insert the tree mod log operation, we are not removing the
+dirty status from the allocated and dirtied extent buffer before we free
+it. Removing the dirty status is needed for several reasons such as to
+adjust the fs_info->dirty_metadata_bytes counter and remove the dirty
+status from the respective folios. So add the missing call to
+btrfs_clear_buffer_dirty().
+
+Fixes: f61aa7ba08ab ("btrfs: do not BUG_ON() on tree mod log failure at insert_new_root()")
+CC: stable@vger.kernel.org # 6.6+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -2872,6 +2872,7 @@ static noinline int insert_new_root(stru
+       if (ret < 0) {
+               int ret2;
++              btrfs_clear_buffer_dirty(trans, c);
+               ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
+               if (ret2 < 0)
+                       btrfs_abort_transaction(trans, ret2);
diff --git a/queue-6.16/btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch b/queue-6.16/btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch
new file mode 100644 (file)
index 0000000..24a6877
--- /dev/null
@@ -0,0 +1,124 @@
+From 4289b494ac553e74e86fed1c66b2bf9530bc1082 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Fri, 25 Jul 2025 20:33:25 +0930
+Subject: btrfs: do not allow relocation of partially dropped subvolumes
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 4289b494ac553e74e86fed1c66b2bf9530bc1082 upstream.
+
+[BUG]
+There is an internal report that balance triggered transaction abort,
+with the following call trace:
+
+  item 85 key (594509824 169 0) itemoff 12599 itemsize 33
+          extent refs 1 gen 197740 flags 2
+          ref#0: tree block backref root 7
+  item 86 key (594558976 169 0) itemoff 12566 itemsize 33
+          extent refs 1 gen 197522 flags 2
+          ref#0: tree block backref root 7
+ ...
+ BTRFS error (device loop0): extent item not found for insert, bytenr 594526208 num_bytes 16384 parent 449921024 root_objectid 934 owner 1 offset 0
+ BTRFS error (device loop0): failed to run delayed ref for logical 594526208 num_bytes 16384 type 182 action 1 ref_mod 1: -117
+ ------------[ cut here ]------------
+ BTRFS: Transaction aborted (error -117)
+ WARNING: CPU: 1 PID: 6963 at ../fs/btrfs/extent-tree.c:2168 btrfs_run_delayed_refs+0xfa/0x110 [btrfs]
+
+And btrfs check doesn't report anything wrong related to the extent
+tree.
+
+[CAUSE]
+The cause is a little complex, firstly the extent tree indeed doesn't
+have the backref for 594526208.
+
+The extent tree only have the following two backrefs around that bytenr
+on-disk:
+
+        item 65 key (594509824 METADATA_ITEM 0) itemoff 13880 itemsize 33
+                refs 1 gen 197740 flags TREE_BLOCK
+                tree block skinny level 0
+                (176 0x7) tree block backref root CSUM_TREE
+        item 66 key (594558976 METADATA_ITEM 0) itemoff 13847 itemsize 33
+                refs 1 gen 197522 flags TREE_BLOCK
+                tree block skinny level 0
+                (176 0x7) tree block backref root CSUM_TREE
+
+But the such missing backref item is not an corruption on disk, as the
+offending delayed ref belongs to subvolume 934, and that subvolume is
+being dropped:
+
+        item 0 key (934 ROOT_ITEM 198229) itemoff 15844 itemsize 439
+                generation 198229 root_dirid 256 bytenr 10741039104 byte_limit 0 bytes_used 345571328
+                last_snapshot 198229 flags 0x1000000000001(RDONLY) refs 0
+                drop_progress key (206324 EXTENT_DATA 2711650304) drop_level 2
+                level 2 generation_v2 198229
+
+And that offending tree block 594526208 is inside the dropped range of
+that subvolume.  That explains why there is no backref item for that
+bytenr and why btrfs check is not reporting anything wrong.
+
+But this also shows another problem, as btrfs will do all the orphan
+subvolume cleanup at a read-write mount.
+
+So half-dropped subvolume should not exist after an RW mount, and
+balance itself is also exclusive to subvolume cleanup, meaning we
+shouldn't hit a subvolume half-dropped during relocation.
+
+The root cause is, there is no orphan item for this subvolume.
+In fact there are 5 subvolumes from around 2021 that have the same
+problem.
+
+It looks like the original report has some older kernels running, and
+caused those zombie subvolumes.
+
+Thankfully upstream commit 8d488a8c7ba2 ("btrfs: fix subvolume/snapshot
+deletion not triggered on mount") has long fixed the bug.
+
+[ENHANCEMENT]
+For repairing such old fs, btrfs-progs will be enhanced.
+
+Considering how delayed the problem will show up (at run delayed ref
+time) and at that time we have to abort transaction already, it is too
+late.
+
+Instead here we reject any half-dropped subvolume for reloc tree at the
+earliest time, preventing confusion and extra time wasted on debugging
+similar bugs.
+
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/relocation.c |   19 +++++++++++++++++++
+ 1 file changed, 19 insertions(+)
+
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -594,6 +594,25 @@ static struct btrfs_root *create_reloc_r
+       if (btrfs_root_id(root) == objectid) {
+               u64 commit_root_gen;
++              /*
++               * Relocation will wait for cleaner thread, and any half-dropped
++               * subvolume will be fully cleaned up at mount time.
++               * So here we shouldn't hit a subvolume with non-zero drop_progress.
++               *
++               * If this isn't the case, error out since it can make us attempt to
++               * drop references for extents that were already dropped before.
++               */
++              if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) {
++                      struct btrfs_key cpu_key;
++
++                      btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress);
++                      btrfs_err(fs_info,
++      "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)",
++                                objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset);
++                      ret = -EUCLEAN;
++                      goto fail;
++              }
++
+               /* called by btrfs_init_reloc_root */
+               ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+                                     BTRFS_TREE_RELOC_OBJECTID);
diff --git a/queue-6.16/btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch b/queue-6.16/btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch
new file mode 100644 (file)
index 0000000..faf752f
--- /dev/null
@@ -0,0 +1,78 @@
+From 7ebf381a69421a88265d3c49cd0f007ba7336c9d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 11 Jul 2025 20:21:28 +0100
+Subject: btrfs: don't ignore inode missing when replaying log tree
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 7ebf381a69421a88265d3c49cd0f007ba7336c9d upstream.
+
+During log replay, at add_inode_ref(), we return -ENOENT if our current
+inode isn't found on the subvolume tree or if a parent directory isn't
+found. The error comes from btrfs_iget_logging() <- btrfs_iget() <-
+btrfs_read_locked_inode().
+
+The single caller of add_inode_ref(), replay_one_buffer(), ignores an
+-ENOENT error because it expects that error to mean only that a parent
+directory wasn't found and that is ok.
+
+Before commit 5f61b961599a ("btrfs: fix inode lookup error handling during
+log replay") we were converting any error when getting a parent directory
+to -ENOENT and any error when getting the current inode to -EIO, so our
+caller would fail log replay in case we can't find the current inode.
+After that commit however in case the current inode is not found we return
+-ENOENT to the caller and therefore it ignores the critical fact that the
+current inode was not found in the subvolume tree.
+
+Fix this by converting -ENOENT to 0 when we don't find a parent directory,
+returning -ENOENT when we don't find the current inode and making the
+caller, replay_one_buffer(), not ignore -ENOENT anymore.
+
+Fixes: 5f61b961599a ("btrfs: fix inode lookup error handling during log replay")
+CC: stable@vger.kernel.org # 6.16
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c |   14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1383,6 +1383,8 @@ static noinline int add_inode_ref(struct
+       dir = btrfs_iget_logging(parent_objectid, root);
+       if (IS_ERR(dir)) {
+               ret = PTR_ERR(dir);
++              if (ret == -ENOENT)
++                      ret = 0;
+               dir = NULL;
+               goto out;
+       }
+@@ -1407,6 +1409,15 @@ static noinline int add_inode_ref(struct
+                               if (IS_ERR(dir)) {
+                                       ret = PTR_ERR(dir);
+                                       dir = NULL;
++                                      /*
++                                       * A new parent dir may have not been
++                                       * logged and not exist in the subvolume
++                                       * tree, see the comment above before
++                                       * the loop when getting the first
++                                       * parent dir.
++                                       */
++                                      if (ret == -ENOENT)
++                                              ret = 0;
+                                       goto out;
+                               }
+                       }
+@@ -2519,9 +2530,8 @@ static int replay_one_buffer(struct btrf
+                          key.type == BTRFS_INODE_EXTREF_KEY) {
+                       ret = add_inode_ref(wc->trans, root, log, path,
+                                           eb, i, &key);
+-                      if (ret && ret != -ENOENT)
++                      if (ret)
+                               break;
+-                      ret = 0;
+               } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
+                       ret = replay_one_extent(wc->trans, root, path,
+                                               eb, i, &key);
diff --git a/queue-6.16/btrfs-don-t-skip-accounting-in-early-enotty-return-in-btrfs_uring_encoded_read.patch b/queue-6.16/btrfs-don-t-skip-accounting-in-early-enotty-return-in-btrfs_uring_encoded_read.patch
new file mode 100644 (file)
index 0000000..5c68e96
--- /dev/null
@@ -0,0 +1,36 @@
+From ea124ec327086325fc096abf42837dac471ac7ae Mon Sep 17 00:00:00 2001
+From: Caleb Sander Mateos <csander@purestorage.com>
+Date: Thu, 19 Jun 2025 13:27:45 -0600
+Subject: btrfs: don't skip accounting in early ENOTTY return in btrfs_uring_encoded_read()
+
+From: Caleb Sander Mateos <csander@purestorage.com>
+
+commit ea124ec327086325fc096abf42837dac471ac7ae upstream.
+
+btrfs_uring_encoded_read() returns early with -ENOTTY if the uring_cmd
+is issued with IO_URING_F_COMPAT but the kernel doesn't support compat
+syscalls. However, this early return bypasses the syscall accounting.
+Go to out_acct instead to ensure the syscall is counted.
+
+Fixes: 34310c442e17 ("btrfs: add io_uring command for encoded reads (ENCODED_READ ioctl)")
+CC: stable@vger.kernel.org # 6.15+
+Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ioctl.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -4829,7 +4829,8 @@ static int btrfs_uring_encoded_read(stru
+ #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
+               copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
+ #else
+-              return -ENOTTY;
++              ret = -ENOTTY;
++              goto out_acct;
+ #endif
+       } else {
+               copy_end = copy_end_kernel;
diff --git a/queue-6.16/btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch b/queue-6.16/btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch
new file mode 100644 (file)
index 0000000..02a4235
--- /dev/null
@@ -0,0 +1,90 @@
+From 24e066ded45b8147b79c7455ac43a5bff7b5f378 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 11 Jul 2025 20:48:23 +0100
+Subject: btrfs: don't skip remaining extrefs if dir not found during log replay
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 24e066ded45b8147b79c7455ac43a5bff7b5f378 upstream.
+
+During log replay, at add_inode_ref(), if we have an extref item that
+contains multiple extrefs and one of them points to a directory that does
+not exist in the subvolume tree, we are supposed to ignore it and process
+the remaining extrefs encoded in the extref item, since each extref can
+point to a different parent inode. However when that happens we just
+return from the function and ignore the remaining extrefs.
+
+The problem has been around since extrefs were introduced, in commit
+f186373fef00 ("btrfs: extended inode refs"), but it's hard to hit in
+practice because getting extref items encoding multiple extref requires
+getting a hash collision when computing the offset of the extref's
+key. The offset if computed like this:
+
+  key.offset = btrfs_extref_hash(dir_ino, name->name, name->len);
+
+and btrfs_extref_hash() is just a wrapper around crc32c().
+
+Fix this by moving to next iteration of the loop when we don't find
+the parent directory that an extref points to.
+
+Fixes: f186373fef00 ("btrfs: extended inode refs")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c |   18 ++++++++++++++----
+ 1 file changed, 14 insertions(+), 4 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -1400,6 +1400,8 @@ static noinline int add_inode_ref(struct
+               if (log_ref_ver) {
+                       ret = extref_get_fields(eb, ref_ptr, &name,
+                                               &ref_index, &parent_objectid);
++                      if (ret)
++                              goto out;
+                       /*
+                        * parent object can change from one array
+                        * item to another.
+@@ -1416,16 +1418,23 @@ static noinline int add_inode_ref(struct
+                                        * the loop when getting the first
+                                        * parent dir.
+                                        */
+-                                      if (ret == -ENOENT)
++                                      if (ret == -ENOENT) {
++                                              /*
++                                               * The next extref may refer to
++                                               * another parent dir that
++                                               * exists, so continue.
++                                               */
+                                               ret = 0;
++                                              goto next;
++                                      }
+                                       goto out;
+                               }
+                       }
+               } else {
+                       ret = ref_get_fields(eb, ref_ptr, &name, &ref_index);
++                      if (ret)
++                              goto out;
+               }
+-              if (ret)
+-                      goto out;
+               ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+                                  ref_index, &name);
+@@ -1459,10 +1468,11 @@ static noinline int add_inode_ref(struct
+               }
+               /* Else, ret == 1, we already have a perfect match, we're done. */
++next:
+               ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len;
+               kfree(name.name);
+               name.name = NULL;
+-              if (log_ref_ver) {
++              if (log_ref_ver && dir) {
+                       iput(&dir->vfs_inode);
+                       dir = NULL;
+               }
diff --git a/queue-6.16/btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch b/queue-6.16/btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch
new file mode 100644 (file)
index 0000000..05360bc
--- /dev/null
@@ -0,0 +1,78 @@
+From fc5799986fbca957e2e3c0480027f249951b7bcf Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 16 Jul 2025 11:41:21 +0100
+Subject: btrfs: error on missing block group when unaccounting log tree extent buffers
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit fc5799986fbca957e2e3c0480027f249951b7bcf upstream.
+
+Currently we only log an error message if we can't find the block group
+for a log tree extent buffer when unaccounting it (while freeing a log
+tree). A missing block group means something is seriously wrong and we
+end up leaking space from the metadata space info. So return -ENOENT in
+case we don't find the block group.
+
+CC: stable@vger.kernel.org # 6.12+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c |   19 +++++++------------
+ 1 file changed, 7 insertions(+), 12 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -2574,14 +2574,14 @@ static int replay_one_buffer(struct btrf
+ /*
+  * Correctly adjust the reserved bytes occupied by a log tree extent buffer
+  */
+-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
++static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
+ {
+       struct btrfs_block_group *cache;
+       cache = btrfs_lookup_block_group(fs_info, start);
+       if (!cache) {
+               btrfs_err(fs_info, "unable to find block group for %llu", start);
+-              return;
++              return -ENOENT;
+       }
+       spin_lock(&cache->space_info->lock);
+@@ -2592,27 +2592,22 @@ static void unaccount_log_buffer(struct
+       spin_unlock(&cache->space_info->lock);
+       btrfs_put_block_group(cache);
++
++      return 0;
+ }
+ static int clean_log_buffer(struct btrfs_trans_handle *trans,
+                           struct extent_buffer *eb)
+ {
+-      int ret;
+-
+       btrfs_tree_lock(eb);
+       btrfs_clear_buffer_dirty(trans, eb);
+       wait_on_extent_buffer_writeback(eb);
+       btrfs_tree_unlock(eb);
+-      if (trans) {
+-              ret = btrfs_pin_reserved_extent(trans, eb);
+-              if (ret)
+-                      return ret;
+-      } else {
+-              unaccount_log_buffer(eb->fs_info, eb->start);
+-      }
++      if (trans)
++              return btrfs_pin_reserved_extent(trans, eb);
+-      return 0;
++      return unaccount_log_buffer(eb->fs_info, eb->start);
+ }
+ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
diff --git a/queue-6.16/btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch b/queue-6.16/btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch
new file mode 100644 (file)
index 0000000..3770808
--- /dev/null
@@ -0,0 +1,92 @@
+From 7b632596188e1973c6b3ac1c9f8252f735e1039f Mon Sep 17 00:00:00 2001
+From: Boris Burkov <boris@bur.io>
+Date: Wed, 30 Jul 2025 09:29:23 -0700
+Subject: btrfs: fix iteration bug in __qgroup_excl_accounting()
+
+From: Boris Burkov <boris@bur.io>
+
+commit 7b632596188e1973c6b3ac1c9f8252f735e1039f upstream.
+
+__qgroup_excl_accounting() uses the qgroup iterator machinery to
+update the account of one qgroups usage for all its parent hierarchy,
+when we either add or remove a relation and have only exclusive usage.
+
+However, there is a small bug there: we loop with an extra iteration
+temporary qgroup called `cur` but never actually refer to that in the
+body of the loop. As a result, we redundantly account the same usage to
+the first qgroup in the list.
+
+This can be reproduced in the following way:
+
+  mkfs.btrfs -f -O squota <dev>
+  mount <dev> <mnt>
+  btrfs subvol create <mnt>/sv
+  dd if=/dev/zero of=<mnt>/sv/f bs=1M count=1
+  sync
+  btrfs qgroup create 1/100 <mnt>
+  btrfs qgroup create 2/200 <mnt>
+  btrfs qgroup assign 1/100 2/200 <mnt>
+  btrfs qgroup assign 0/256 1/100 <mnt>
+  btrfs qgroup show <mnt>
+
+and the broken result is (note the 2MiB on 1/100 and 0Mib on 2/100):
+
+  Qgroupid    Referenced    Exclusive   Path
+  --------    ----------    ---------   ----
+  0/5           16.00KiB     16.00KiB   <toplevel>
+  0/256          1.02MiB      1.02MiB   sv
+
+  Qgroupid    Referenced    Exclusive   Path
+  --------    ----------    ---------   ----
+  0/5           16.00KiB     16.00KiB   <toplevel>
+  0/256          1.02MiB      1.02MiB   sv
+  1/100          2.03MiB      2.03MiB   2/100<1 member qgroup>
+  2/100            0.00B        0.00B   <0 member qgroups>
+
+With this fix, which simply re-uses `qgroup` as the iteration variable,
+we see the expected result:
+
+  Qgroupid    Referenced    Exclusive   Path
+  --------    ----------    ---------   ----
+  0/5           16.00KiB     16.00KiB   <toplevel>
+  0/256          1.02MiB      1.02MiB   sv
+
+  Qgroupid    Referenced    Exclusive   Path
+  --------    ----------    ---------   ----
+  0/5           16.00KiB     16.00KiB   <toplevel>
+  0/256          1.02MiB      1.02MiB   sv
+  1/100          1.02MiB      1.02MiB   2/100<1 member qgroup>
+  2/100          1.02MiB      1.02MiB   <0 member qgroups>
+
+The existing fstests did not exercise two layer inheritance so this bug
+was missed. I intend to add that testing there, as well.
+
+Fixes: a0bdc04b0732 ("btrfs: qgroup: use qgroup_iterator in __qgroup_excl_accounting()")
+CC: stable@vger.kernel.org # 6.12+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -1481,7 +1481,6 @@ static int __qgroup_excl_accounting(stru
+                                   struct btrfs_qgroup *src, int sign)
+ {
+       struct btrfs_qgroup *qgroup;
+-      struct btrfs_qgroup *cur;
+       LIST_HEAD(qgroup_list);
+       u64 num_bytes = src->excl;
+       int ret = 0;
+@@ -1491,7 +1490,7 @@ static int __qgroup_excl_accounting(stru
+               goto out;
+       qgroup_iterator_add(&qgroup_list, qgroup);
+-      list_for_each_entry(cur, &qgroup_list, iterator) {
++      list_for_each_entry(qgroup, &qgroup_list, iterator) {
+               struct btrfs_qgroup_list *glist;
+               qgroup->rfer += sign * num_bytes;
diff --git a/queue-6.16/btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch b/queue-6.16/btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch
new file mode 100644 (file)
index 0000000..3545019
--- /dev/null
@@ -0,0 +1,147 @@
+From 0a32e4f0025a74c70dcab4478e9b29c22f5ecf2f Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 30 Jul 2025 19:18:37 +0100
+Subject: btrfs: fix log tree replay failure due to file with 0 links and extents
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 0a32e4f0025a74c70dcab4478e9b29c22f5ecf2f upstream.
+
+If we log a new inode (not persisted in a past transaction) that has 0
+links and extents, then log another inode with a higher inode number, we
+end up failing to replay the log tree with -EINVAL. The steps for
+this are:
+
+1) create new file A
+2) write some data to file A
+3) open an fd on file A
+4) unlink file A
+5) fsync file A using the previously open fd
+6) create file B (has higher inode number than file A)
+7) fsync file B
+8) power fail before current transaction commits
+
+Now when attempting to mount the fs, the log replay will fail with
+-ENOENT at replay_one_extent() when attempting to replay the first
+extent of file A. The failure comes when trying to open the inode for
+file A in the subvolume tree, since it doesn't exist.
+
+Before commit 5f61b961599a ("btrfs: fix inode lookup error handling
+during log replay"), the returned error was -EIO instead of -ENOENT,
+since we converted any errors when attempting to read an inode during
+log replay to -EIO.
+
+The reason for this is that the log replay procedure fails to ignore
+the current inode when we are at the stage LOG_WALK_REPLAY_ALL, our
+current inode has 0 links and last inode we processed in the previous
+stage has a non 0 link count. In other words, the issue is that at
+replay_one_extent() we only update wc->ignore_cur_inode if the current
+replay stage is LOG_WALK_REPLAY_INODES.
+
+Fix this by updating wc->ignore_cur_inode whenever we find an inode item
+regardless of the current replay stage. This is a simple solution and easy
+to backport, but later we can do other alternatives like avoid logging
+extents or inode items other than the inode item for inodes with a link
+count of 0.
+
+The problem with the wc->ignore_cur_inode logic has been around since
+commit f2d72f42d5fa ("Btrfs: fix warning when replaying log after fsync
+of a tmpfile") but it only became frequent to hit since the more recent
+commit 5e85262e542d ("btrfs: fix fsync of files with no hard links not
+persisting deletion"), because we stopped skipping inodes with a link
+count of 0 when logging, while before the problem would only be triggered
+if trying to replay a log tree created with an older kernel which has a
+logged inode with 0 links.
+
+A test case for fstests will be submitted soon.
+
+Reported-by: Peter Jung <ptr1337@cachyos.org>
+Link: https://lore.kernel.org/linux-btrfs/fce139db-4458-4788-bb97-c29acf6cb1df@cachyos.org/
+Reported-by: burneddi <burneddi@protonmail.com>
+Link: https://lore.kernel.org/linux-btrfs/lh4W-Lwc0Mbk-QvBhhQyZxf6VbM3E8VtIvU3fPIQgweP_Q1n7wtlUZQc33sYlCKYd-o6rryJQfhHaNAOWWRKxpAXhM8NZPojzsJPyHMf2qY=@protonmail.com/#t
+Reported-by: Russell Haley <yumpusamongus@gmail.com>
+Link: https://lore.kernel.org/linux-btrfs/598ecc75-eb80-41b3-83c2-f2317fbb9864@gmail.com/
+Fixes: f2d72f42d5fa ("Btrfs: fix warning when replaying log after fsync of a tmpfile")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c |   48 ++++++++++++++++++++++++++++++------------------
+ 1 file changed, 30 insertions(+), 18 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -321,8 +321,7 @@ struct walk_control {
+       /*
+        * Ignore any items from the inode currently being processed. Needs
+-       * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+-       * the LOG_WALK_REPLAY_INODES stage.
++       * to be set every time we find a BTRFS_INODE_ITEM_KEY.
+        */
+       bool ignore_cur_inode;
+@@ -2434,23 +2433,30 @@ static int replay_one_buffer(struct btrf
+       nritems = btrfs_header_nritems(eb);
+       for (i = 0; i < nritems; i++) {
+-              btrfs_item_key_to_cpu(eb, &key, i);
++              struct btrfs_inode_item *inode_item;
+-              /* inode keys are done during the first stage */
+-              if (key.type == BTRFS_INODE_ITEM_KEY &&
+-                  wc->stage == LOG_WALK_REPLAY_INODES) {
+-                      struct btrfs_inode_item *inode_item;
+-                      u32 mode;
++              btrfs_item_key_to_cpu(eb, &key, i);
+-                      inode_item = btrfs_item_ptr(eb, i,
+-                                          struct btrfs_inode_item);
++              if (key.type == BTRFS_INODE_ITEM_KEY) {
++                      inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item);
+                       /*
+-                       * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+-                       * and never got linked before the fsync, skip it, as
+-                       * replaying it is pointless since it would be deleted
+-                       * later. We skip logging tmpfiles, but it's always
+-                       * possible we are replaying a log created with a kernel
+-                       * that used to log tmpfiles.
++                       * An inode with no links is either:
++                       *
++                       * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never
++                       *    got linked before the fsync, skip it, as replaying
++                       *    it is pointless since it would be deleted later.
++                       *    We skip logging tmpfiles, but it's always possible
++                       *    we are replaying a log created with a kernel that
++                       *    used to log tmpfiles;
++                       *
++                       * 2) A non-tmpfile which got its last link deleted
++                       *    while holding an open fd on it and later got
++                       *    fsynced through that fd. We always log the
++                       *    parent inodes when inode->last_unlink_trans is
++                       *    set to the current transaction, so ignore all the
++                       *    inode items for this inode. We will delete the
++                       *    inode when processing the parent directory with
++                       *    replay_dir_deletes().
+                        */
+                       if (btrfs_inode_nlink(eb, inode_item) == 0) {
+                               wc->ignore_cur_inode = true;
+@@ -2458,8 +2464,14 @@ static int replay_one_buffer(struct btrf
+                       } else {
+                               wc->ignore_cur_inode = false;
+                       }
+-                      ret = replay_xattr_deletes(wc->trans, root, log,
+-                                                 path, key.objectid);
++              }
++
++              /* Inode keys are done during the first stage. */
++              if (key.type == BTRFS_INODE_ITEM_KEY &&
++                  wc->stage == LOG_WALK_REPLAY_INODES) {
++                      u32 mode;
++
++                      ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid);
+                       if (ret)
+                               break;
+                       mode = btrfs_inode_mode(eb, inode_item);
diff --git a/queue-6.16/btrfs-fix-ssd_spread-overallocation.patch b/queue-6.16/btrfs-fix-ssd_spread-overallocation.patch
new file mode 100644 (file)
index 0000000..a7edc05
--- /dev/null
@@ -0,0 +1,134 @@
+From 807d9023e75fc20bfd6dd2ac0408ce4af53f1648 Mon Sep 17 00:00:00 2001
+From: Boris Burkov <boris@bur.io>
+Date: Mon, 14 Jul 2025 16:44:28 -0700
+Subject: btrfs: fix ssd_spread overallocation
+
+From: Boris Burkov <boris@bur.io>
+
+commit 807d9023e75fc20bfd6dd2ac0408ce4af53f1648 upstream.
+
+If the ssd_spread mount option is enabled, then we run the so called
+clustered allocator for data block groups. In practice, this results in
+creating a btrfs_free_cluster which caches a block_group and borrows its
+free extents for allocation.
+
+Since the introduction of allocation size classes in 6.1, there has been
+a bug in the interaction between that feature and ssd_spread.
+find_free_extent() has a number of nested loops. The loop going over the
+allocation stages, stored in ffe_ctl->loop and managed by
+find_free_extent_update_loop(), the loop over the raid levels, and the
+loop over all the block_groups in a space_info. The size class feature
+relies on the block_group loop to ensure it gets a chance to see a
+block_group of a given size class.  However, the clustered allocator
+uses the cached cluster block_group and breaks that loop. Each call to
+do_allocation() will really just go back to the same cached block_group.
+Normally, this is OK, as the allocation either succeeds and we don't
+want to loop any more or it fails, and we clear the cluster and return
+its space to the block_group.
+
+But with size classes, the allocation can succeed, then later fail,
+outside of do_allocation() due to size class mismatch. That latter
+failure is not properly handled due to the highly complex multi loop
+logic. The result is a painful loop where we continue to allocate the
+same num_bytes from the cluster in a tight loop until it fails and
+releases the cluster and lets us try a new block_group. But by then, we
+have skipped great swaths of the available block_groups and are likely
+to fail to allocate, looping the outer loop. In pathological cases like
+the reproducer below, the cached block_group is often the very last one,
+in which case we don't perform this tight bg loop but instead rip
+through the ffe stages to LOOP_CHUNK_ALLOC and allocate a chunk, which
+is now the last one, and we enter the tight inner loop until an
+allocation failure. Then allocation succeeds on the final block_group
+and if the next allocation is a size mismatch, the exact same thing
+happens again.
+
+Triggering this is as easy as mounting with -o ssd_spread and then
+running:
+
+  mount -o ssd_spread $dev $mnt
+  dd if=/dev/zero of=$mnt/big bs=16M count=1 &>/dev/null
+  dd if=/dev/zero of=$mnt/med bs=4M count=1 &>/dev/null
+  sync
+
+if you do the two writes + sync in a loop, you can force btrfs to spin
+an excessive amount on semi-successful clustered allocations, before
+ultimately failing and advancing to the stage where we force a chunk
+allocation. This results in 2G of data allocated per iteration, despite
+only using ~20M of data. By using a small size classed extent, the inner
+loop takes longer and we can spin for longer.
+
+The simplest, shortest term fix to unbreak this is to make the clustered
+allocator size_class aware in the dumbest way, where it fails on size
+class mismatch. This may hinder the operation of the clustered
+allocator, but better hindered than completely broken and terribly
+overallocating.
+
+Further re-design improvements are also in the works.
+
+Fixes: 52bb7a2166af ("btrfs: introduce size class to block group allocator")
+CC: stable@vger.kernel.org # 6.1+
+Reported-by: David Sterba <dsterba@suse.com>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c |   33 +++++++++++++++++----------------
+ 1 file changed, 17 insertions(+), 16 deletions(-)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -3649,6 +3649,21 @@ btrfs_release_block_group(struct btrfs_b
+       btrfs_put_block_group(cache);
+ }
++static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl,
++                                            const struct btrfs_block_group *bg)
++{
++      if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
++              return true;
++      if (!btrfs_block_group_should_use_size_class(bg))
++              return true;
++      if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
++              return true;
++      if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
++          bg->size_class == BTRFS_BG_SZ_NONE)
++              return true;
++      return ffe_ctl->size_class == bg->size_class;
++}
++
+ /*
+  * Helper function for find_free_extent().
+  *
+@@ -3670,7 +3685,8 @@ static int find_free_extent_clustered(st
+       if (!cluster_bg)
+               goto refill_cluster;
+       if (cluster_bg != bg && (cluster_bg->ro ||
+-          !block_group_bits(cluster_bg, ffe_ctl->flags)))
++          !block_group_bits(cluster_bg, ffe_ctl->flags) ||
++          !find_free_extent_check_size_class(ffe_ctl, cluster_bg)))
+               goto release_cluster;
+       offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
+@@ -4227,21 +4243,6 @@ static int find_free_extent_update_loop(
+       return -ENOSPC;
+ }
+-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl,
+-                                            struct btrfs_block_group *bg)
+-{
+-      if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED)
+-              return true;
+-      if (!btrfs_block_group_should_use_size_class(bg))
+-              return true;
+-      if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS)
+-              return true;
+-      if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS &&
+-          bg->size_class == BTRFS_BG_SZ_NONE)
+-              return true;
+-      return ffe_ctl->size_class == bg->size_class;
+-}
+-
+ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
+                                       struct find_free_extent_ctl *ffe_ctl,
+                                       struct btrfs_space_info *space_info,
diff --git a/queue-6.16/btrfs-fix-wrong-length-parameter-for-btrfs_cleanup_ordered_extents.patch b/queue-6.16/btrfs-fix-wrong-length-parameter-for-btrfs_cleanup_ordered_extents.patch
new file mode 100644 (file)
index 0000000..1a4a02b
--- /dev/null
@@ -0,0 +1,48 @@
+From deaf895212da74635a7f0a420e1ecf8f5eca1fe5 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Sun, 20 Jul 2025 15:01:39 +0930
+Subject: btrfs: fix wrong length parameter for btrfs_cleanup_ordered_extents()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit deaf895212da74635a7f0a420e1ecf8f5eca1fe5 upstream.
+
+Inside nocow_one_range(), if the checksum cloning for data reloc inode
+failed, we call btrfs_cleanup_ordered_extents() to cleanup the just
+allocated ordered extents.
+
+But unlike extent_clear_unlock_delalloc(),
+btrfs_cleanup_ordered_extents() requires a length, not an inclusive end
+bytenr.
+
+This can be problematic, as the @end is normally way larger than @len.
+
+This means btrfs_cleanup_ordered_extents() can be called on folios
+out of the correct range, and if the out-of-range folio is under
+writeback, we can incorrectly clear the ordered flag of the folio, and
+trigger the DEBUG_WARN() inside btrfs_writepage_cow_fixup().
+
+Fix the wrong parameter with correct length instead.
+
+Fixes: 94f6c5c17e52 ("btrfs: move ordered extent cleanup to where they are allocated")
+CC: stable@vger.kernel.org # 6.15+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -2010,7 +2010,7 @@ static int nocow_one_range(struct btrfs_
+        * cleaered by the caller.
+        */
+       if (ret < 0)
+-              btrfs_cleanup_ordered_extents(inode, file_pos, end);
++              btrfs_cleanup_ordered_extents(inode, file_pos, len);
+       return ret;
+ }
diff --git a/queue-6.16/btrfs-populate-otime-when-logging-an-inode-item.patch b/queue-6.16/btrfs-populate-otime-when-logging-an-inode-item.patch
new file mode 100644 (file)
index 0000000..d3c455c
--- /dev/null
@@ -0,0 +1,109 @@
+From 1ef94169db0958d6de39f9ea6e063ce887342e2d Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 2 Jul 2025 15:08:13 +0930
+Subject: btrfs: populate otime when logging an inode item
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 1ef94169db0958d6de39f9ea6e063ce887342e2d upstream.
+
+[TEST FAILURE WITH EXPERIMENTAL FEATURES]
+When running test case generic/508, the test case will fail with the new
+btrfs shutdown support:
+
+generic/508       - output mismatch (see /home/adam/xfstests/results//generic/508.out.bad)
+#    --- tests/generic/508.out 2022-05-11 11:25:30.806666664 +0930
+#    +++ /home/adam/xfstests/results//generic/508.out.bad      2025-07-02 14:53:22.401824212 +0930
+#    @@ -1,2 +1,6 @@
+#     QA output created by 508
+#     Silence is golden
+#    +Before:
+#    +After : stat.btime = Thu Jan  1 09:30:00 1970
+#    +Before:
+#    +After : stat.btime = Wed Jul  2 14:53:22 2025
+#    ...
+#    (Run 'diff -u /home/adam/xfstests/tests/generic/508.out /home/adam/xfstests/results//generic/508.out.bad'  to see the entire diff)
+Ran: generic/508
+Failures: generic/508
+Failed 1 of 1 tests
+
+Please note that the test case requires shutdown support, thus the test
+case will be skipped using the current upstream kernel, as it doesn't
+have shutdown ioctl support.
+
+[CAUSE]
+The direct cause is the 0 time stamp in the log tree:
+
+leaf 30507008 items 2 free space 16057 generation 9 owner TREE_LOG
+leaf 30507008 flags 0x1(WRITTEN) backref revision 1
+checksum stored e522548d
+checksum calced e522548d
+fs uuid 57d45451-481e-43e4-aa93-289ad707a3a0
+chunk uuid d52bd3fd-5163-4337-98a7-7986993ad398
+       item 0 key (257 INODE_ITEM 0) itemoff 16123 itemsize 160
+               generation 9 transid 9 size 0 nbytes 0
+               block group 0 mode 100644 links 1 uid 0 gid 0 rdev 0
+               sequence 1 flags 0x0(none)
+               atime 1751432947.492000000 (2025-07-02 14:39:07)
+               ctime 1751432947.492000000 (2025-07-02 14:39:07)
+               mtime 1751432947.492000000 (2025-07-02 14:39:07)
+               otime 0.0 (1970-01-01 09:30:00) <<<
+
+But the old fs tree has all the correct time stamp:
+
+btrfs-progs v6.12
+fs tree key (FS_TREE ROOT_ITEM 0)
+leaf 30425088 items 2 free space 16061 generation 5 owner FS_TREE
+leaf 30425088 flags 0x1(WRITTEN) backref revision 1
+checksum stored 48f6c57e
+checksum calced 48f6c57e
+fs uuid 57d45451-481e-43e4-aa93-289ad707a3a0
+chunk uuid d52bd3fd-5163-4337-98a7-7986993ad398
+       item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160
+               generation 3 transid 0 size 0 nbytes 16384
+               block group 0 mode 40755 links 1 uid 0 gid 0 rdev 0
+               sequence 0 flags 0x0(none)
+               atime 1751432947.0 (2025-07-02 14:39:07)
+               ctime 1751432947.0 (2025-07-02 14:39:07)
+               mtime 1751432947.0 (2025-07-02 14:39:07)
+               otime 1751432947.0 (2025-07-02 14:39:07) <<<
+
+The root cause is that fill_inode_item() in tree-log.c is only
+populating a/c/m time, not the otime (or btime in statx output).
+
+Part of the reason is that, the vfs inode only has a/c/m time, no native
+btime support yet.
+
+[FIX]
+Thankfully btrfs has its otime stored in btrfs_inode::i_otime_sec and
+btrfs_inode::i_otime_nsec.
+
+So what we really need is just fill the otime time stamp in
+fill_inode_item() of tree-log.c
+
+There is another fill_inode_item() in inode.c, which is doing the proper
+otime population.
+
+Fixes: 94edf4ae43a5 ("Btrfs: don't bother committing delayed inode updates when fsyncing")
+CC: stable@vger.kernel.org
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -4221,6 +4221,9 @@ static void fill_inode_item(struct btrfs
+       btrfs_set_token_timespec_nsec(&token, &item->ctime,
+                                     inode_get_ctime_nsec(inode));
++      btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
++      btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
++
+       /*
+        * We do not need to set the nbytes field, in fact during a fast fsync
+        * its value may not even be correct, since a fast fsync does not wait
diff --git a/queue-6.16/btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch b/queue-6.16/btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch
new file mode 100644 (file)
index 0000000..2aea347
--- /dev/null
@@ -0,0 +1,75 @@
+From 08530d6e638427e7e1344bd67bacc03882ba95b9 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 1 Jul 2025 15:44:16 +0100
+Subject: btrfs: qgroup: fix qgroup create ioctl returning success after quotas disabled
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 08530d6e638427e7e1344bd67bacc03882ba95b9 upstream.
+
+When quotas are disabled qgroup ioctls are supposed to return -ENOTCONN,
+but the qgroup create ioctl stopped doing that when it races with a quota
+disable operation, returning 0 instead. This change of behaviour happened
+in commit 6ed05643ddb1 ("btrfs: create qgroup earlier in snapshot
+creation").
+
+The issue happens as follows:
+
+1) Task A enters btrfs_ioctl_qgroup_create(), qgroups are enabled and so
+   qgroup_enabled() returns true since fs_info->quota_root is not NULL;
+
+2) Task B enters btrfs_ioctl_quota_ctl() -> btrfs_quota_disable() and
+   disables qgroups, so now fs_info->quota_root is NULL;
+
+3) Task A enters btrfs_create_qgroup() and calls btrfs_qgroup_mode(),
+   which returns BTRFS_QGROUP_MODE_DISABLED since quotas are disabled,
+   and then btrfs_create_qgroup() returns 0 to the caller, which makes
+   the ioctl return 0 instead of -ENOTCONN.
+
+   The check for fs_info->quota_root and returning -ENOTCONN if it's NULL
+   is made only after the call btrfs_qgroup_mode().
+
+Fix this by moving the check for disabled quotas with btrfs_qgroup_mode()
+into transaction.c:create_pending_snapshot(), so that we don't abort the
+transaction if btrfs_create_qgroup() returns -ENOTCONN and quotas are
+disabled.
+
+Fixes: 6ed05643ddb1 ("btrfs: create qgroup earlier in snapshot creation")
+CC: stable@vger.kernel.org # 6.12+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c      |    3 ---
+ fs/btrfs/transaction.c |    6 ++++--
+ 2 files changed, 4 insertions(+), 5 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -1690,9 +1690,6 @@ int btrfs_create_qgroup(struct btrfs_tra
+       struct btrfs_qgroup *prealloc = NULL;
+       int ret = 0;
+-      if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED)
+-              return 0;
+-
+       mutex_lock(&fs_info->qgroup_ioctl_lock);
+       if (!fs_info->quota_root) {
+               ret = -ENOTCONN;
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -1735,8 +1735,10 @@ static noinline int create_pending_snaps
+       ret = btrfs_create_qgroup(trans, objectid);
+       if (ret && ret != -EEXIST) {
+-              btrfs_abort_transaction(trans, ret);
+-              goto fail;
++              if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) {
++                      btrfs_abort_transaction(trans, ret);
++                      goto fail;
++              }
+       }
+       /*
diff --git a/queue-6.16/btrfs-qgroup-fix-race-between-quota-disable-and-quota-rescan-ioctl.patch b/queue-6.16/btrfs-qgroup-fix-race-between-quota-disable-and-quota-rescan-ioctl.patch
new file mode 100644 (file)
index 0000000..11c2f8b
--- /dev/null
@@ -0,0 +1,108 @@
+From e1249667750399a48cafcf5945761d39fa584edf Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 30 Jun 2025 13:19:20 +0100
+Subject: btrfs: qgroup: fix race between quota disable and quota rescan ioctl
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit e1249667750399a48cafcf5945761d39fa584edf upstream.
+
+There's a race between a task disabling quotas and another running the
+rescan ioctl that can result in a use-after-free of qgroup records from
+the fs_info->qgroup_tree rbtree.
+
+This happens as follows:
+
+1) Task A enters btrfs_ioctl_quota_rescan() -> btrfs_qgroup_rescan();
+
+2) Task B enters btrfs_quota_disable() and calls
+   btrfs_qgroup_wait_for_completion(), which does nothing because at that
+   point fs_info->qgroup_rescan_running is false (it wasn't set yet by
+   task A);
+
+3) Task B calls btrfs_free_qgroup_config() which starts freeing qgroups
+   from fs_info->qgroup_tree without taking the lock fs_info->qgroup_lock;
+
+4) Task A enters qgroup_rescan_zero_tracking() which starts iterating
+   the fs_info->qgroup_tree tree while holding fs_info->qgroup_lock,
+   but task B is freeing qgroup records from that tree without holding
+   the lock, resulting in a use-after-free.
+
+Fix this by taking fs_info->qgroup_lock at btrfs_free_qgroup_config().
+Also at btrfs_qgroup_rescan() don't start the rescan worker if quotas
+were already disabled.
+
+Reported-by: cen zhang <zzzccc427@gmail.com>
+Link: https://lore.kernel.org/linux-btrfs/CAFRLqsV+cMDETFuzqdKSHk_FDm6tneea45krsHqPD6B3FetLpQ@mail.gmail.com/
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |   31 ++++++++++++++++++++++++-------
+ 1 file changed, 24 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -630,22 +630,30 @@ bool btrfs_check_quota_leak(const struct
+ /*
+  * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
+- * first two are in single-threaded paths.And for the third one, we have set
+- * quota_root to be null with qgroup_lock held before, so it is safe to clean
+- * up the in-memory structures without qgroup_lock held.
++ * first two are in single-threaded paths.
+  */
+ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
+ {
+       struct rb_node *n;
+       struct btrfs_qgroup *qgroup;
++      /*
++       * btrfs_quota_disable() can be called concurrently with
++       * btrfs_qgroup_rescan() -> qgroup_rescan_zero_tracking(), so take the
++       * lock.
++       */
++      spin_lock(&fs_info->qgroup_lock);
+       while ((n = rb_first(&fs_info->qgroup_tree))) {
+               qgroup = rb_entry(n, struct btrfs_qgroup, node);
+               rb_erase(n, &fs_info->qgroup_tree);
+               __del_qgroup_rb(qgroup);
++              spin_unlock(&fs_info->qgroup_lock);
+               btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+               kfree(qgroup);
++              spin_lock(&fs_info->qgroup_lock);
+       }
++      spin_unlock(&fs_info->qgroup_lock);
++
+       /*
+        * We call btrfs_free_qgroup_config() when unmounting
+        * filesystem and disabling quota, so we set qgroup_ulist
+@@ -4039,12 +4047,21 @@ btrfs_qgroup_rescan(struct btrfs_fs_info
+       qgroup_rescan_zero_tracking(fs_info);
+       mutex_lock(&fs_info->qgroup_rescan_lock);
+-      fs_info->qgroup_rescan_running = true;
+-      btrfs_queue_work(fs_info->qgroup_rescan_workers,
+-                       &fs_info->qgroup_rescan_work);
++      /*
++       * The rescan worker is only for full accounting qgroups, check if it's
++       * enabled as it is pointless to queue it otherwise. A concurrent quota
++       * disable may also have just cleared BTRFS_FS_QUOTA_ENABLED.
++       */
++      if (btrfs_qgroup_full_accounting(fs_info)) {
++              fs_info->qgroup_rescan_running = true;
++              btrfs_queue_work(fs_info->qgroup_rescan_workers,
++                               &fs_info->qgroup_rescan_work);
++      } else {
++              ret = -ENOTCONN;
++      }
+       mutex_unlock(&fs_info->qgroup_rescan_lock);
+-      return 0;
++      return ret;
+ }
+ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
diff --git a/queue-6.16/btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch b/queue-6.16/btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch
new file mode 100644 (file)
index 0000000..7a92e80
--- /dev/null
@@ -0,0 +1,48 @@
+From e41c75ca3189341e76e6af64b857c05b68a1d7db Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 1 Jul 2025 11:39:44 +0100
+Subject: btrfs: qgroup: set quota enabled bit if quota disable fails flushing reservations
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit e41c75ca3189341e76e6af64b857c05b68a1d7db upstream.
+
+Before waiting for the rescan worker to finish and flushing reservations,
+we clear the BTRFS_FS_QUOTA_ENABLED flag from fs_info. If we fail flushing
+reservations we leave with the flag not set which is not correct since
+quotas are still enabled - we must set back the flag on error paths, such
+as when we fail to start a transaction, except for error paths that abort
+a transaction. The reservation flushing happens very early before we do
+any operation that actually disables quotas and before we start a
+transaction, so set back BTRFS_FS_QUOTA_ENABLED if it fails.
+
+Fixes: af0e2aab3b70 ("btrfs: qgroup: flush reservations during quota disable")
+CC: stable@vger.kernel.org # 6.12+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -1354,11 +1354,14 @@ int btrfs_quota_disable(struct btrfs_fs_
+       /*
+        * We have nothing held here and no trans handle, just return the error
+-       * if there is one.
++       * if there is one and set back the quota enabled bit since we didn't
++       * actually disable quotas.
+        */
+       ret = flush_reservations(fs_info);
+-      if (ret)
++      if (ret) {
++              set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+               return ret;
++      }
+       /*
+        * 1 For the root item
diff --git a/queue-6.16/btrfs-send-use-fallocate-for-hole-punching-with-send-stream-v2.patch b/queue-6.16/btrfs-send-use-fallocate-for-hole-punching-with-send-stream-v2.patch
new file mode 100644 (file)
index 0000000..43dbf9e
--- /dev/null
@@ -0,0 +1,110 @@
+From 005b0a0c24e1628313e951516b675109a92cacfe Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 18 Jul 2025 13:07:29 +0100
+Subject: btrfs: send: use fallocate for hole punching with send stream v2
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 005b0a0c24e1628313e951516b675109a92cacfe upstream.
+
+Currently holes are sent as writes full of zeroes, which results in
+unnecessarily using disk space at the receiving end and increasing the
+stream size.
+
+In some cases we avoid sending writes of zeroes, like during a full
+send operation where we just skip writes for holes.
+
+But for some cases we fill previous holes with writes of zeroes too, like
+in this scenario:
+
+1) We have a file with a hole in the range [2M, 3M), we snapshot the
+   subvolume and do a full send. The range [2M, 3M) stays as a hole at
+   the receiver since we skip sending write commands full of zeroes;
+
+2) We punch a hole for the range [3M, 4M) in our file, so that now it
+   has a 2M hole in the range [2M, 4M), and snapshot the subvolume.
+   Now if we do an incremental send, we will send write commands full
+   of zeroes for the range [2M, 4M), removing the hole for [2M, 3M) at
+   the receiver.
+
+We could improve cases such as this last one by doing additional
+comparisons of file extent items (or their absence) between the parent
+and send snapshots, but that's a lot of code to add plus additional CPU
+and IO costs.
+
+Since the send stream v2 already has a fallocate command and btrfs-progs
+implements a callback to execute fallocate since the send stream v2
+support was added to it, update the kernel to use fallocate for punching
+holes for V2+ streams.
+
+Test coverage is provided by btrfs/284 which is a version of btrfs/007
+that exercises send stream v2 instead of v1, using fsstress with random
+operations and fssum to verify file contents.
+
+Link: https://github.com/kdave/btrfs-progs/issues/1001
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/send.c |   33 +++++++++++++++++++++++++++++++++
+ 1 file changed, 33 insertions(+)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -4,6 +4,7 @@
+  */
+ #include <linux/bsearch.h>
++#include <linux/falloc.h>
+ #include <linux/fs.h>
+ #include <linux/file.h>
+ #include <linux/sort.h>
+@@ -5411,6 +5412,30 @@ tlv_put_failure:
+       return ret;
+ }
++static int send_fallocate(struct send_ctx *sctx, u32 mode, u64 offset, u64 len)
++{
++      struct fs_path *path;
++      int ret;
++
++      path = get_cur_inode_path(sctx);
++      if (IS_ERR(path))
++              return PTR_ERR(path);
++
++      ret = begin_cmd(sctx, BTRFS_SEND_C_FALLOCATE);
++      if (ret < 0)
++              return ret;
++
++      TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
++      TLV_PUT_U32(sctx, BTRFS_SEND_A_FALLOCATE_MODE, mode);
++      TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
++      TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);
++
++      ret = send_cmd(sctx);
++
++tlv_put_failure:
++      return ret;
++}
++
+ static int send_hole(struct send_ctx *sctx, u64 end)
+ {
+       struct fs_path *p = NULL;
+@@ -5419,6 +5444,14 @@ static int send_hole(struct send_ctx *sc
+       int ret = 0;
+       /*
++       * Starting with send stream v2 we have fallocate and can use it to
++       * punch holes instead of sending writes full of zeroes.
++       */
++      if (proto_cmd_ok(sctx, BTRFS_SEND_C_FALLOCATE))
++              return send_fallocate(sctx, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
++                                    offset, end - offset);
++
++      /*
+        * A hole that starts at EOF or beyond it. Since we do not yet support
+        * fallocate (for extent preallocation and hole punching), sending a
+        * write of zeroes starting at EOF or beyond would later require issuing
diff --git a/queue-6.16/btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch b/queue-6.16/btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch
new file mode 100644 (file)
index 0000000..af7622b
--- /dev/null
@@ -0,0 +1,88 @@
+From 3061801420469610c8fa6080a950e56770773ef1 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Sun, 29 Jun 2025 23:07:42 +0900
+Subject: btrfs: zoned: do not remove unwritten non-data block group
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 3061801420469610c8fa6080a950e56770773ef1 upstream.
+
+There are some reports of the "unable to find chunk map for logical 2147483648
+length 16384" error message appearing in dmesg. This means some IOs are
+occurring after a block group is removed.
+
+When a metadata tree node is cleaned on a zoned setup, we keep that node
+still dirty and write it out not to create a write hole. However, this can
+make a block group's used bytes == 0 while there is a dirty region left.
+
+Such an unused block group is moved into the unused_bg list and processed
+for removal. When the removal succeeds, the block group is removed from the
+transaction->dirty_bgs list, so the unused dirty nodes in the block group
+are not sent at the transaction commit time. It will be written at some
+later time, e.g. sync or umount, and causes "unable to find chunk map"
+errors.
+
+This can happen relatively easy on SMR whose zone size is 256MB. However,
+calling do_zone_finish() on such block group returns -EAGAIN and keep that
+block group intact, which is why the issue is hidden until now.
+
+Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c |   27 +++++++++++++++++++++++++--
+ 1 file changed, 25 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(con
+ }
+ #endif
++static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group)
++{
++      /* The meta_write_pointer is available only on the zoned setup. */
++      if (!btrfs_is_zoned(block_group->fs_info))
++              return false;
++
++      if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
++              return false;
++
++      return block_group->start + block_group->alloc_offset >
++              block_group->meta_write_pointer;
++}
++
+ /*
+  * Return target flags in extended format or 0 if restripe for this chunk_type
+  * is not in progress
+@@ -1244,6 +1257,15 @@ int btrfs_remove_block_group(struct btrf
+               goto out;
+       spin_lock(&block_group->lock);
++      /*
++       * Hitting this WARN means we removed a block group with an unwritten
++       * region. It will cause "unable to find chunk map for logical" errors.
++       */
++      if (WARN_ON(has_unwritten_metadata(block_group)))
++              btrfs_warn(fs_info,
++                         "block group %llu is removed before metadata write out",
++                         block_group->start);
++
+       set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
+       /*
+@@ -1586,8 +1608,9 @@ void btrfs_delete_unused_bgs(struct btrf
+                * needing to allocate extents from the block group.
+                */
+               used = btrfs_space_info_used(space_info, true);
+-              if (space_info->total_bytes - block_group->length < used &&
+-                  block_group->zone_unusable < block_group->length) {
++              if ((space_info->total_bytes - block_group->length < used &&
++                   block_group->zone_unusable < block_group->length) ||
++                  has_unwritten_metadata(block_group)) {
+                       /*
+                        * Add a reference for the list, compensate for the ref
+                        * drop under the "next" label for the
diff --git a/queue-6.16/btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch b/queue-6.16/btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch
new file mode 100644 (file)
index 0000000..cbc695e
--- /dev/null
@@ -0,0 +1,37 @@
+From 3a931e9b39c7ff8066657042f5f00d3b7e6ad315 Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Wed, 16 Jul 2025 16:59:52 +0900
+Subject: btrfs: zoned: do not select metadata BG as finish target
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 3a931e9b39c7ff8066657042f5f00d3b7e6ad315 upstream.
+
+We call btrfs_zone_finish_one_bg() to zone finish one block group and make
+room to activate another block group. Currently, we can choose a metadata
+block group as a target. But, as we reserve an active metadata block group,
+we no longer want to select a metadata block group. So, skip it in the
+loop.
+
+CC: stable@vger.kernel.org # 6.6+
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/zoned.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2651,7 +2651,7 @@ int btrfs_zone_finish_one_bg(struct btrf
+               spin_lock(&block_group->lock);
+               if (block_group->reserved || block_group->alloc_offset == 0 ||
+-                  (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
++                  !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) ||
+                   test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
+                       spin_unlock(&block_group->lock);
+                       continue;
diff --git a/queue-6.16/btrfs-zoned-requeue-to-unused-block-group-list-if-zone-finish-failed.patch b/queue-6.16/btrfs-zoned-requeue-to-unused-block-group-list-if-zone-finish-failed.patch
new file mode 100644 (file)
index 0000000..406b3d4
--- /dev/null
@@ -0,0 +1,39 @@
+From 62be7afcc13b2727bdc6a4c91aefed6b452e6ecc Mon Sep 17 00:00:00 2001
+From: Naohiro Aota <naohiro.aota@wdc.com>
+Date: Sun, 29 Jun 2025 23:18:29 +0900
+Subject: btrfs: zoned: requeue to unused block group list if zone finish failed
+
+From: Naohiro Aota <naohiro.aota@wdc.com>
+
+commit 62be7afcc13b2727bdc6a4c91aefed6b452e6ecc upstream.
+
+btrfs_zone_finish() can fail for several reasons. If it is -EAGAIN, we need
+to try it again later. So, put the block group on the retry list properly.
+
+Failing to do so will keep the removable block group intact until remount
+and can cause unnecessary ENOSPC.
+
+Fixes: 74e91b12b115 ("btrfs: zoned: zone finish unused block group")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1616,8 +1616,10 @@ void btrfs_delete_unused_bgs(struct btrf
+               ret = btrfs_zone_finish(block_group);
+               if (ret < 0) {
+                       btrfs_dec_block_group_ro(block_group);
+-                      if (ret == -EAGAIN)
++                      if (ret == -EAGAIN) {
++                              btrfs_link_bg_list(block_group, &retry_list);
+                               ret = 0;
++                      }
+                       goto next;
+               }
diff --git a/queue-6.16/btrfs-zoned-reserve-data_reloc-block-group-on-mount.patch b/queue-6.16/btrfs-zoned-reserve-data_reloc-block-group-on-mount.patch
new file mode 100644 (file)
index 0000000..337c29b
--- /dev/null
@@ -0,0 +1,137 @@
+From 694ce5e143d67267ad26b04463e790a597500b00 Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Tue, 3 Jun 2025 08:14:01 +0200
+Subject: btrfs: zoned: reserve data_reloc block group on mount
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 694ce5e143d67267ad26b04463e790a597500b00 upstream.
+
+Create a block group dedicated for data relocation on mount of a zoned
+filesystem.
+
+If there is already more than one empty DATA block group on mount, this
+one is picked for the data relocation block group, instead of a newly
+created one.
+
+This is done to ensure there is always space for performing garbage
+collection and the filesystem is not hitting ENOSPC under heavy overwrite
+workloads.
+
+CC: stable@vger.kernel.org # 6.6+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/disk-io.c |    1 
+ fs/btrfs/zoned.c   |   61 +++++++++++++++++++++++++++++++++++++++++++++++++++++
+ fs/btrfs/zoned.h   |    3 ++
+ 3 files changed, 65 insertions(+)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -3561,6 +3561,7 @@ int __cold open_ctree(struct super_block
+               goto fail_sysfs;
+       }
++      btrfs_zoned_reserve_data_reloc_bg(fs_info);
+       btrfs_free_zone_cache(fs_info);
+       btrfs_check_active_zone_reservation(fs_info);
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -17,6 +17,7 @@
+ #include "fs.h"
+ #include "accessors.h"
+ #include "bio.h"
++#include "transaction.h"
+ /* Maximum number of zones to report per blkdev_report_zones() call */
+ #define BTRFS_REPORT_NR_ZONES   4096
+@@ -2501,6 +2502,66 @@ void btrfs_clear_data_reloc_bg(struct bt
+       spin_unlock(&fs_info->relocation_bg_lock);
+ }
++void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
++{
++      struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
++      struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
++      struct btrfs_trans_handle *trans;
++      struct btrfs_block_group *bg;
++      struct list_head *bg_list;
++      u64 alloc_flags;
++      bool initial = false;
++      bool did_chunk_alloc = false;
++      int index;
++      int ret;
++
++      if (!btrfs_is_zoned(fs_info))
++              return;
++
++      if (fs_info->data_reloc_bg)
++              return;
++
++      if (sb_rdonly(fs_info->sb))
++              return;
++
++      ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
++      alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
++      index = btrfs_bg_flags_to_raid_index(alloc_flags);
++
++      bg_list = &data_sinfo->block_groups[index];
++again:
++      list_for_each_entry(bg, bg_list, list) {
++              if (bg->used > 0)
++                      continue;
++
++              if (!initial) {
++                      initial = true;
++                      continue;
++              }
++
++              fs_info->data_reloc_bg = bg->start;
++              set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
++              btrfs_zone_activate(bg);
++
++              return;
++      }
++
++      if (did_chunk_alloc)
++              return;
++
++      trans = btrfs_join_transaction(fs_info->tree_root);
++      if (IS_ERR(trans))
++              return;
++
++      ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
++      btrfs_end_transaction(trans);
++      if (ret == 1) {
++              did_chunk_alloc = true;
++              bg_list = &space_info->block_groups[index];
++              goto again;
++      }
++}
++
+ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
+ {
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+--- a/fs/btrfs/zoned.h
++++ b/fs/btrfs/zoned.h
+@@ -88,6 +88,7 @@ void btrfs_zone_finish_endio(struct btrf
+ void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
+                                  struct extent_buffer *eb);
+ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
++void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info);
+ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
+ bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info);
+ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
+@@ -241,6 +242,8 @@ static inline void btrfs_schedule_zone_f
+ static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
++static inline void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info) { }
++
+ static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
+ static inline bool btrfs_zoned_should_reclaim(const struct btrfs_fs_info *fs_info)
diff --git a/queue-6.16/btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch b/queue-6.16/btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch
new file mode 100644 (file)
index 0000000..fb79d95
--- /dev/null
@@ -0,0 +1,48 @@
+From 55f7c65b2f69c7e4cb7aa7c1654a228ccf734fd8 Mon Sep 17 00:00:00 2001
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Date: Tue, 20 May 2025 09:20:47 +0200
+Subject: btrfs: zoned: use filesystem size not disk size for reclaim decision
+
+From: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+
+commit 55f7c65b2f69c7e4cb7aa7c1654a228ccf734fd8 upstream.
+
+When deciding if a zoned filesystem is reaching the threshold to reclaim
+data block groups, look at the size of the filesystem, not the potential
+total available size of all drives in the filesystem.
+
+Especially if a filesystem was created with mkfs' -b option, constraining
+it to only a portion of the block device, the numbers won't match and
+potentially garbage collection is kicking in too late.
+
+Fixes: 3687fcb0752a ("btrfs: zoned: make auto-reclaim less aggressive")
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Tested-by: Damien Le Moal <dlemoal@kernel.org>
+Signed-off-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/zoned.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/fs/btrfs/zoned.c
++++ b/fs/btrfs/zoned.c
+@@ -2523,8 +2523,8 @@ bool btrfs_zoned_should_reclaim(const st
+ {
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
++      u64 total = btrfs_super_total_bytes(fs_info->super_copy);
+       u64 used = 0;
+-      u64 total = 0;
+       u64 factor;
+       ASSERT(btrfs_is_zoned(fs_info));
+@@ -2537,7 +2537,6 @@ bool btrfs_zoned_should_reclaim(const st
+               if (!device->bdev)
+                       continue;
+-              total += device->disk_total_bytes;
+               used += device->bytes_used;
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/queue-6.16/cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch b/queue-6.16/cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch
new file mode 100644 (file)
index 0000000..e5bc2ae
--- /dev/null
@@ -0,0 +1,50 @@
+From 64690a90cd7c6db16d3af8616be1f4bf8d492850 Mon Sep 17 00:00:00 2001
+From: Oliver Neukum <oneukum@suse.com>
+Date: Thu, 17 Jul 2025 16:12:50 +0200
+Subject: cdc-acm: fix race between initial clearing halt and open
+
+From: Oliver Neukum <oneukum@suse.com>
+
+commit 64690a90cd7c6db16d3af8616be1f4bf8d492850 upstream.
+
+On the devices that need their endpoints to get an
+initial clear_halt, this needs to be done before
+the devices can be opened. That means it needs to be
+before the devices are registered.
+
+Fixes: 15bf722e6f6c0 ("cdc-acm: Add support of ATOL FPrint fiscal printers")
+Cc: stable <stable@kernel.org>
+Signed-off-by: Oliver Neukum <oneukum@suse.com>
+Link: https://lore.kernel.org/r/20250717141259.2345605-1-oneukum@suse.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/class/cdc-acm.c |   11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/drivers/usb/class/cdc-acm.c
++++ b/drivers/usb/class/cdc-acm.c
+@@ -1520,6 +1520,12 @@ skip_countries:
+                       goto err_remove_files;
+       }
++      if (quirks & CLEAR_HALT_CONDITIONS) {
++              /* errors intentionally ignored */
++              usb_clear_halt(usb_dev, acm->in);
++              usb_clear_halt(usb_dev, acm->out);
++      }
++
+       tty_dev = tty_port_register_device(&acm->port, acm_tty_driver, minor,
+                       &control_interface->dev);
+       if (IS_ERR(tty_dev)) {
+@@ -1527,11 +1533,6 @@ skip_countries:
+               goto err_release_data_interface;
+       }
+-      if (quirks & CLEAR_HALT_CONDITIONS) {
+-              usb_clear_halt(usb_dev, acm->in);
+-              usb_clear_halt(usb_dev, acm->out);
+-      }
+-
+       dev_info(&intf->dev, "ttyACM%d: USB ACM device\n", minor);
+       return 0;
diff --git a/queue-6.16/comedi-fix-race-between-polling-and-detaching.patch b/queue-6.16/comedi-fix-race-between-polling-and-detaching.patch
new file mode 100644 (file)
index 0000000..c65376d
--- /dev/null
@@ -0,0 +1,157 @@
+From 35b6fc51c666fc96355be5cd633ed0fe4ccf68b2 Mon Sep 17 00:00:00 2001
+From: Ian Abbott <abbotti@mev.co.uk>
+Date: Tue, 22 Jul 2025 16:53:16 +0100
+Subject: comedi: fix race between polling and detaching
+
+From: Ian Abbott <abbotti@mev.co.uk>
+
+commit 35b6fc51c666fc96355be5cd633ed0fe4ccf68b2 upstream.
+
+syzbot reports a use-after-free in comedi in the below link, which is
+due to comedi gladly removing the allocated async area even though poll
+requests are still active on the wait_queue_head inside of it. This can
+cause a use-after-free when the poll entries are later triggered or
+removed, as the memory for the wait_queue_head has been freed.  We need
+to check there are no tasks queued on any of the subdevices' wait queues
+before allowing the device to be detached by the `COMEDI_DEVCONFIG`
+ioctl.
+
+Tasks will read-lock `dev->attach_lock` before adding themselves to the
+subdevice wait queue, so fix the problem in the `COMEDI_DEVCONFIG` ioctl
+handler by write-locking `dev->attach_lock` before checking that all of
+the subdevices are safe to be deleted.  This includes testing for any
+sleepers on the subdevices' wait queues.  It remains locked until the
+device has been detached.  This requires the `comedi_device_detach()`
+function to be refactored slightly, moving the bulk of it into new
+function `comedi_device_detach_locked()`.
+
+Note that the refactor of `comedi_device_detach()` results in
+`comedi_device_cancel_all()` now being called while `dev->attach_lock`
+is write-locked, which wasn't the case previously, but that does not
+matter.
+
+Thanks to Jens Axboe for diagnosing the problem and co-developing this
+patch.
+
+Cc: stable <stable@kernel.org>
+Fixes: 2f3fdcd7ce93 ("staging: comedi: add rw_semaphore to protect against device detachment")
+Link: https://lore.kernel.org/all/687bd5fe.a70a0220.693ce.0091.GAE@google.com/
+Reported-by: syzbot+01523a0ae5600aef5895@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=01523a0ae5600aef5895
+Co-developed-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Ian Abbott <abbotti@mev.co.uk>
+Tested-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20250722155316.27432-1-abbotti@mev.co.uk
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/comedi/comedi_fops.c     |   33 +++++++++++++++++++++++++--------
+ drivers/comedi/comedi_internal.h |    1 +
+ drivers/comedi/drivers.c         |   13 ++++++++++---
+ 3 files changed, 36 insertions(+), 11 deletions(-)
+
+--- a/drivers/comedi/comedi_fops.c
++++ b/drivers/comedi/comedi_fops.c
+@@ -787,6 +787,7 @@ static int is_device_busy(struct comedi_
+       struct comedi_subdevice *s;
+       int i;
++      lockdep_assert_held_write(&dev->attach_lock);
+       lockdep_assert_held(&dev->mutex);
+       if (!dev->attached)
+               return 0;
+@@ -795,7 +796,16 @@ static int is_device_busy(struct comedi_
+               s = &dev->subdevices[i];
+               if (s->busy)
+                       return 1;
+-              if (s->async && comedi_buf_is_mmapped(s))
++              if (!s->async)
++                      continue;
++              if (comedi_buf_is_mmapped(s))
++                      return 1;
++              /*
++               * There may be tasks still waiting on the subdevice's wait
++               * queue, although they should already be about to be removed
++               * from it since the subdevice has no active async command.
++               */
++              if (wq_has_sleeper(&s->async->wait_head))
+                       return 1;
+       }
+@@ -825,15 +835,22 @@ static int do_devconfig_ioctl(struct com
+               return -EPERM;
+       if (!arg) {
+-              if (is_device_busy(dev))
+-                      return -EBUSY;
+-              if (dev->attached) {
+-                      struct module *driver_module = dev->driver->module;
++              int rc = 0;
+-                      comedi_device_detach(dev);
+-                      module_put(driver_module);
++              if (dev->attached) {
++                      down_write(&dev->attach_lock);
++                      if (is_device_busy(dev)) {
++                              rc = -EBUSY;
++                      } else {
++                              struct module *driver_module =
++                                      dev->driver->module;
++
++                              comedi_device_detach_locked(dev);
++                              module_put(driver_module);
++                      }
++                      up_write(&dev->attach_lock);
+               }
+-              return 0;
++              return rc;
+       }
+       if (copy_from_user(&it, arg, sizeof(it)))
+--- a/drivers/comedi/comedi_internal.h
++++ b/drivers/comedi/comedi_internal.h
+@@ -50,6 +50,7 @@ extern struct mutex comedi_drivers_list_
+ int insn_inval(struct comedi_device *dev, struct comedi_subdevice *s,
+              struct comedi_insn *insn, unsigned int *data);
++void comedi_device_detach_locked(struct comedi_device *dev);
+ void comedi_device_detach(struct comedi_device *dev);
+ int comedi_device_attach(struct comedi_device *dev,
+                        struct comedi_devconfig *it);
+--- a/drivers/comedi/drivers.c
++++ b/drivers/comedi/drivers.c
+@@ -158,7 +158,7 @@ static void comedi_device_detach_cleanup
+       int i;
+       struct comedi_subdevice *s;
+-      lockdep_assert_held(&dev->attach_lock);
++      lockdep_assert_held_write(&dev->attach_lock);
+       lockdep_assert_held(&dev->mutex);
+       if (dev->subdevices) {
+               for (i = 0; i < dev->n_subdevices; i++) {
+@@ -196,16 +196,23 @@ static void comedi_device_detach_cleanup
+       comedi_clear_hw_dev(dev);
+ }
+-void comedi_device_detach(struct comedi_device *dev)
++void comedi_device_detach_locked(struct comedi_device *dev)
+ {
++      lockdep_assert_held_write(&dev->attach_lock);
+       lockdep_assert_held(&dev->mutex);
+       comedi_device_cancel_all(dev);
+-      down_write(&dev->attach_lock);
+       dev->attached = false;
+       dev->detach_count++;
+       if (dev->driver)
+               dev->driver->detach(dev);
+       comedi_device_detach_cleanup(dev);
++}
++
++void comedi_device_detach(struct comedi_device *dev)
++{
++      lockdep_assert_held(&dev->mutex);
++      down_write(&dev->attach_lock);
++      comedi_device_detach_locked(dev);
+       up_write(&dev->attach_lock);
+ }
index b4cdedf4e78b98c4204919adebe80d6b82fc7c10..e68ac176cd84246587a5958fb118aa2d1380d3f9 100644 (file)
@@ -505,3 +505,29 @@ ext4-initialize-superblock-fields-in-the-kballoc-test.c-kunit-tests.patch
 usb-core-config-prevent-oob-read-in-ss-endpoint-companion-parsing.patch
 misc-rtsx-usb-ensure-mmc-child-device-is-active-when-card-is-present.patch
 usb-typec-ucsi-update-power_supply-on-power-role-change.patch
+comedi-fix-race-between-polling-and-detaching.patch
+thunderbolt-fix-copy-paste-error-in-match_service_id.patch
+usb-typec-fusb302-cache-pd-rx-state.patch
+cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch
+btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch
+btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch
+btrfs-zoned-reserve-data_reloc-block-group-on-mount.patch
+btrfs-zoned-requeue-to-unused-block-group-list-if-zone-finish-failed.patch
+btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch
+btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch
+btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch
+btrfs-fix-ssd_spread-overallocation.patch
+btrfs-qgroup-fix-race-between-quota-disable-and-quota-rescan-ioctl.patch
+btrfs-populate-otime-when-logging-an-inode-item.patch
+btrfs-don-t-skip-accounting-in-early-enotty-return-in-btrfs_uring_encoded_read.patch
+btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch
+btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch
+btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch
+btrfs-send-use-fallocate-for-hole-punching-with-send-stream-v2.patch
+btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch
+btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch
+btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch
+btrfs-fix-wrong-length-parameter-for-btrfs_cleanup_ordered_extents.patch
+btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch
+btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch
+xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch
diff --git a/queue-6.16/thunderbolt-fix-copy-paste-error-in-match_service_id.patch b/queue-6.16/thunderbolt-fix-copy-paste-error-in-match_service_id.patch
new file mode 100644 (file)
index 0000000..44e1f33
--- /dev/null
@@ -0,0 +1,32 @@
+From 5cc1f66cb23cccc704e3def27ad31ed479e934a5 Mon Sep 17 00:00:00 2001
+From: Eric Biggers <ebiggers@kernel.org>
+Date: Sun, 20 Jul 2025 22:01:36 -0700
+Subject: thunderbolt: Fix copy+paste error in match_service_id()
+
+From: Eric Biggers <ebiggers@kernel.org>
+
+commit 5cc1f66cb23cccc704e3def27ad31ed479e934a5 upstream.
+
+The second instance of TBSVC_MATCH_PROTOCOL_VERSION seems to have been
+intended to be TBSVC_MATCH_PROTOCOL_REVISION.
+
+Fixes: d1ff70241a27 ("thunderbolt: Add support for XDomain discovery protocol")
+Cc: stable <stable@kernel.org>
+Signed-off-by: Eric Biggers <ebiggers@kernel.org>
+Link: https://lore.kernel.org/r/20250721050136.30004-1-ebiggers@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/domain.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/thunderbolt/domain.c
++++ b/drivers/thunderbolt/domain.c
+@@ -36,7 +36,7 @@ static bool match_service_id(const struc
+                       return false;
+       }
+-      if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) {
++      if (id->match_flags & TBSVC_MATCH_PROTOCOL_REVISION) {
+               if (id->protocol_revision != svc->prtcrevs)
+                       return false;
+       }
diff --git a/queue-6.16/usb-typec-fusb302-cache-pd-rx-state.patch b/queue-6.16/usb-typec-fusb302-cache-pd-rx-state.patch
new file mode 100644 (file)
index 0000000..859d4c4
--- /dev/null
@@ -0,0 +1,92 @@
+From 1e61f6ab08786d66a11cfc51e13d6f08a6b06c56 Mon Sep 17 00:00:00 2001
+From: Sebastian Reichel <sebastian.reichel@collabora.com>
+Date: Fri, 4 Jul 2025 19:55:06 +0200
+Subject: usb: typec: fusb302: cache PD RX state
+
+From: Sebastian Reichel <sebastian.reichel@collabora.com>
+
+commit 1e61f6ab08786d66a11cfc51e13d6f08a6b06c56 upstream.
+
+This patch fixes a race condition communication error, which ends up in
+PD hard resets when losing the race. Some systems, like the Radxa ROCK
+5B are powered through USB-C without any backup power source and use a
+FUSB302 chip to do the PD negotiation. This means it is quite important
+to avoid hard resets, since that effectively kills the system's
+power-supply.
+
+I've found the following race condition while debugging unplanned power
+loss during booting the board every now and then:
+
+1. lots of TCPM/FUSB302/PD initialization stuff
+2. TCPM ends up in SNK_WAIT_CAPABILITIES (tcpm_set_pd_rx is enabled here)
+3. the remote PD source does not send anything, so TCPM does a SOFT RESET
+4. TCPM ends up in SNK_WAIT_CAPABILITIES for the second time
+   (tcpm_set_pd_rx is enabled again, even though it is still on)
+
+At this point I've seen broken CRC good messages being send by the
+FUSB302 with a logic analyzer sniffing the CC lines. Also it looks like
+messages are being lost and things generally going haywire with one of
+the two sides doing a hard reset once a broken CRC good message was send
+to the bus.
+
+I think the system is running into a race condition, that the FIFOs are
+being cleared and/or the automatic good CRC message generation flag is
+being updated while a message is already arriving.
+
+Let's avoid this by caching the PD RX enabled state, as we have already
+processed anything in the FIFOs and are in a good state. As a side
+effect that this also optimizes I2C bus usage :)
+
+As far as I can tell the problem theoretically also exists when TCPM
+enters SNK_WAIT_CAPABILITIES the first time, but I believe this is less
+critical for the following reason:
+
+On devices like the ROCK 5B, which are powered through a TCPM backed
+USB-C port, the bootloader must have done some prior PD communication
+(initial communication must happen within 5 seconds after plugging the
+USB-C plug). This means the first time the kernel TCPM state machine
+reaches SNK_WAIT_CAPABILITIES, the remote side is not sending messages
+actively. On other devices a hard reset simply adds some extra delay and
+things should be good afterwards.
+
+Fixes: c034a43e72dda ("staging: typec: Fairchild FUSB302 Type-c chip driver")
+Cc: stable <stable@kernel.org>
+Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
+Reviewed-by: Heikki Krogerus <heikki.krogerus@linux.intel.com>
+Link: https://lore.kernel.org/r/20250704-fusb302-race-condition-fix-v1-1-239012c0e27a@kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/usb/typec/tcpm/fusb302.c |    8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/drivers/usb/typec/tcpm/fusb302.c
++++ b/drivers/usb/typec/tcpm/fusb302.c
+@@ -104,6 +104,7 @@ struct fusb302_chip {
+       bool vconn_on;
+       bool vbus_on;
+       bool charge_on;
++      bool pd_rx_on;
+       bool vbus_present;
+       enum typec_cc_polarity cc_polarity;
+       enum typec_cc_status cc1;
+@@ -841,6 +842,11 @@ static int tcpm_set_pd_rx(struct tcpc_de
+       int ret = 0;
+       mutex_lock(&chip->lock);
++      if (chip->pd_rx_on == on) {
++              fusb302_log(chip, "pd is already %s", str_on_off(on));
++              goto done;
++      }
++
+       ret = fusb302_pd_rx_flush(chip);
+       if (ret < 0) {
+               fusb302_log(chip, "cannot flush pd rx buffer, ret=%d", ret);
+@@ -863,6 +869,8 @@ static int tcpm_set_pd_rx(struct tcpc_de
+                           str_on_off(on), ret);
+               goto done;
+       }
++
++      chip->pd_rx_on = on;
+       fusb302_log(chip, "pd := %s", str_on_off(on));
+ done:
+       mutex_unlock(&chip->lock);
diff --git a/queue-6.16/xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch b/queue-6.16/xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch
new file mode 100644 (file)
index 0000000..2daee54
--- /dev/null
@@ -0,0 +1,32 @@
+From 5d94b19f066480addfcdcb5efde66152ad5a7c0e Mon Sep 17 00:00:00 2001
+From: Andrey Albershteyn <aalbersh@redhat.com>
+Date: Thu, 31 Jul 2025 19:07:22 +0200
+Subject: xfs: fix scrub trace with null pointer in quotacheck
+
+From: Andrey Albershteyn <aalbersh@redhat.com>
+
+commit 5d94b19f066480addfcdcb5efde66152ad5a7c0e upstream.
+
+The quotacheck doesn't initialize sc->ip.
+
+Cc: stable@vger.kernel.org # v6.8
+Fixes: 21d7500929c8a0 ("xfs: improve dquot iteration for scrub")
+Reviewed-by: Darrick J. Wong <djwong@kernel.org>
+Signed-off-by: Andrey Albershteyn <aalbersh@kernel.org>
+Signed-off-by: Carlos Maiolino <cem@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/xfs/scrub/trace.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/xfs/scrub/trace.h
++++ b/fs/xfs/scrub/trace.h
+@@ -479,7 +479,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class,
+               __field(xfs_exntst_t, state)
+       ),
+       TP_fast_assign(
+-              __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
++              __entry->dev = cursor->sc->mp->m_super->s_dev;
+               __entry->dqtype = cursor->dqtype;
+               __entry->ino = cursor->quota_ip->i_ino;
+               __entry->cur_id = cursor->id;