From: Greg Kroah-Hartman Date: Mon, 18 Aug 2025 10:58:25 +0000 (+0200) Subject: 6.12-stable patches X-Git-Tag: v6.12.43~32 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=056e6e1a6b600641c06f82a695c3762375afe480;p=thirdparty%2Fkernel%2Fstable-queue.git 6.12-stable patches added patches: btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch btrfs-fix-ssd_spread-overallocation.patch btrfs-populate-otime-when-logging-an-inode-item.patch btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch comedi-fix-race-between-polling-and-detaching.patch thunderbolt-fix-copy-paste-error-in-match_service_id.patch xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch --- diff --git a/queue-6.12/btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch b/queue-6.12/btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch new file mode 100644 index 0000000000..77e0d9ad6c --- /dev/null +++ b/queue-6.12/btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch @@ -0,0 +1,44 @@ +From 2a5898c4aac67494c2f0f7fe38373c95c371c930 Mon Sep 17 00:00:00 2001 +From: 
Filipe Manana +Date: Wed, 21 May 2025 17:41:18 +0100 +Subject: btrfs: abort transaction during log replay if walk_log_tree() failed + +From: Filipe Manana + +commit 2a5898c4aac67494c2f0f7fe38373c95c371c930 upstream. + +If we failed walking a log tree during replay, we have a missing +transaction abort to prevent committing a transaction where we didn't +fully replay all the changes from a log tree and therefore can leave the +respective subvolume tree in some inconsistent state. So add the missing +transaction abort. + +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -7295,11 +7295,14 @@ again: + + wc.replay_dest->log_root = log; + ret = btrfs_record_root_in_trans(trans, wc.replay_dest); +- if (ret) ++ if (ret) { + /* The loop needs to continue due to the root refs */ + btrfs_abort_transaction(trans, ret); +- else ++ } else { + ret = walk_log_tree(trans, log, &wc); ++ if (ret) ++ btrfs_abort_transaction(trans, ret); ++ } + + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, diff --git a/queue-6.12/btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch b/queue-6.12/btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch new file mode 100644 index 0000000000..62e0fb7304 --- /dev/null +++ b/queue-6.12/btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch @@ -0,0 +1,36 @@ +From c0d013495a80cbb53e2288af7ae0ec4170aafd7c Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 30 Jun 2025 10:50:46 +0100 +Subject: btrfs: clear dirty status from extent buffer on error at insert_new_root() + +From: Filipe Manana + +commit c0d013495a80cbb53e2288af7ae0ec4170aafd7c upstream. 
+ +If we failed to insert the tree mod log operation, we are not removing the +dirty status from the allocated and dirtied extent buffer before we free +it. Removing the dirty status is needed for several reasons such as to +adjust the fs_info->dirty_metadata_bytes counter and remove the dirty +status from the respective folios. So add the missing call to +btrfs_clear_buffer_dirty(). + +Fixes: f61aa7ba08ab ("btrfs: do not BUG_ON() on tree mod log failure at insert_new_root()") +CC: stable@vger.kernel.org # 6.6+ +Reviewed-by: Boris Burkov +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -2901,6 +2901,7 @@ static noinline int insert_new_root(stru + if (ret < 0) { + int ret2; + ++ btrfs_clear_buffer_dirty(trans, c); + ret2 = btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1); + if (ret2 < 0) + btrfs_abort_transaction(trans, ret2); diff --git a/queue-6.12/btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch b/queue-6.12/btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch new file mode 100644 index 0000000000..c2904c85ba --- /dev/null +++ b/queue-6.12/btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch @@ -0,0 +1,124 @@ +From 4289b494ac553e74e86fed1c66b2bf9530bc1082 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Fri, 25 Jul 2025 20:33:25 +0930 +Subject: btrfs: do not allow relocation of partially dropped subvolumes + +From: Qu Wenruo + +commit 4289b494ac553e74e86fed1c66b2bf9530bc1082 upstream. 
+ +[BUG] +There is an internal report that balance triggered transaction abort, +with the following call trace: + + item 85 key (594509824 169 0) itemoff 12599 itemsize 33 + extent refs 1 gen 197740 flags 2 + ref#0: tree block backref root 7 + item 86 key (594558976 169 0) itemoff 12566 itemsize 33 + extent refs 1 gen 197522 flags 2 + ref#0: tree block backref root 7 + ... + BTRFS error (device loop0): extent item not found for insert, bytenr 594526208 num_bytes 16384 parent 449921024 root_objectid 934 owner 1 offset 0 + BTRFS error (device loop0): failed to run delayed ref for logical 594526208 num_bytes 16384 type 182 action 1 ref_mod 1: -117 + ------------[ cut here ]------------ + BTRFS: Transaction aborted (error -117) + WARNING: CPU: 1 PID: 6963 at ../fs/btrfs/extent-tree.c:2168 btrfs_run_delayed_refs+0xfa/0x110 [btrfs] + +And btrfs check doesn't report anything wrong related to the extent +tree. + +[CAUSE] +The cause is a little complex, firstly the extent tree indeed doesn't +have the backref for 594526208. 
+ +The extent tree only has the following two backrefs around that bytenr +on-disk: + + item 65 key (594509824 METADATA_ITEM 0) itemoff 13880 itemsize 33 + refs 1 gen 197740 flags TREE_BLOCK + tree block skinny level 0 + (176 0x7) tree block backref root CSUM_TREE + item 66 key (594558976 METADATA_ITEM 0) itemoff 13847 itemsize 33 + refs 1 gen 197522 flags TREE_BLOCK + tree block skinny level 0 + (176 0x7) tree block backref root CSUM_TREE + +But such a missing backref item is not a corruption on disk, as the +offending delayed ref belongs to subvolume 934, and that subvolume is +being dropped: + + item 0 key (934 ROOT_ITEM 198229) itemoff 15844 itemsize 439 + generation 198229 root_dirid 256 bytenr 10741039104 byte_limit 0 bytes_used 345571328 + last_snapshot 198229 flags 0x1000000000001(RDONLY) refs 0 + drop_progress key (206324 EXTENT_DATA 2711650304) drop_level 2 + level 2 generation_v2 198229 + +And that offending tree block 594526208 is inside the dropped range of +that subvolume. That explains why there is no backref item for that +bytenr and why btrfs check is not reporting anything wrong. + +But this also shows another problem, as btrfs will do all the orphan +subvolume cleanup at a read-write mount. + +So a half-dropped subvolume should not exist after an RW mount, and +balance itself is also exclusive to subvolume cleanup, meaning we +shouldn't hit a subvolume half-dropped during relocation. + +The root cause is, there is no orphan item for this subvolume. +In fact there are 5 subvolumes from around 2021 that have the same +problem. + +It looks like the original report has some older kernels running, and +caused those zombie subvolumes. + +Thankfully upstream commit 8d488a8c7ba2 ("btrfs: fix subvolume/snapshot +deletion not triggered on mount") has long fixed the bug. + +[ENHANCEMENT] +For repairing such old fs, btrfs-progs will be enhanced. 
+ +Considering how delayed the problem will show up (at run delayed ref +time) and at that time we have to abort transaction already, it is too +late. + +Instead here we reject any half-dropped subvolume for reloc tree at the +earliest time, preventing confusion and extra time wasted on debugging +similar bugs. + +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/relocation.c | 19 +++++++++++++++++++ + 1 file changed, 19 insertions(+) + +--- a/fs/btrfs/relocation.c ++++ b/fs/btrfs/relocation.c +@@ -687,6 +687,25 @@ static struct btrfs_root *create_reloc_r + if (btrfs_root_id(root) == objectid) { + u64 commit_root_gen; + ++ /* ++ * Relocation will wait for cleaner thread, and any half-dropped ++ * subvolume will be fully cleaned up at mount time. ++ * So here we shouldn't hit a subvolume with non-zero drop_progress. ++ * ++ * If this isn't the case, error out since it can make us attempt to ++ * drop references for extents that were already dropped before. 
++ */ ++ if (unlikely(btrfs_disk_key_objectid(&root->root_item.drop_progress))) { ++ struct btrfs_key cpu_key; ++ ++ btrfs_disk_key_to_cpu(&cpu_key, &root->root_item.drop_progress); ++ btrfs_err(fs_info, ++ "cannot relocate partially dropped subvolume %llu, drop progress key (%llu %u %llu)", ++ objectid, cpu_key.objectid, cpu_key.type, cpu_key.offset); ++ ret = -EUCLEAN; ++ goto fail; ++ } ++ + /* called by btrfs_init_reloc_root */ + ret = btrfs_copy_root(trans, root, root->commit_root, &eb, + BTRFS_TREE_RELOC_OBJECTID); diff --git a/queue-6.12/btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch b/queue-6.12/btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch new file mode 100644 index 0000000000..3f3cd9177b --- /dev/null +++ b/queue-6.12/btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch @@ -0,0 +1,78 @@ +From 7ebf381a69421a88265d3c49cd0f007ba7336c9d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 11 Jul 2025 20:21:28 +0100 +Subject: btrfs: don't ignore inode missing when replaying log tree + +From: Filipe Manana + +commit 7ebf381a69421a88265d3c49cd0f007ba7336c9d upstream. + +During log replay, at add_inode_ref(), we return -ENOENT if our current +inode isn't found on the subvolume tree or if a parent directory isn't +found. The error comes from btrfs_iget_logging() <- btrfs_iget() <- +btrfs_read_locked_inode(). + +The single caller of add_inode_ref(), replay_one_buffer(), ignores an +-ENOENT error because it expects that error to mean only that a parent +directory wasn't found and that is ok. + +Before commit 5f61b961599a ("btrfs: fix inode lookup error handling during +log replay") we were converting any error when getting a parent directory +to -ENOENT and any error when getting the current inode to -EIO, so our +caller would fail log replay in case we can't find the current inode. 
+After that commit however in case the current inode is not found we return +-ENOENT to the caller and therefore it ignores the critical fact that the +current inode was not found in the subvolume tree. + +Fix this by converting -ENOENT to 0 when we don't find a parent directory, +returning -ENOENT when we don't find the current inode and making the +caller, replay_one_buffer(), not ignore -ENOENT anymore. + +Fixes: 5f61b961599a ("btrfs: fix inode lookup error handling during log replay") +CC: stable@vger.kernel.org # 6.16 +Reviewed-by: Boris Burkov +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1396,6 +1396,8 @@ static noinline int add_inode_ref(struct + dir = btrfs_iget_logging(parent_objectid, root); + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); ++ if (ret == -ENOENT) ++ ret = 0; + dir = NULL; + goto out; + } +@@ -1420,6 +1422,15 @@ static noinline int add_inode_ref(struct + if (IS_ERR(dir)) { + ret = PTR_ERR(dir); + dir = NULL; ++ /* ++ * A new parent dir may have not been ++ * logged and not exist in the subvolume ++ * tree, see the comment above before ++ * the loop when getting the first ++ * parent dir. 
++ */ ++ if (ret == -ENOENT) ++ ret = 0; + goto out; + } + } +@@ -2532,9 +2543,8 @@ static int replay_one_buffer(struct btrf + key.type == BTRFS_INODE_EXTREF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); +- if (ret && ret != -ENOENT) ++ if (ret) + break; +- ret = 0; + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); diff --git a/queue-6.12/btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch b/queue-6.12/btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch new file mode 100644 index 0000000000..d600d4bc32 --- /dev/null +++ b/queue-6.12/btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch @@ -0,0 +1,90 @@ +From 24e066ded45b8147b79c7455ac43a5bff7b5f378 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 11 Jul 2025 20:48:23 +0100 +Subject: btrfs: don't skip remaining extrefs if dir not found during log replay + +From: Filipe Manana + +commit 24e066ded45b8147b79c7455ac43a5bff7b5f378 upstream. + +During log replay, at add_inode_ref(), if we have an extref item that +contains multiple extrefs and one of them points to a directory that does +not exist in the subvolume tree, we are supposed to ignore it and process +the remaining extrefs encoded in the extref item, since each extref can +point to a different parent inode. However when that happens we just +return from the function and ignore the remaining extrefs. + +The problem has been around since extrefs were introduced, in commit +f186373fef00 ("btrfs: extended inode refs"), but it's hard to hit in +practice because getting extref items encoding multiple extref requires +getting a hash collision when computing the offset of the extref's +key. The offset if computed like this: + + key.offset = btrfs_extref_hash(dir_ino, name->name, name->len); + +and btrfs_extref_hash() is just a wrapper around crc32c(). 
+ +Fix this by moving to next iteration of the loop when we don't find +the parent directory that an extref points to. + +Fixes: f186373fef00 ("btrfs: extended inode refs") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Boris Burkov +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -1413,6 +1413,8 @@ static noinline int add_inode_ref(struct + if (log_ref_ver) { + ret = extref_get_fields(eb, ref_ptr, &name, + &ref_index, &parent_objectid); ++ if (ret) ++ goto out; + /* + * parent object can change from one array + * item to another. +@@ -1429,16 +1431,23 @@ static noinline int add_inode_ref(struct + * the loop when getting the first + * parent dir. + */ +- if (ret == -ENOENT) ++ if (ret == -ENOENT) { ++ /* ++ * The next extref may refer to ++ * another parent dir that ++ * exists, so continue. ++ */ + ret = 0; ++ goto next; ++ } + goto out; + } + } + } else { + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); ++ if (ret) ++ goto out; + } +- if (ret) +- goto out; + + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); +@@ -1472,10 +1481,11 @@ static noinline int add_inode_ref(struct + } + /* Else, ret == 1, we already have a perfect match, we're done. 
*/ + ++next: + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; +- if (log_ref_ver) { ++ if (log_ref_ver && dir) { + iput(&dir->vfs_inode); + dir = NULL; + } diff --git a/queue-6.12/btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch b/queue-6.12/btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch new file mode 100644 index 0000000000..a6238f72d8 --- /dev/null +++ b/queue-6.12/btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch @@ -0,0 +1,78 @@ +From fc5799986fbca957e2e3c0480027f249951b7bcf Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 16 Jul 2025 11:41:21 +0100 +Subject: btrfs: error on missing block group when unaccounting log tree extent buffers + +From: Filipe Manana + +commit fc5799986fbca957e2e3c0480027f249951b7bcf upstream. + +Currently we only log an error message if we can't find the block group +for a log tree extent buffer when unaccounting it (while freeing a log +tree). A missing block group means something is seriously wrong and we +end up leaking space from the metadata space info. So return -ENOENT in +case we don't find the block group. 
+ +CC: stable@vger.kernel.org # 6.12+ +Reviewed-by: Boris Burkov +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 19 +++++++------------ + 1 file changed, 7 insertions(+), 12 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -2587,14 +2587,14 @@ static int replay_one_buffer(struct btrf + /* + * Correctly adjust the reserved bytes occupied by a log tree extent buffer + */ +-static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) ++static int unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start) + { + struct btrfs_block_group *cache; + + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); +- return; ++ return -ENOENT; + } + + spin_lock(&cache->space_info->lock); +@@ -2605,27 +2605,22 @@ static void unaccount_log_buffer(struct + spin_unlock(&cache->space_info->lock); + + btrfs_put_block_group(cache); ++ ++ return 0; + } + + static int clean_log_buffer(struct btrfs_trans_handle *trans, + struct extent_buffer *eb) + { +- int ret; +- + btrfs_tree_lock(eb); + btrfs_clear_buffer_dirty(trans, eb); + wait_on_extent_buffer_writeback(eb); + btrfs_tree_unlock(eb); + +- if (trans) { +- ret = btrfs_pin_reserved_extent(trans, eb); +- if (ret) +- return ret; +- } else { +- unaccount_log_buffer(eb->fs_info, eb->start); +- } ++ if (trans) ++ return btrfs_pin_reserved_extent(trans, eb); + +- return 0; ++ return unaccount_log_buffer(eb->fs_info, eb->start); + } + + static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, diff --git a/queue-6.12/btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch b/queue-6.12/btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch new file mode 100644 index 0000000000..90698418e3 --- /dev/null +++ b/queue-6.12/btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch @@ -0,0 
+1,92 @@ +From 7b632596188e1973c6b3ac1c9f8252f735e1039f Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Wed, 30 Jul 2025 09:29:23 -0700 +Subject: btrfs: fix iteration bug in __qgroup_excl_accounting() + +From: Boris Burkov + +commit 7b632596188e1973c6b3ac1c9f8252f735e1039f upstream. + +__qgroup_excl_accounting() uses the qgroup iterator machinery to +update the account of one qgroups usage for all its parent hierarchy, +when we either add or remove a relation and have only exclusive usage. + +However, there is a small bug there: we loop with an extra iteration +temporary qgroup called `cur` but never actually refer to that in the +body of the loop. As a result, we redundantly account the same usage to +the first qgroup in the list. + +This can be reproduced in the following way: + + mkfs.btrfs -f -O squota + mount + btrfs subvol create /sv + dd if=/dev/zero of=/sv/f bs=1M count=1 + sync + btrfs qgroup create 1/100 + btrfs qgroup create 2/200 + btrfs qgroup assign 1/100 2/200 + btrfs qgroup assign 0/256 1/100 + btrfs qgroup show + +and the broken result is (note the 2MiB on 1/100 and 0Mib on 2/100): + + Qgroupid Referenced Exclusive Path + -------- ---------- --------- ---- + 0/5 16.00KiB 16.00KiB + 0/256 1.02MiB 1.02MiB sv + + Qgroupid Referenced Exclusive Path + -------- ---------- --------- ---- + 0/5 16.00KiB 16.00KiB + 0/256 1.02MiB 1.02MiB sv + 1/100 2.03MiB 2.03MiB 2/100<1 member qgroup> + 2/100 0.00B 0.00B <0 member qgroups> + +With this fix, which simply re-uses `qgroup` as the iteration variable, +we see the expected result: + + Qgroupid Referenced Exclusive Path + -------- ---------- --------- ---- + 0/5 16.00KiB 16.00KiB + 0/256 1.02MiB 1.02MiB sv + + Qgroupid Referenced Exclusive Path + -------- ---------- --------- ---- + 0/5 16.00KiB 16.00KiB + 0/256 1.02MiB 1.02MiB sv + 1/100 1.02MiB 1.02MiB 2/100<1 member qgroup> + 2/100 1.02MiB 1.02MiB <0 member qgroups> + +The existing fstests did not exercise two layer inheritance so this bug +was missed. 
I intend to add that testing there, as well. + +Fixes: a0bdc04b0732 ("btrfs: qgroup: use qgroup_iterator in __qgroup_excl_accounting()") +CC: stable@vger.kernel.org # 6.12+ +Reviewed-by: Filipe Manana +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1492,7 +1492,6 @@ static int __qgroup_excl_accounting(stru + struct btrfs_qgroup *src, int sign) + { + struct btrfs_qgroup *qgroup; +- struct btrfs_qgroup *cur; + LIST_HEAD(qgroup_list); + u64 num_bytes = src->excl; + int ret = 0; +@@ -1502,7 +1501,7 @@ static int __qgroup_excl_accounting(stru + goto out; + + qgroup_iterator_add(&qgroup_list, qgroup); +- list_for_each_entry(cur, &qgroup_list, iterator) { ++ list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + qgroup->rfer += sign * num_bytes; diff --git a/queue-6.12/btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch b/queue-6.12/btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch new file mode 100644 index 0000000000..239852d255 --- /dev/null +++ b/queue-6.12/btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch @@ -0,0 +1,147 @@ +From 0a32e4f0025a74c70dcab4478e9b29c22f5ecf2f Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 30 Jul 2025 19:18:37 +0100 +Subject: btrfs: fix log tree replay failure due to file with 0 links and extents + +From: Filipe Manana + +commit 0a32e4f0025a74c70dcab4478e9b29c22f5ecf2f upstream. + +If we log a new inode (not persisted in a past transaction) that has 0 +links and extents, then log another inode with an higher inode number, we +end up with failing to replay the log tree with -EINVAL. 
The steps for +this are: + +1) create new file A +2) write some data to file A +3) open an fd on file A +4) unlink file A +5) fsync file A using the previously open fd +6) create file B (has higher inode number than file A) +7) fsync file B +8) power fail before current transaction commits + +Now when attempting to mount the fs, the log replay will fail with +-ENOENT at replay_one_extent() when attempting to replay the first +extent of file A. The failure comes when trying to open the inode for +file A in the subvolume tree, since it doesn't exist. + +Before commit 5f61b961599a ("btrfs: fix inode lookup error handling +during log replay"), the returned error was -EIO instead of -ENOENT, +since we converted any errors when attempting to read an inode during +log replay to -EIO. + +The reason for this is that the log replay procedure fails to ignore +the current inode when we are at the stage LOG_WALK_REPLAY_ALL, our +current inode has 0 links and last inode we processed in the previous +stage has a non 0 link count. In other words, the issue is that at +replay_one_extent() we only update wc->ignore_cur_inode if the current +replay stage is LOG_WALK_REPLAY_INODES. + +Fix this by updating wc->ignore_cur_inode whenever we find an inode item +regardless of the current replay stage. This is a simple solution and easy +to backport, but later we can do other alternatives like avoid logging +extents or inode items other than the inode item for inodes with a link +count of 0. 
+ +The problem with the wc->ignore_cur_inode logic has been around since +commit f2d72f42d5fa ("Btrfs: fix warning when replaying log after fsync +of a tmpfile") but it only became frequent to hit since the more recent +commit 5e85262e542d ("btrfs: fix fsync of files with no hard links not +persisting deletion"), because we stopped skipping inodes with a link +count of 0 when logging, while before the problem would only be triggered +if trying to replay a log tree created with an older kernel which has a +logged inode with 0 links. + +A test case for fstests will be submitted soon. + +Reported-by: Peter Jung +Link: https://lore.kernel.org/linux-btrfs/fce139db-4458-4788-bb97-c29acf6cb1df@cachyos.org/ +Reported-by: burneddi +Link: https://lore.kernel.org/linux-btrfs/lh4W-Lwc0Mbk-QvBhhQyZxf6VbM3E8VtIvU3fPIQgweP_Q1n7wtlUZQc33sYlCKYd-o6rryJQfhHaNAOWWRKxpAXhM8NZPojzsJPyHMf2qY=@protonmail.com/#t +Reported-by: Russell Haley +Link: https://lore.kernel.org/linux-btrfs/598ecc75-eb80-41b3-83c2-f2317fbb9864@gmail.com/ +Fixes: f2d72f42d5fa ("Btrfs: fix warning when replaying log after fsync of a tmpfile") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Boris Burkov +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 48 ++++++++++++++++++++++++++++++------------------ + 1 file changed, 30 insertions(+), 18 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -324,8 +324,7 @@ struct walk_control { + + /* + * Ignore any items from the inode currently being processed. Needs +- * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in +- * the LOG_WALK_REPLAY_INODES stage. ++ * to be set every time we find a BTRFS_INODE_ITEM_KEY. 
+ */ + bool ignore_cur_inode; + +@@ -2447,23 +2446,30 @@ static int replay_one_buffer(struct btrf + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { +- btrfs_item_key_to_cpu(eb, &key, i); ++ struct btrfs_inode_item *inode_item; + +- /* inode keys are done during the first stage */ +- if (key.type == BTRFS_INODE_ITEM_KEY && +- wc->stage == LOG_WALK_REPLAY_INODES) { +- struct btrfs_inode_item *inode_item; +- u32 mode; ++ btrfs_item_key_to_cpu(eb, &key, i); + +- inode_item = btrfs_item_ptr(eb, i, +- struct btrfs_inode_item); ++ if (key.type == BTRFS_INODE_ITEM_KEY) { ++ inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + /* +- * If we have a tmpfile (O_TMPFILE) that got fsync'ed +- * and never got linked before the fsync, skip it, as +- * replaying it is pointless since it would be deleted +- * later. We skip logging tmpfiles, but it's always +- * possible we are replaying a log created with a kernel +- * that used to log tmpfiles. ++ * An inode with no links is either: ++ * ++ * 1) A tmpfile (O_TMPFILE) that got fsync'ed and never ++ * got linked before the fsync, skip it, as replaying ++ * it is pointless since it would be deleted later. ++ * We skip logging tmpfiles, but it's always possible ++ * we are replaying a log created with a kernel that ++ * used to log tmpfiles; ++ * ++ * 2) A non-tmpfile which got its last link deleted ++ * while holding an open fd on it and later got ++ * fsynced through that fd. We always log the ++ * parent inodes when inode->last_unlink_trans is ++ * set to the current transaction, so ignore all the ++ * inode items for this inode. We will delete the ++ * inode when processing the parent directory with ++ * replay_dir_deletes(). 
+ */ + if (btrfs_inode_nlink(eb, inode_item) == 0) { + wc->ignore_cur_inode = true; +@@ -2471,8 +2477,14 @@ static int replay_one_buffer(struct btrf + } else { + wc->ignore_cur_inode = false; + } +- ret = replay_xattr_deletes(wc->trans, root, log, +- path, key.objectid); ++ } ++ ++ /* Inode keys are done during the first stage. */ ++ if (key.type == BTRFS_INODE_ITEM_KEY && ++ wc->stage == LOG_WALK_REPLAY_INODES) { ++ u32 mode; ++ ++ ret = replay_xattr_deletes(wc->trans, root, log, path, key.objectid); + if (ret) + break; + mode = btrfs_inode_mode(eb, inode_item); diff --git a/queue-6.12/btrfs-fix-ssd_spread-overallocation.patch b/queue-6.12/btrfs-fix-ssd_spread-overallocation.patch new file mode 100644 index 0000000000..5ac2349871 --- /dev/null +++ b/queue-6.12/btrfs-fix-ssd_spread-overallocation.patch @@ -0,0 +1,134 @@ +From 807d9023e75fc20bfd6dd2ac0408ce4af53f1648 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Mon, 14 Jul 2025 16:44:28 -0700 +Subject: btrfs: fix ssd_spread overallocation + +From: Boris Burkov + +commit 807d9023e75fc20bfd6dd2ac0408ce4af53f1648 upstream. + +If the ssd_spread mount option is enabled, then we run the so called +clustered allocator for data block groups. In practice, this results in +creating a btrfs_free_cluster which caches a block_group and borrows its +free extents for allocation. + +Since the introduction of allocation size classes in 6.1, there has been +a bug in the interaction between that feature and ssd_spread. +find_free_extent() has a number of nested loops. The loop going over the +allocation stages, stored in ffe_ctl->loop and managed by +find_free_extent_update_loop(), the loop over the raid levels, and the +loop over all the block_groups in a space_info. The size class feature +relies on the block_group loop to ensure it gets a chance to see a +block_group of a given size class. However, the clustered allocator +uses the cached cluster block_group and breaks that loop. 
Each call to +do_allocation() will really just go back to the same cached block_group. +Normally, this is OK, as the allocation either succeeds and we don't +want to loop any more or it fails, and we clear the cluster and return +its space to the block_group. + +But with size classes, the allocation can succeed, then later fail, +outside of do_allocation() due to size class mismatch. That latter +failure is not properly handled due to the highly complex multi loop +logic. The result is a painful loop where we continue to allocate the +same num_bytes from the cluster in a tight loop until it fails and +releases the cluster and lets us try a new block_group. But by then, we +have skipped great swaths of the available block_groups and are likely +to fail to allocate, looping the outer loop. In pathological cases like +the reproducer below, the cached block_group is often the very last one, +in which case we don't perform this tight bg loop but instead rip +through the ffe stages to LOOP_CHUNK_ALLOC and allocate a chunk, which +is now the last one, and we enter the tight inner loop until an +allocation failure. Then allocation succeeds on the final block_group +and if the next allocation is a size mismatch, the exact same thing +happens again. + +Triggering this is as easy as mounting with -o ssd_spread and then +running: + + mount -o ssd_spread $dev $mnt + dd if=/dev/zero of=$mnt/big bs=16M count=1 &>/dev/null + dd if=/dev/zero of=$mnt/med bs=4M count=1 &>/dev/null + sync + +If you do the two writes + sync in a loop, you can force btrfs to spin +an excessive amount on semi-successful clustered allocations, before +ultimately failing and advancing to the stage where we force a chunk +allocation. This results in 2G of data allocated per iteration, despite +only using ~20M of data. By using a small size classed extent, the inner +loop takes longer and we can spin for longer. 
+ +The simplest, shortest term fix to unbreak this is to make the clustered +allocator size_class aware in the dumbest way, where it fails on size +class mismatch. This may hinder the operation of the clustered +allocator, but better hindered than completely broken and terribly +overallocating. + +Further re-design improvements are also in the works. + +Fixes: 52bb7a2166af ("btrfs: introduce size class to block group allocator") +CC: stable@vger.kernel.org # 6.1+ +Reported-by: David Sterba +Reviewed-by: Filipe Manana +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 33 +++++++++++++++++---------------- + 1 file changed, 17 insertions(+), 16 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3654,6 +3654,21 @@ btrfs_release_block_group(struct btrfs_b + btrfs_put_block_group(cache); + } + ++static bool find_free_extent_check_size_class(const struct find_free_extent_ctl *ffe_ctl, ++ const struct btrfs_block_group *bg) ++{ ++ if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) ++ return true; ++ if (!btrfs_block_group_should_use_size_class(bg)) ++ return true; ++ if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) ++ return true; ++ if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && ++ bg->size_class == BTRFS_BG_SZ_NONE) ++ return true; ++ return ffe_ctl->size_class == bg->size_class; ++} ++ + /* + * Helper function for find_free_extent(). 
+ * +@@ -3675,7 +3690,8 @@ static int find_free_extent_clustered(st + if (!cluster_bg) + goto refill_cluster; + if (cluster_bg != bg && (cluster_bg->ro || +- !block_group_bits(cluster_bg, ffe_ctl->flags))) ++ !block_group_bits(cluster_bg, ffe_ctl->flags) || ++ !find_free_extent_check_size_class(ffe_ctl, cluster_bg))) + goto release_cluster; + + offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr, +@@ -4231,21 +4247,6 @@ static int find_free_extent_update_loop( + return -ENOSPC; + } + +-static bool find_free_extent_check_size_class(struct find_free_extent_ctl *ffe_ctl, +- struct btrfs_block_group *bg) +-{ +- if (ffe_ctl->policy == BTRFS_EXTENT_ALLOC_ZONED) +- return true; +- if (!btrfs_block_group_should_use_size_class(bg)) +- return true; +- if (ffe_ctl->loop >= LOOP_WRONG_SIZE_CLASS) +- return true; +- if (ffe_ctl->loop >= LOOP_UNSET_SIZE_CLASS && +- bg->size_class == BTRFS_BG_SZ_NONE) +- return true; +- return ffe_ctl->size_class == bg->size_class; +-} +- + static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, diff --git a/queue-6.12/btrfs-populate-otime-when-logging-an-inode-item.patch b/queue-6.12/btrfs-populate-otime-when-logging-an-inode-item.patch new file mode 100644 index 0000000000..42062e009c --- /dev/null +++ b/queue-6.12/btrfs-populate-otime-when-logging-an-inode-item.patch @@ -0,0 +1,109 @@ +From 1ef94169db0958d6de39f9ea6e063ce887342e2d Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Wed, 2 Jul 2025 15:08:13 +0930 +Subject: btrfs: populate otime when logging an inode item + +From: Qu Wenruo + +commit 1ef94169db0958d6de39f9ea6e063ce887342e2d upstream. 
+ +[TEST FAILURE WITH EXPERIMENTAL FEATURES] +When running test case generic/508, the test case will fail with the new +btrfs shutdown support: + +generic/508 - output mismatch (see /home/adam/xfstests/results//generic/508.out.bad) +# --- tests/generic/508.out 2022-05-11 11:25:30.806666664 +0930 +# +++ /home/adam/xfstests/results//generic/508.out.bad 2025-07-02 14:53:22.401824212 +0930 +# @@ -1,2 +1,6 @@ +# QA output created by 508 +# Silence is golden +# +Before: +# +After : stat.btime = Thu Jan 1 09:30:00 1970 +# +Before: +# +After : stat.btime = Wed Jul 2 14:53:22 2025 +# ... +# (Run 'diff -u /home/adam/xfstests/tests/generic/508.out /home/adam/xfstests/results//generic/508.out.bad' to see the entire diff) +Ran: generic/508 +Failures: generic/508 +Failed 1 of 1 tests + +Please note that the test case requires shutdown support, thus the test +case will be skipped using the current upstream kernel, as it doesn't +have shutdown ioctl support. + +[CAUSE] +The direct cause is the 0 time stamp in the log tree: + +leaf 30507008 items 2 free space 16057 generation 9 owner TREE_LOG +leaf 30507008 flags 0x1(WRITTEN) backref revision 1 +checksum stored e522548d +checksum calced e522548d +fs uuid 57d45451-481e-43e4-aa93-289ad707a3a0 +chunk uuid d52bd3fd-5163-4337-98a7-7986993ad398 + item 0 key (257 INODE_ITEM 0) itemoff 16123 itemsize 160 + generation 9 transid 9 size 0 nbytes 0 + block group 0 mode 100644 links 1 uid 0 gid 0 rdev 0 + sequence 1 flags 0x0(none) + atime 1751432947.492000000 (2025-07-02 14:39:07) + ctime 1751432947.492000000 (2025-07-02 14:39:07) + mtime 1751432947.492000000 (2025-07-02 14:39:07) + otime 0.0 (1970-01-01 09:30:00) <<< + +But the old fs tree has all the correct time stamps: + +btrfs-progs v6.12 +fs tree key (FS_TREE ROOT_ITEM 0) +leaf 30425088 items 2 free space 16061 generation 5 owner FS_TREE +leaf 30425088 flags 0x1(WRITTEN) backref revision 1 +checksum stored 48f6c57e +checksum calced 48f6c57e +fs uuid 57d45451-481e-43e4-aa93-289ad707a3a0
+chunk uuid d52bd3fd-5163-4337-98a7-7986993ad398 + item 0 key (256 INODE_ITEM 0) itemoff 16123 itemsize 160 + generation 3 transid 0 size 0 nbytes 16384 + block group 0 mode 40755 links 1 uid 0 gid 0 rdev 0 + sequence 0 flags 0x0(none) + atime 1751432947.0 (2025-07-02 14:39:07) + ctime 1751432947.0 (2025-07-02 14:39:07) + mtime 1751432947.0 (2025-07-02 14:39:07) + otime 1751432947.0 (2025-07-02 14:39:07) <<< + +The root cause is that fill_inode_item() in tree-log.c is only +populating a/c/m time, not the otime (or btime in statx output). + +Part of the reason is that, the vfs inode only has a/c/m time, no native +btime support yet. + +[FIX] +Thankfully btrfs has its otime stored in btrfs_inode::i_otime_sec and +btrfs_inode::i_otime_nsec. + +So what we really need is just fill the otime time stamp in +fill_inode_item() of tree-log.c + +There is another fill_inode_item() in inode.c, which is doing the proper +otime population. + +Fixes: 94edf4ae43a5 ("Btrfs: don't bother committing delayed inode updates when fsyncing") +CC: stable@vger.kernel.org +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -4235,6 +4235,9 @@ static void fill_inode_item(struct btrfs + btrfs_set_token_timespec_nsec(&token, &item->ctime, + inode_get_ctime_nsec(inode)); + ++ btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); ++ btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); ++ + /* + * We do not need to set the nbytes field, in fact during a fast fsync + * its value may not even be correct, since a fast fsync does not wait diff --git a/queue-6.12/btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch b/queue-6.12/btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch new 
file mode 100644 index 0000000000..d76962cc15 --- /dev/null +++ b/queue-6.12/btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch @@ -0,0 +1,75 @@ +From 08530d6e638427e7e1344bd67bacc03882ba95b9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 1 Jul 2025 15:44:16 +0100 +Subject: btrfs: qgroup: fix qgroup create ioctl returning success after quotas disabled + +From: Filipe Manana + +commit 08530d6e638427e7e1344bd67bacc03882ba95b9 upstream. + +When quotas are disabled qgroup ioctls are supposed to return -ENOTCONN, +but the qgroup create ioctl stopped doing that when it races with a quota +disable operation, returning 0 instead. This change of behaviour happened +in commit 6ed05643ddb1 ("btrfs: create qgroup earlier in snapshot +creation"). + +The issue happens as follows: + +1) Task A enters btrfs_ioctl_qgroup_create(), qgroups are enabled and so + qgroup_enabled() returns true since fs_info->quota_root is not NULL; + +2) Task B enters btrfs_ioctl_quota_ctl() -> btrfs_quota_disable() and + disables qgroups, so now fs_info->quota_root is NULL; + +3) Task A enters btrfs_create_qgroup() and calls btrfs_qgroup_mode(), + which returns BTRFS_QGROUP_MODE_DISABLED since quotas are disabled, + and then btrfs_create_qgroup() returns 0 to the caller, which makes + the ioctl return 0 instead of -ENOTCONN. + + The check for fs_info->quota_root and returning -ENOTCONN if it's NULL + is made only after the call btrfs_qgroup_mode(). + +Fix this by moving the check for disabled quotas with btrfs_qgroup_mode() +into transaction.c:create_pending_snapshot(), so that we don't abort the +transaction if btrfs_create_qgroup() returns -ENOTCONN and quotas are +disabled. 
+ +Fixes: 6ed05643ddb1 ("btrfs: create qgroup earlier in snapshot creation") +CC: stable@vger.kernel.org # 6.12+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 3 --- + fs/btrfs/transaction.c | 6 ++++-- + 2 files changed, 4 insertions(+), 5 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1701,9 +1701,6 @@ int btrfs_create_qgroup(struct btrfs_tra + struct btrfs_qgroup *prealloc = NULL; + int ret = 0; + +- if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) +- return 0; +- + mutex_lock(&fs_info->qgroup_ioctl_lock); + if (!fs_info->quota_root) { + ret = -ENOTCONN; +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -1739,8 +1739,10 @@ static noinline int create_pending_snaps + + ret = btrfs_create_qgroup(trans, objectid); + if (ret && ret != -EEXIST) { +- btrfs_abort_transaction(trans, ret); +- goto fail; ++ if (ret != -ENOTCONN || btrfs_qgroup_enabled(fs_info)) { ++ btrfs_abort_transaction(trans, ret); ++ goto fail; ++ } + } + + /* diff --git a/queue-6.12/btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch b/queue-6.12/btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch new file mode 100644 index 0000000000..010c11cfe8 --- /dev/null +++ b/queue-6.12/btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch @@ -0,0 +1,48 @@ +From e41c75ca3189341e76e6af64b857c05b68a1d7db Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 1 Jul 2025 11:39:44 +0100 +Subject: btrfs: qgroup: set quota enabled bit if quota disable fails flushing reservations + +From: Filipe Manana + +commit e41c75ca3189341e76e6af64b857c05b68a1d7db upstream. + +Before waiting for the rescan worker to finish and flushing reservations, +we clear the BTRFS_FS_QUOTA_ENABLED flag from fs_info. 
If we fail flushing +reservations we leave with the flag not set which is not correct since +quotas are still enabled - we must set back the flag on error paths, such +as when we fail to start a transaction, except for error paths that abort +a transaction. The reservation flushing happens very early before we do +any operation that actually disables quotas and before we start a +transaction, so set back BTRFS_FS_QUOTA_ENABLED if it fails. + +Fixes: af0e2aab3b70 ("btrfs: qgroup: flush reservations during quota disable") +CC: stable@vger.kernel.org # 6.12+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1373,11 +1373,14 @@ int btrfs_quota_disable(struct btrfs_fs_ + + /* + * We have nothing held here and no trans handle, just return the error +- * if there is one. ++ * if there is one and set back the quota enabled bit since we didn't ++ * actually disable quotas. + */ + ret = flush_reservations(fs_info); +- if (ret) ++ if (ret) { ++ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + return ret; ++ } + + /* + * 1 For the root item diff --git a/queue-6.12/btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch b/queue-6.12/btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch new file mode 100644 index 0000000000..fe7747e7f5 --- /dev/null +++ b/queue-6.12/btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch @@ -0,0 +1,88 @@ +From 3061801420469610c8fa6080a950e56770773ef1 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Sun, 29 Jun 2025 23:07:42 +0900 +Subject: btrfs: zoned: do not remove unwritten non-data block group + +From: Naohiro Aota + +commit 3061801420469610c8fa6080a950e56770773ef1 upstream. 
+ +There are some reports of "unable to find chunk map for logical 2147483648 +length 16384" error messages appearing in dmesg. This means some IOs are +occurring after a block group is removed. + +When a metadata tree node is cleaned on a zoned setup, we keep that node +still dirty and write it out not to create a write hole. However, this can +make a block group's used bytes == 0 while there is a dirty region left. + +Such an unused block group is moved into the unused_bg list and processed +for removal. When the removal succeeds, the block group is removed from the +transaction->dirty_bgs list, so the unused dirty nodes in the block group +are not sent at the transaction commit time. It will be written at some +later time e.g., sync or umount, and causes "unable to find chunk map" +errors. + +This can happen relatively easily on SMR whose zone size is 256MB. However, +calling do_zone_finish() on such a block group returns -EAGAIN and keeps that +block group intact, which is why the issue is hidden until now. + +Fixes: afba2bc036b0 ("btrfs: zoned: implement active zone tracking") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -34,6 +34,19 @@ int btrfs_should_fragment_free_space(con + } + #endif + ++static inline bool has_unwritten_metadata(struct btrfs_block_group *block_group) ++{ ++ /* The meta_write_pointer is available only on the zoned setup.
*/ ++ if (!btrfs_is_zoned(block_group->fs_info)) ++ return false; ++ ++ if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) ++ return false; ++ ++ return block_group->start + block_group->alloc_offset > ++ block_group->meta_write_pointer; ++} ++ + /* + * Return target flags in extended format or 0 if restripe for this chunk_type + * is not in progress +@@ -1249,6 +1262,15 @@ int btrfs_remove_block_group(struct btrf + goto out; + + spin_lock(&block_group->lock); ++ /* ++ * Hitting this WARN means we removed a block group with an unwritten ++ * region. It will cause "unable to find chunk map for logical" errors. ++ */ ++ if (WARN_ON(has_unwritten_metadata(block_group))) ++ btrfs_warn(fs_info, ++ "block group %llu is removed before metadata write out", ++ block_group->start); ++ + set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags); + + /* +@@ -1567,8 +1589,9 @@ void btrfs_delete_unused_bgs(struct btrf + * needing to allocate extents from the block group. + */ + used = btrfs_space_info_used(space_info, true); +- if (space_info->total_bytes - block_group->length < used && +- block_group->zone_unusable < block_group->length) { ++ if ((space_info->total_bytes - block_group->length < used && ++ block_group->zone_unusable < block_group->length) || ++ has_unwritten_metadata(block_group)) { + /* + * Add a reference for the list, compensate for the ref + * drop under the "next" label for the diff --git a/queue-6.12/btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch b/queue-6.12/btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch new file mode 100644 index 0000000000..0ba287b08e --- /dev/null +++ b/queue-6.12/btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch @@ -0,0 +1,37 @@ +From 3a931e9b39c7ff8066657042f5f00d3b7e6ad315 Mon Sep 17 00:00:00 2001 +From: Naohiro Aota +Date: Wed, 16 Jul 2025 16:59:52 +0900 +Subject: btrfs: zoned: do not select metadata BG as finish target + +From: Naohiro Aota + +commit 
3a931e9b39c7ff8066657042f5f00d3b7e6ad315 upstream. + +We call btrfs_zone_finish_one_bg() to zone finish one block group and make +room to activate another block group. Currently, we can choose a metadata +block group as a target. But, as we reserve an active metadata block group, +we no longer want to select a metadata block group. So, skip it in the +loop. + +CC: stable@vger.kernel.org # 6.6+ +Reviewed-by: Damien Le Moal +Reviewed-by: Johannes Thumshirn +Signed-off-by: Naohiro Aota +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/zoned.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2523,7 +2523,7 @@ int btrfs_zone_finish_one_bg(struct btrf + + spin_lock(&block_group->lock); + if (block_group->reserved || block_group->alloc_offset == 0 || +- (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || ++ !(block_group->flags & BTRFS_BLOCK_GROUP_DATA) || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + continue; diff --git a/queue-6.12/btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch b/queue-6.12/btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch new file mode 100644 index 0000000000..b9f3e1d22d --- /dev/null +++ b/queue-6.12/btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch @@ -0,0 +1,48 @@ +From 55f7c65b2f69c7e4cb7aa7c1654a228ccf734fd8 Mon Sep 17 00:00:00 2001 +From: Johannes Thumshirn +Date: Tue, 20 May 2025 09:20:47 +0200 +Subject: btrfs: zoned: use filesystem size not disk size for reclaim decision + +From: Johannes Thumshirn + +commit 55f7c65b2f69c7e4cb7aa7c1654a228ccf734fd8 upstream. + +When deciding if a zoned filesystem is reaching the threshold to reclaim +data block groups, look at the size of the filesystem, not the potentially +larger total available size of all drives in the filesystem.
+ +Especially if a filesystem was created with mkfs' -b option, constraining +it to only a portion of the block device, the numbers won't match and +potentially garbage collection is kicking in too late. + +Fixes: 3687fcb0752a ("btrfs: zoned: make auto-reclaim less aggressive") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Damien Le Moal +Tested-by: Damien Le Moal +Signed-off-by: Johannes Thumshirn +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/zoned.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/btrfs/zoned.c ++++ b/fs/btrfs/zoned.c +@@ -2456,8 +2456,8 @@ bool btrfs_zoned_should_reclaim(const st + { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; ++ u64 total = btrfs_super_total_bytes(fs_info->super_copy); + u64 used = 0; +- u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); +@@ -2470,7 +2470,6 @@ bool btrfs_zoned_should_reclaim(const st + if (!device->bdev) + continue; + +- total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); diff --git a/queue-6.12/cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch b/queue-6.12/cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch new file mode 100644 index 0000000000..e5bc2ae952 --- /dev/null +++ b/queue-6.12/cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch @@ -0,0 +1,50 @@ +From 64690a90cd7c6db16d3af8616be1f4bf8d492850 Mon Sep 17 00:00:00 2001 +From: Oliver Neukum +Date: Thu, 17 Jul 2025 16:12:50 +0200 +Subject: cdc-acm: fix race between initial clearing halt and open + +From: Oliver Neukum + +commit 64690a90cd7c6db16d3af8616be1f4bf8d492850 upstream. + +On the devices that need their endpoints to get an +initial clear_halt, this needs to be done before +the devices can be opened. That means it needs to be +before the devices are registered. 
+ +Fixes: 15bf722e6f6c0 ("cdc-acm: Add support of ATOL FPrint fiscal printers") +Cc: stable +Signed-off-by: Oliver Neukum +Link: https://lore.kernel.org/r/20250717141259.2345605-1-oneukum@suse.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/usb/class/cdc-acm.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +--- a/drivers/usb/class/cdc-acm.c ++++ b/drivers/usb/class/cdc-acm.c +@@ -1520,6 +1520,12 @@ skip_countries: + goto err_remove_files; + } + ++ if (quirks & CLEAR_HALT_CONDITIONS) { ++ /* errors intentionally ignored */ ++ usb_clear_halt(usb_dev, acm->in); ++ usb_clear_halt(usb_dev, acm->out); ++ } ++ + tty_dev = tty_port_register_device(&acm->port, acm_tty_driver, minor, + &control_interface->dev); + if (IS_ERR(tty_dev)) { +@@ -1527,11 +1533,6 @@ skip_countries: + goto err_release_data_interface; + } + +- if (quirks & CLEAR_HALT_CONDITIONS) { +- usb_clear_halt(usb_dev, acm->in); +- usb_clear_halt(usb_dev, acm->out); +- } +- + dev_info(&intf->dev, "ttyACM%d: USB ACM device\n", minor); + + return 0; diff --git a/queue-6.12/comedi-fix-race-between-polling-and-detaching.patch b/queue-6.12/comedi-fix-race-between-polling-and-detaching.patch new file mode 100644 index 0000000000..c65376d6e2 --- /dev/null +++ b/queue-6.12/comedi-fix-race-between-polling-and-detaching.patch @@ -0,0 +1,157 @@ +From 35b6fc51c666fc96355be5cd633ed0fe4ccf68b2 Mon Sep 17 00:00:00 2001 +From: Ian Abbott +Date: Tue, 22 Jul 2025 16:53:16 +0100 +Subject: comedi: fix race between polling and detaching + +From: Ian Abbott + +commit 35b6fc51c666fc96355be5cd633ed0fe4ccf68b2 upstream. + +syzbot reports a use-after-free in comedi in the below link, which is +due to comedi gladly removing the allocated async area even though poll +requests are still active on the wait_queue_head inside of it. This can +cause a use-after-free when the poll entries are later triggered or +removed, as the memory for the wait_queue_head has been freed. 
We need +to check there are no tasks queued on any of the subdevices' wait queues +before allowing the device to be detached by the `COMEDI_DEVCONFIG` +ioctl. + +Tasks will read-lock `dev->attach_lock` before adding themselves to the +subdevice wait queue, so fix the problem in the `COMEDI_DEVCONFIG` ioctl +handler by write-locking `dev->attach_lock` before checking that all of +the subdevices are safe to be deleted. This includes testing for any +sleepers on the subdevices' wait queues. It remains locked until the +device has been detached. This requires the `comedi_device_detach()` +function to be refactored slightly, moving the bulk of it into new +function `comedi_device_detach_locked()`. + +Note that the refactor of `comedi_device_detach()` results in +`comedi_device_cancel_all()` now being called while `dev->attach_lock` +is write-locked, which wasn't the case previously, but that does not +matter. + +Thanks to Jens Axboe for diagnosing the problem and co-developing this +patch. + +Cc: stable +Fixes: 2f3fdcd7ce93 ("staging: comedi: add rw_semaphore to protect against device detachment") +Link: https://lore.kernel.org/all/687bd5fe.a70a0220.693ce.0091.GAE@google.com/ +Reported-by: syzbot+01523a0ae5600aef5895@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=01523a0ae5600aef5895 +Co-developed-by: Jens Axboe +Signed-off-by: Jens Axboe +Signed-off-by: Ian Abbott +Tested-by: Jens Axboe +Link: https://lore.kernel.org/r/20250722155316.27432-1-abbotti@mev.co.uk +Signed-off-by: Greg Kroah-Hartman +--- + drivers/comedi/comedi_fops.c | 33 +++++++++++++++++++++++++-------- + drivers/comedi/comedi_internal.h | 1 + + drivers/comedi/drivers.c | 13 ++++++++++--- + 3 files changed, 36 insertions(+), 11 deletions(-) + +--- a/drivers/comedi/comedi_fops.c ++++ b/drivers/comedi/comedi_fops.c +@@ -787,6 +787,7 @@ static int is_device_busy(struct comedi_ + struct comedi_subdevice *s; + int i; + ++ lockdep_assert_held_write(&dev->attach_lock); + 
lockdep_assert_held(&dev->mutex); + if (!dev->attached) + return 0; +@@ -795,7 +796,16 @@ static int is_device_busy(struct comedi_ + s = &dev->subdevices[i]; + if (s->busy) + return 1; +- if (s->async && comedi_buf_is_mmapped(s)) ++ if (!s->async) ++ continue; ++ if (comedi_buf_is_mmapped(s)) ++ return 1; ++ /* ++ * There may be tasks still waiting on the subdevice's wait ++ * queue, although they should already be about to be removed ++ * from it since the subdevice has no active async command. ++ */ ++ if (wq_has_sleeper(&s->async->wait_head)) + return 1; + } + +@@ -825,15 +835,22 @@ static int do_devconfig_ioctl(struct com + return -EPERM; + + if (!arg) { +- if (is_device_busy(dev)) +- return -EBUSY; +- if (dev->attached) { +- struct module *driver_module = dev->driver->module; ++ int rc = 0; + +- comedi_device_detach(dev); +- module_put(driver_module); ++ if (dev->attached) { ++ down_write(&dev->attach_lock); ++ if (is_device_busy(dev)) { ++ rc = -EBUSY; ++ } else { ++ struct module *driver_module = ++ dev->driver->module; ++ ++ comedi_device_detach_locked(dev); ++ module_put(driver_module); ++ } ++ up_write(&dev->attach_lock); + } +- return 0; ++ return rc; + } + + if (copy_from_user(&it, arg, sizeof(it))) +--- a/drivers/comedi/comedi_internal.h ++++ b/drivers/comedi/comedi_internal.h +@@ -50,6 +50,7 @@ extern struct mutex comedi_drivers_list_ + int insn_inval(struct comedi_device *dev, struct comedi_subdevice *s, + struct comedi_insn *insn, unsigned int *data); + ++void comedi_device_detach_locked(struct comedi_device *dev); + void comedi_device_detach(struct comedi_device *dev); + int comedi_device_attach(struct comedi_device *dev, + struct comedi_devconfig *it); +--- a/drivers/comedi/drivers.c ++++ b/drivers/comedi/drivers.c +@@ -158,7 +158,7 @@ static void comedi_device_detach_cleanup + int i; + struct comedi_subdevice *s; + +- lockdep_assert_held(&dev->attach_lock); ++ lockdep_assert_held_write(&dev->attach_lock); + lockdep_assert_held(&dev->mutex); + if 
(dev->subdevices) { + for (i = 0; i < dev->n_subdevices; i++) { +@@ -196,16 +196,23 @@ static void comedi_device_detach_cleanup + comedi_clear_hw_dev(dev); + } + +-void comedi_device_detach(struct comedi_device *dev) ++void comedi_device_detach_locked(struct comedi_device *dev) + { ++ lockdep_assert_held_write(&dev->attach_lock); + lockdep_assert_held(&dev->mutex); + comedi_device_cancel_all(dev); +- down_write(&dev->attach_lock); + dev->attached = false; + dev->detach_count++; + if (dev->driver) + dev->driver->detach(dev); + comedi_device_detach_cleanup(dev); ++} ++ ++void comedi_device_detach(struct comedi_device *dev) ++{ ++ lockdep_assert_held(&dev->mutex); ++ down_write(&dev->attach_lock); ++ comedi_device_detach_locked(dev); + up_write(&dev->attach_lock); + } + diff --git a/queue-6.12/series b/queue-6.12/series index 726cfe4e80..b3aa5c0eac 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -385,3 +385,22 @@ ext4-initialize-superblock-fields-in-the-kballoc-test.c-kunit-tests.patch usb-core-config-prevent-oob-read-in-ss-endpoint-companion-parsing.patch misc-rtsx-usb-ensure-mmc-child-device-is-active-when-card-is-present.patch usb-typec-ucsi-update-power_supply-on-power-role-change.patch +comedi-fix-race-between-polling-and-detaching.patch +thunderbolt-fix-copy-paste-error-in-match_service_id.patch +cdc-acm-fix-race-between-initial-clearing-halt-and-open.patch +btrfs-zoned-use-filesystem-size-not-disk-size-for-reclaim-decision.patch +btrfs-abort-transaction-during-log-replay-if-walk_log_tree-failed.patch +btrfs-zoned-do-not-remove-unwritten-non-data-block-group.patch +btrfs-qgroup-set-quota-enabled-bit-if-quota-disable-fails-flushing-reservations.patch +btrfs-don-t-ignore-inode-missing-when-replaying-log-tree.patch +btrfs-fix-ssd_spread-overallocation.patch +btrfs-populate-otime-when-logging-an-inode-item.patch +btrfs-qgroup-fix-qgroup-create-ioctl-returning-success-after-quotas-disabled.patch 
+btrfs-don-t-skip-remaining-extrefs-if-dir-not-found-during-log-replay.patch +btrfs-clear-dirty-status-from-extent-buffer-on-error-at-insert_new_root.patch +btrfs-fix-log-tree-replay-failure-due-to-file-with-0-links-and-extents.patch +btrfs-error-on-missing-block-group-when-unaccounting-log-tree-extent-buffers.patch +btrfs-zoned-do-not-select-metadata-bg-as-finish-target.patch +btrfs-fix-iteration-bug-in-__qgroup_excl_accounting.patch +btrfs-do-not-allow-relocation-of-partially-dropped-subvolumes.patch +xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch diff --git a/queue-6.12/thunderbolt-fix-copy-paste-error-in-match_service_id.patch b/queue-6.12/thunderbolt-fix-copy-paste-error-in-match_service_id.patch new file mode 100644 index 0000000000..44e1f33a08 --- /dev/null +++ b/queue-6.12/thunderbolt-fix-copy-paste-error-in-match_service_id.patch @@ -0,0 +1,32 @@ +From 5cc1f66cb23cccc704e3def27ad31ed479e934a5 Mon Sep 17 00:00:00 2001 +From: Eric Biggers +Date: Sun, 20 Jul 2025 22:01:36 -0700 +Subject: thunderbolt: Fix copy+paste error in match_service_id() + +From: Eric Biggers + +commit 5cc1f66cb23cccc704e3def27ad31ed479e934a5 upstream. + +The second instance of TBSVC_MATCH_PROTOCOL_VERSION seems to have been +intended to be TBSVC_MATCH_PROTOCOL_REVISION. 
+ +Fixes: d1ff70241a27 ("thunderbolt: Add support for XDomain discovery protocol") +Cc: stable +Signed-off-by: Eric Biggers +Link: https://lore.kernel.org/r/20250721050136.30004-1-ebiggers@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/thunderbolt/domain.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/thunderbolt/domain.c ++++ b/drivers/thunderbolt/domain.c +@@ -36,7 +36,7 @@ static bool match_service_id(const struc + return false; + } + +- if (id->match_flags & TBSVC_MATCH_PROTOCOL_VERSION) { ++ if (id->match_flags & TBSVC_MATCH_PROTOCOL_REVISION) { + if (id->protocol_revision != svc->prtcrevs) + return false; + } diff --git a/queue-6.12/xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch b/queue-6.12/xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch new file mode 100644 index 0000000000..bd25bdb9a9 --- /dev/null +++ b/queue-6.12/xfs-fix-scrub-trace-with-null-pointer-in-quotacheck.patch @@ -0,0 +1,32 @@ +From 5d94b19f066480addfcdcb5efde66152ad5a7c0e Mon Sep 17 00:00:00 2001 +From: Andrey Albershteyn +Date: Thu, 31 Jul 2025 19:07:22 +0200 +Subject: xfs: fix scrub trace with null pointer in quotacheck + +From: Andrey Albershteyn + +commit 5d94b19f066480addfcdcb5efde66152ad5a7c0e upstream. + +The quotacheck doesn't initialize sc->ip. + +Cc: stable@vger.kernel.org # v6.8 +Fixes: 21d7500929c8a0 ("xfs: improve dquot iteration for scrub") +Reviewed-by: Darrick J. 
Wong +Signed-off-by: Andrey Albershteyn +Signed-off-by: Carlos Maiolino +Signed-off-by: Greg Kroah-Hartman +--- + fs/xfs/scrub/trace.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/xfs/scrub/trace.h ++++ b/fs/xfs/scrub/trace.h +@@ -467,7 +467,7 @@ DECLARE_EVENT_CLASS(xchk_dqiter_class, + __field(xfs_exntst_t, state) + ), + TP_fast_assign( +- __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev; ++ __entry->dev = cursor->sc->mp->m_super->s_dev; + __entry->dqtype = cursor->dqtype; + __entry->ino = cursor->quota_ip->i_ino; + __entry->cur_id = cursor->id;