From 3ea1a4961a0b50b431bb8040fb769e52c17cbc8d Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 18 Feb 2024 10:41:36 +0100 Subject: [PATCH] 6.1-stable patches added patches: btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch btrfs-forbid-creating-subvol-qgroups.patch btrfs-forbid-deleting-live-subvol-qgroup.patch btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch btrfs-send-return-eopnotsupp-on-unknown-flags.patch driver-core-fix-device_link_flag_is_sync_state_only.patch --- ...lper-to-check-if-block-group-is-used.patch | 53 +++++++ ...y-created-subvolume-already-got-read.patch | 92 +++++++++++ ...d-block-group-if-it-may-be-used-soon.patch | 150 ++++++++++++++++++ ...-for-free-space-inode-on-write-error.patch | 94 +++++++++++ ...hecksums-when-writing-to-nocow-files.patch | 109 +++++++++++++ ...btrfs-forbid-creating-subvol-qgroups.patch | 40 +++++ ...s-forbid-deleting-live-subvol-qgroup.patch | 54 +++++++ ...rite-if-inode-has-nodatasum-flag-set.patch | 55 +++++++ ...d-return-eopnotsupp-on-unknown-flags.patch | 35 ++++ ...-device_link_flag_is_sync_state_only.patch | 43 +++++ queue-6.1/series | 10 ++ 11 files changed, 735 insertions(+) create mode 100644 queue-6.1/btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch create mode 100644 queue-6.1/btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch create mode 100644 queue-6.1/btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch create mode 100644 queue-6.1/btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch create mode 100644 queue-6.1/btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch create mode 100644 
queue-6.1/btrfs-forbid-creating-subvol-qgroups.patch create mode 100644 queue-6.1/btrfs-forbid-deleting-live-subvol-qgroup.patch create mode 100644 queue-6.1/btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch create mode 100644 queue-6.1/btrfs-send-return-eopnotsupp-on-unknown-flags.patch create mode 100644 queue-6.1/driver-core-fix-device_link_flag_is_sync_state_only.patch diff --git a/queue-6.1/btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch b/queue-6.1/btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch new file mode 100644 index 00000000000..52498e3a2a9 --- /dev/null +++ b/queue-6.1/btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch @@ -0,0 +1,53 @@ +From 1693d5442c458ae8d5b0d58463b873cd879569ed Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 25 Jan 2024 09:53:06 +0000 +Subject: btrfs: add and use helper to check if block group is used + +From: Filipe Manana + +commit 1693d5442c458ae8d5b0d58463b873cd879569ed upstream. + +Add a helper function to determine if a block group is being used and make +use of it at btrfs_delete_unused_bgs(). This helper will also be used in +future code changes. 
+ +Reviewed-by: Johannes Thumshirn +Reviewed-by: Josef Bacik +Reviewed-by: Boris Burkov +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 3 +-- + fs/btrfs/block-group.h | 7 +++++++ + 2 files changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1375,8 +1375,7 @@ void btrfs_delete_unused_bgs(struct btrf + } + + spin_lock(&block_group->lock); +- if (block_group->reserved || block_group->pinned || +- block_group->used || block_group->ro || ++ if (btrfs_is_block_group_used(block_group) || block_group->ro || + list_is_singular(&block_group->list)) { + /* + * We want to bail if we made new allocations or have +--- a/fs/btrfs/block-group.h ++++ b/fs/btrfs/block-group.h +@@ -241,6 +241,13 @@ static inline u64 btrfs_block_group_end( + return (block_group->start + block_group->length); + } + ++static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg) ++{ ++ lockdep_assert_held(&bg->lock); ++ ++ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0); ++} ++ + static inline bool btrfs_is_block_group_data_only( + struct btrfs_block_group *block_group) + { diff --git a/queue-6.1/btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch b/queue-6.1/btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch new file mode 100644 index 00000000000..25d3641cf83 --- /dev/null +++ b/queue-6.1/btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch @@ -0,0 +1,92 @@ +From e03ee2fe873eb68c1f9ba5112fee70303ebf9dfb Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Sat, 20 Jan 2024 19:41:28 +1030 +Subject: btrfs: do not ASSERT() if the newly created subvolume already got read + +From: Qu Wenruo + +commit e03ee2fe873eb68c1f9ba5112fee70303ebf9dfb upstream. 
+ +[BUG] +There is a syzbot crash, triggered by the ASSERT() during subvolume +creation: + + assertion failed: !anon_dev, in fs/btrfs/disk-io.c:1319 + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/disk-io.c:1319! + invalid opcode: 0000 [#1] PREEMPT SMP KASAN + RIP: 0010:btrfs_get_root_ref.part.0+0x9aa/0xa60 + + btrfs_get_new_fs_root+0xd3/0xf0 + create_subvol+0xd02/0x1650 + btrfs_mksubvol+0xe95/0x12b0 + __btrfs_ioctl_snap_create+0x2f9/0x4f0 + btrfs_ioctl_snap_create+0x16b/0x200 + btrfs_ioctl+0x35f0/0x5cf0 + __x64_sys_ioctl+0x19d/0x210 + do_syscall_64+0x3f/0xe0 + entry_SYSCALL_64_after_hwframe+0x63/0x6b + ---[ end trace 0000000000000000 ]--- + +[CAUSE] +During create_subvol(), after inserting the root item for the newly created +subvolume, we would trigger btrfs_get_new_fs_root() to get the +btrfs_root of that subvolume. + +The idea here is, we have preallocated an anonymous device number for +the subvolume, thus we can assign it to the new subvolume. + +But there is really nothing preventing things like backref walk from reading +the new subvolume. +If that happens before we call btrfs_get_new_fs_root(), the subvolume +would be read out, with a new anonymous device number assigned already. + +In that case, we would trigger ASSERT(), as we really expect no one to +read out that subvolume (which is not yet accessible from the fs). +But things like backref walk can still trigger a read of +the subvolume. + +Thus our assumption on the ASSERT() is not correct in the first place. + +[FIX] +Fix it by removing the ASSERT(), and just free the @anon_dev, reset it +to 0, and continue. + +If the subvolume tree is read out by something else, it should have +already gotten a new anon_dev assigned, thus we only need to free the +preallocated one. 
+ +Reported-by: Chenyuan Yang +Fixes: 2dfb1e43f57d ("btrfs: preallocate anon block device at first phase of snapshot creation") +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/disk-io.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1662,8 +1662,17 @@ static struct btrfs_root *btrfs_get_root + again: + root = btrfs_lookup_fs_root(fs_info, objectid); + if (root) { +- /* Shouldn't get preallocated anon_dev for cached roots */ +- ASSERT(!anon_dev); ++ /* ++ * Some other caller may have read out the newly inserted ++ * subvolume already (for things like backref walk etc). Not ++ * that common but still possible. In that case, we just need ++ * to free the anon_dev. ++ */ ++ if (unlikely(anon_dev)) { ++ free_anon_bdev(anon_dev); ++ anon_dev = 0; ++ } ++ + if (check_ref && btrfs_root_refs(&root->root_item) == 0) { + btrfs_put_root(root); + return ERR_PTR(-ENOENT); diff --git a/queue-6.1/btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch b/queue-6.1/btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch new file mode 100644 index 00000000000..066cb6f664a --- /dev/null +++ b/queue-6.1/btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch @@ -0,0 +1,150 @@ +From f4a9f219411f318ae60d6ff7f129082a75686c6c Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 25 Jan 2024 09:53:14 +0000 +Subject: btrfs: do not delete unused block group if it may be used soon + +From: Filipe Manana + +commit f4a9f219411f318ae60d6ff7f129082a75686c6c upstream. + +Before deleting a block group that is in the list of unused block groups +(fs_info->unused_bgs), we check if the block group became used before +deleting it, as extents from it may have been allocated after it was added +to the list. 
+ +However even if the block group was not yet used, there may be tasks that +have only reserved space and have not yet allocated extents, and they +might be relying on the availability of the unused block group in order +to allocate extents. The reservation works first by increasing the +"bytes_may_use" field of the corresponding space_info object (which may +first require flushing delayed items, allocating a new block group, etc), +and only later a task does the actual allocation of extents. + +For metadata we usually don't end up using all reserved space, as we are +pessimistic and typically account for the worst cases (need to COW every +single node in a path of a tree at maximum possible height, etc). For +data we usually reserve the exact amount of space we're going to allocate +later, except when using compression where we always reserve space based +on the uncompressed size, as compression is only triggered when writeback +starts so we don't know in advance how much space we'll actually need, or +if the data is compressible. + +So don't delete an unused block group if the total size of its space_info +object minus the block group's size is less than the sum of used space and +space that may be used (space_info->bytes_may_use), as that means we have +tasks that reserved space and may need to allocate extents from the block +group. In this case, besides skipping the deletion, re-add the block group +to the list of unused block groups so that it may be reconsidered later, +in case the tasks that reserved space end up not needing to allocate +extents from it. + +Allowing the deletion of the block group while we have reserved space, can +result in tasks failing to allocate metadata extents (-ENOSPC) while under +a transaction handle, resulting in a transaction abort, or failure during +writeback for the case of data extents. 
+ +CC: stable@vger.kernel.org # 6.0+ +Reviewed-by: Johannes Thumshirn +Reviewed-by: Josef Bacik +Reviewed-by: Boris Burkov +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/block-group.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 46 insertions(+) + +--- a/fs/btrfs/block-group.c ++++ b/fs/btrfs/block-group.c +@@ -1318,6 +1318,7 @@ out: + */ + void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) + { ++ LIST_HEAD(retry_list); + struct btrfs_block_group *block_group; + struct btrfs_space_info *space_info; + struct btrfs_trans_handle *trans; +@@ -1339,6 +1340,7 @@ void btrfs_delete_unused_bgs(struct btrf + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { ++ u64 used; + int trimming; + + block_group = list_first_entry(&fs_info->unused_bgs, +@@ -1374,6 +1376,7 @@ void btrfs_delete_unused_bgs(struct btrf + goto next; + } + ++ spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + if (btrfs_is_block_group_used(block_group) || block_group->ro || + list_is_singular(&block_group->list)) { +@@ -1385,10 +1388,49 @@ void btrfs_delete_unused_bgs(struct btrf + */ + trace_btrfs_skip_unused_block_group(block_group); + spin_unlock(&block_group->lock); ++ spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + goto next; + } ++ ++ /* ++ * The block group may be unused but there may be space reserved ++ * accounting with the existence of that block group, that is, ++ * space_info->bytes_may_use was incremented by a task but no ++ * space was yet allocated from the block group by the task. 
++ * That space may or may not be allocated, as we are generally ++ * pessimistic about space reservation for metadata as well as ++ * for data when using compression (as we reserve space based on ++ * the worst case, when data can't be compressed, and before ++ * actually attempting compression, before starting writeback). ++ * ++ * So check if the total space of the space_info minus the size ++ * of this block group is less than the used space of the ++ * space_info - if that's the case, then it means we have tasks ++ * that might be relying on the block group in order to allocate ++ * extents, and add back the block group to the unused list when ++ * we finish, so that we retry later in case no tasks ended up ++ * needing to allocate extents from the block group. ++ */ ++ used = btrfs_space_info_used(space_info, true); ++ if (space_info->total_bytes - block_group->length < used) { ++ /* ++ * Add a reference for the list, compensate for the ref ++ * drop under the "next" label for the ++ * fs_info->unused_bgs list. ++ */ ++ btrfs_get_block_group(block_group); ++ list_add_tail(&block_group->bg_list, &retry_list); ++ ++ trace_btrfs_skip_unused_block_group(block_group); ++ spin_unlock(&block_group->lock); ++ spin_unlock(&space_info->lock); ++ up_write(&space_info->groups_sem); ++ goto next; ++ } ++ + spin_unlock(&block_group->lock); ++ spin_unlock(&space_info->lock); + + /* We don't want to force the issue, only flip if it's ok. 
*/ + ret = inc_block_group_ro(block_group, 0); +@@ -1512,12 +1554,16 @@ next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } ++ list_splice_tail(&retry_list, &fs_info->unused_bgs); + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + return; + + flip_async: + btrfs_end_transaction(trans); ++ spin_lock(&fs_info->unused_bgs_lock); ++ list_splice_tail(&retry_list, &fs_info->unused_bgs); ++ spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_put_block_group(block_group); + btrfs_discard_punt_unused_bgs_list(fs_info); diff --git a/queue-6.1/btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch b/queue-6.1/btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch new file mode 100644 index 00000000000..4efa647cdfd --- /dev/null +++ b/queue-6.1/btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch @@ -0,0 +1,94 @@ +From 5571e41ec6e56e35f34ae9f5b3a335ef510e0ade Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 31 Jan 2024 14:27:25 -0500 +Subject: btrfs: don't drop extent_map for free space inode on write error + +From: Josef Bacik + +commit 5571e41ec6e56e35f34ae9f5b3a335ef510e0ade upstream. + +While running the CI for an unrelated change I hit the following panic +with generic/648 on btrfs_holes_spacecache. + +assertion failed: block_start != EXTENT_MAP_HOLE, in fs/btrfs/extent_io.c:1385 +------------[ cut here ]------------ +kernel BUG at fs/btrfs/extent_io.c:1385! 
+invalid opcode: 0000 [#1] PREEMPT SMP NOPTI +CPU: 1 PID: 2695096 Comm: fsstress Kdump: loaded Tainted: G W 6.8.0-rc2+ #1 +RIP: 0010:__extent_writepage_io.constprop.0+0x4c1/0x5c0 +Call Trace: + + extent_write_cache_pages+0x2ac/0x8f0 + extent_writepages+0x87/0x110 + do_writepages+0xd5/0x1f0 + filemap_fdatawrite_wbc+0x63/0x90 + __filemap_fdatawrite_range+0x5c/0x80 + btrfs_fdatawrite_range+0x1f/0x50 + btrfs_write_out_cache+0x507/0x560 + btrfs_write_dirty_block_groups+0x32a/0x420 + commit_cowonly_roots+0x21b/0x290 + btrfs_commit_transaction+0x813/0x1360 + btrfs_sync_file+0x51a/0x640 + __x64_sys_fdatasync+0x52/0x90 + do_syscall_64+0x9c/0x190 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +This happens because we fail to write out the free space cache in one +instance, come back around and attempt to write it again. However on +the second pass through we go to call btrfs_get_extent() on the inode to +get the extent mapping. Because this is a new block group, and with the +free space inode we always search the commit root to avoid deadlocking +with the tree, we find nothing and return a EXTENT_MAP_HOLE for the +requested range. + +This happens because the first time we try to write the space cache out +we hit an error, and on an error we drop the extent mapping. This is +normal for normal files, but the free space cache inode is special. We +always expect the extent map to be correct. Thus the second time +through we end up with a bogus extent map. + +Since we're deprecating this feature, the most straightforward way to +fix this is to simply skip dropping the extent map range for this failed +range. + +I shortened the test by using error injection to stress the area to make +it easier to reproduce. With this patch in place we no longer panic +with my error injection test. 
+ +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 19 +++++++++++++++++-- + 1 file changed, 17 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -3364,8 +3364,23 @@ out: + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); + +- /* Drop extent maps for the part of the extent we didn't write. */ +- btrfs_drop_extent_map_range(inode, unwritten_start, end, false); ++ /* ++ * Drop extent maps for the part of the extent we didn't write. ++ * ++ * We have an exception here for the free_space_inode, this is ++ * because when we do btrfs_get_extent() on the free space inode ++ * we will search the commit root. If this is a new block group ++ * we won't find anything, and we will trip over the assert in ++ * writepage where we do ASSERT(em->block_start != ++ * EXTENT_MAP_HOLE). ++ * ++ * Theoretically we could also skip this for any NOCOW extent as ++ * we don't mess with the extent map tree in the NOCOW case, but ++ * for now simply skip this if we are the free space inode. 
++ */ ++ if (!btrfs_is_free_space_inode(inode)) ++ btrfs_drop_extent_map_range(inode, unwritten_start, ++ end, false); + + /* + * If the ordered extent had an IOERR or something else went diff --git a/queue-6.1/btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch b/queue-6.1/btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch new file mode 100644 index 00000000000..15a83f4b8df --- /dev/null +++ b/queue-6.1/btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch @@ -0,0 +1,109 @@ +From feefe1f49d26bad9d8997096e3a200280fa7b1c5 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 31 Jan 2024 17:18:04 +0000 +Subject: btrfs: don't reserve space for checksums when writing to nocow files + +From: Filipe Manana + +commit feefe1f49d26bad9d8997096e3a200280fa7b1c5 upstream. + +Currently when doing a write to a file we always reserve metadata space +for inserting data checksums. However we don't need to do it if we have +a nodatacow file (-o nodatacow mount option or chattr +C) or if checksums +are disabled (-o nodatasum mount option), as in that case we are only +adding unnecessary pressure to metadata reservations. + +For example on x86_64, with the default node size of 16K, a 4K buffered +write into a nodatacow file is reserving 655360 bytes of metadata space, +as it's accounting for checksums. After this change, which stops reserving +space for checksums if we have a nodatacow file or checksums are disabled, +we only need to reserve 393216 bytes of metadata. 
+ +CC: stable@vger.kernel.org # 6.1+ +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/delalloc-space.c | 29 +++++++++++++++++++---------- + 1 file changed, 19 insertions(+), 10 deletions(-) + +--- a/fs/btrfs/delalloc-space.c ++++ b/fs/btrfs/delalloc-space.c +@@ -243,7 +243,6 @@ static void btrfs_calculate_inode_block_ + struct btrfs_block_rsv *block_rsv = &inode->block_rsv; + u64 reserve_size = 0; + u64 qgroup_rsv_size = 0; +- u64 csum_leaves; + unsigned outstanding_extents; + + lockdep_assert_held(&inode->lock); +@@ -258,10 +257,12 @@ static void btrfs_calculate_inode_block_ + outstanding_extents); + reserve_size += btrfs_calc_metadata_size(fs_info, 1); + } +- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, +- inode->csum_bytes); +- reserve_size += btrfs_calc_insert_metadata_size(fs_info, +- csum_leaves); ++ if (!(inode->flags & BTRFS_INODE_NODATASUM)) { ++ u64 csum_leaves; ++ ++ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes); ++ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves); ++ } + /* + * For qgroup rsv, the calculation is very simple: + * account one nodesize for each outstanding extent +@@ -276,14 +277,20 @@ static void btrfs_calculate_inode_block_ + spin_unlock(&block_rsv->lock); + } + +-static void calc_inode_reservations(struct btrfs_fs_info *fs_info, ++static void calc_inode_reservations(struct btrfs_inode *inode, + u64 num_bytes, u64 disk_num_bytes, + u64 *meta_reserve, u64 *qgroup_reserve) + { ++ struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 nr_extents = count_max_extents(fs_info, num_bytes); +- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); ++ u64 csum_leaves; + u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); + ++ if (inode->flags & BTRFS_INODE_NODATASUM) ++ csum_leaves = 0; ++ else ++ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); ++ + *meta_reserve = 
btrfs_calc_insert_metadata_size(fs_info, + nr_extents + csum_leaves); + +@@ -335,7 +342,7 @@ int btrfs_delalloc_reserve_metadata(stru + * everything out and try again, which is bad. This way we just + * over-reserve slightly, and clean up the mess when we are done. + */ +- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, ++ calc_inode_reservations(inode, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); +@@ -356,7 +363,8 @@ int btrfs_delalloc_reserve_metadata(stru + spin_lock(&inode->lock); + nr_extents = count_max_extents(fs_info, num_bytes); + btrfs_mod_outstanding_extents(inode, nr_extents); +- inode->csum_bytes += disk_num_bytes; ++ if (!(inode->flags & BTRFS_INODE_NODATASUM)) ++ inode->csum_bytes += disk_num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + +@@ -390,7 +398,8 @@ void btrfs_delalloc_release_metadata(str + + num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + spin_lock(&inode->lock); +- inode->csum_bytes -= num_bytes; ++ if (!(inode->flags & BTRFS_INODE_NODATASUM)) ++ inode->csum_bytes -= num_bytes; + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + diff --git a/queue-6.1/btrfs-forbid-creating-subvol-qgroups.patch b/queue-6.1/btrfs-forbid-creating-subvol-qgroups.patch new file mode 100644 index 00000000000..85568be10c1 --- /dev/null +++ b/queue-6.1/btrfs-forbid-creating-subvol-qgroups.patch @@ -0,0 +1,40 @@ +From 0c309d66dacddf8ce939b891d9ead4a8e21ad6f0 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Wed, 10 Jan 2024 17:51:26 -0800 +Subject: btrfs: forbid creating subvol qgroups + +From: Boris Burkov + +commit 0c309d66dacddf8ce939b891d9ead4a8e21ad6f0 upstream. + +Creating a qgroup 0/subvolid leads to various races and it isn't +helpful, because you can't specify a subvol id when creating a subvol, +so you can't be sure it will be the right one. 
Any requirements on the +automatic subvol can be gratified by using a higher level qgroup and the +inheritance parameters of subvol creation. + +Fixes: cecbb533b5fc ("btrfs: record simple quota deltas in delayed refs") +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Qu Wenruo +Signed-off-by: Boris Burkov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ioctl.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/fs/btrfs/ioctl.c ++++ b/fs/btrfs/ioctl.c +@@ -4695,6 +4695,11 @@ static long btrfs_ioctl_qgroup_create(st + goto out; + } + ++ if (sa->create && is_fstree(sa->qgroupid)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); diff --git a/queue-6.1/btrfs-forbid-deleting-live-subvol-qgroup.patch b/queue-6.1/btrfs-forbid-deleting-live-subvol-qgroup.patch new file mode 100644 index 00000000000..fa8a698c6b4 --- /dev/null +++ b/queue-6.1/btrfs-forbid-deleting-live-subvol-qgroup.patch @@ -0,0 +1,54 @@ +From a8df35619948bd8363d330c20a90c9a7fbff28c0 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Wed, 10 Jan 2024 17:30:00 -0800 +Subject: btrfs: forbid deleting live subvol qgroup + +From: Boris Burkov + +commit a8df35619948bd8363d330c20a90c9a7fbff28c0 upstream. + +If a subvolume still exists, forbid deleting its qgroup 0/subvolid. +This behavior generally leads to incorrect behavior in squotas and +doesn't have a legitimate purpose. 
+ +Fixes: cecbb533b5fc ("btrfs: record simple quota deltas in delayed refs") +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Qu Wenruo +Signed-off-by: Boris Burkov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1635,6 +1635,15 @@ out: + return ret; + } + ++static bool qgroup_has_usage(struct btrfs_qgroup *qgroup) ++{ ++ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 || ++ qgroup->excl > 0 || qgroup->excl_cmpr > 0 || ++ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 || ++ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 || ++ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0); ++} ++ + int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) + { + struct btrfs_fs_info *fs_info = trans->fs_info; +@@ -1654,6 +1663,11 @@ int btrfs_remove_qgroup(struct btrfs_tra + goto out; + } + ++ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) { ++ ret = -EBUSY; ++ goto out; ++ } ++ + /* Check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { + ret = -EBUSY; diff --git a/queue-6.1/btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch b/queue-6.1/btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch new file mode 100644 index 00000000000..46f07be7d29 --- /dev/null +++ b/queue-6.1/btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch @@ -0,0 +1,55 @@ +From 1bd96c92c6a0a4d43815eb685c15aa4b78879dc9 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 2 Feb 2024 12:09:22 +0000 +Subject: btrfs: reject encoded write if inode has nodatasum flag set + +From: Filipe Manana + +commit 1bd96c92c6a0a4d43815eb685c15aa4b78879dc9 upstream. 
+ +Currently we allow an encoded write against inodes that have the NODATASUM +flag set, either because they are NOCOW files or they were created while +the filesystem was mounted with "-o nodatasum". This results in having +compressed extents without corresponding checksums, which is a filesystem +inconsistency reported by 'btrfs check'. + +For example, running btrfs/281 with MOUNT_OPTIONS="-o nodatacow" triggers +this and 'btrfs check' errors out with: + + [1/7] checking root items + [2/7] checking extents + [3/7] checking free space tree + [4/7] checking fs roots + root 256 inode 257 errors 1040, bad file extent, some csum missing + root 256 inode 258 errors 1040, bad file extent, some csum missing + ERROR: errors found in fs roots + (...) + +So reject encoded writes if the target inode has NODATASUM set. + +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/inode.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -10774,6 +10774,13 @@ ssize_t btrfs_do_encoded_write(struct ki + if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) + return -EINVAL; + ++ /* ++ * Compressed extents should always have checksums, so error out if we ++ * have a NOCOW file or inode was created while mounted with NODATASUM. ++ */ ++ if (inode->flags & BTRFS_INODE_NODATASUM) ++ return -EINVAL; ++ + orig_count = iov_iter_count(from); + + /* The extent size must be sane. 
*/ diff --git a/queue-6.1/btrfs-send-return-eopnotsupp-on-unknown-flags.patch b/queue-6.1/btrfs-send-return-eopnotsupp-on-unknown-flags.patch new file mode 100644 index 00000000000..b5a801bd364 --- /dev/null +++ b/queue-6.1/btrfs-send-return-eopnotsupp-on-unknown-flags.patch @@ -0,0 +1,35 @@ +From f884a9f9e59206a2d41f265e7e403f080d10b493 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Wed, 10 Jan 2024 17:48:44 +0100 +Subject: btrfs: send: return EOPNOTSUPP on unknown flags + +From: David Sterba + +commit f884a9f9e59206a2d41f265e7e403f080d10b493 upstream. + +When some ioctl flags are checked we return EOPNOTSUPP, like for +BTRFS_SCRUB_SUPPORTED_FLAGS, BTRFS_SUBVOL_CREATE_ARGS_MASK or fallocate +modes. The EINVAL is supposed to be for a supported but invalid +values or combination of options. Fix that when checking send flags so +it's consistent with the rest. + +CC: stable@vger.kernel.org # 4.14+ +Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5rryOLzp3EKq8RTbjMHMHeaJubfpsVLF6H4qJnKCUR1w@mail.gmail.com/ +Reviewed-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/send.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/send.c ++++ b/fs/btrfs/send.c +@@ -7852,7 +7852,7 @@ long btrfs_ioctl_send(struct inode *inod + } + + if (arg->flags & ~BTRFS_SEND_FLAG_MASK) { +- ret = -EINVAL; ++ ret = -EOPNOTSUPP; + goto out; + } + diff --git a/queue-6.1/driver-core-fix-device_link_flag_is_sync_state_only.patch b/queue-6.1/driver-core-fix-device_link_flag_is_sync_state_only.patch new file mode 100644 index 00000000000..311cd8b1a82 --- /dev/null +++ b/queue-6.1/driver-core-fix-device_link_flag_is_sync_state_only.patch @@ -0,0 +1,43 @@ +From 7fddac12c38237252431d5b8af7b6d5771b6d125 Mon Sep 17 00:00:00 2001 +From: Saravana Kannan +Date: Fri, 2 Feb 2024 01:56:33 -0800 +Subject: driver core: Fix device_link_flag_is_sync_state_only() + +From: Saravana Kannan + +commit 7fddac12c38237252431d5b8af7b6d5771b6d125 
upstream. + +device_link_flag_is_sync_state_only() correctly returns true on the flags +of an existing device link that only implements sync_state() functionality. +However, it incorrectly and confusingly returns false if it's called with +DL_FLAG_SYNC_STATE_ONLY. + +This bug doesn't manifest in any of the existing calls to this function, +but fix this confusing behavior to avoid future bugs. + +Fixes: 67cad5c67019 ("driver core: fw_devlink: Add DL_FLAG_CYCLE support to device links") +Signed-off-by: Saravana Kannan +Tested-by: Xu Yang +Link: https://lore.kernel.org/r/20240202095636.868578-2-saravanak@google.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/base/core.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/drivers/base/core.c ++++ b/drivers/base/core.c +@@ -337,10 +337,12 @@ static bool device_is_ancestor(struct de + return false; + } + ++#define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \ ++ DL_FLAG_CYCLE | \ ++ DL_FLAG_MANAGED) + static inline bool device_link_flag_is_sync_state_only(u32 flags) + { +- return (flags & ~(DL_FLAG_INFERRED | DL_FLAG_CYCLE)) == +- (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED); ++ return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY; + } + + /** diff --git a/queue-6.1/series b/queue-6.1/series index b8ea1fcc56e..6250b58b9c3 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -1,2 +1,12 @@ work-around-gcc-bugs-with-asm-goto-with-outputs.patch update-workarounds-for-gcc-asm-goto-issue.patch +btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch +btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch +btrfs-forbid-creating-subvol-qgroups.patch +btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch +btrfs-forbid-deleting-live-subvol-qgroup.patch +btrfs-send-return-eopnotsupp-on-unknown-flags.patch +btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch +btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch 
+btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch +driver-core-fix-device_link_flag_is_sync_state_only.patch -- 2.47.3