--- /dev/null
+From 1693d5442c458ae8d5b0d58463b873cd879569ed Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 25 Jan 2024 09:53:06 +0000
+Subject: btrfs: add and use helper to check if block group is used
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1693d5442c458ae8d5b0d58463b873cd879569ed upstream.
+
+Add a helper function to determine if a block group is being used and
+use it in btrfs_delete_unused_bgs(). This helper will also be used in
+future code changes.
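+
+As a rough sketch of the intended call pattern (the helper asserts that
+the block group's spinlock is held, matching how it is used in this
+series):
+
+  spin_lock(&block_group->lock);
+  used = btrfs_is_block_group_used(block_group);
+  spin_unlock(&block_group->lock);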
+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 3 +--
+ fs/btrfs/block-group.h | 7 +++++++
+ 2 files changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1524,8 +1524,7 @@ void btrfs_delete_unused_bgs(struct btrf
+ }
+
+ spin_lock(&block_group->lock);
+- if (block_group->reserved || block_group->pinned ||
+- block_group->used || block_group->ro ||
++ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
+ list_is_singular(&block_group->list)) {
+ /*
+ * We want to bail if we made new allocations or have
+--- a/fs/btrfs/block-group.h
++++ b/fs/btrfs/block-group.h
+@@ -255,6 +255,13 @@ static inline u64 btrfs_block_group_end(
+ return (block_group->start + block_group->length);
+ }
+
++static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
++{
++ lockdep_assert_held(&bg->lock);
++
++ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
++}
++
+ static inline bool btrfs_is_block_group_data_only(
+ struct btrfs_block_group *block_group)
+ {
--- /dev/null
+From 12c5128f101bfa47a08e4c0e1a75cfa2d0872bcd Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 25 Jan 2024 09:53:19 +0000
+Subject: btrfs: add new unused block groups to the list of unused block groups
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 12c5128f101bfa47a08e4c0e1a75cfa2d0872bcd upstream.
+
+Space reservations for metadata are, most of the time, pessimistic, as
+we reserve space for the worst possible cases: trees at the maximum
+possible height (8), needing to COW every extent buffer in a tree path,
+needing to split extent buffers, etc.
+
+For data, we generally reserve the exact amount of space we are going to
+allocate. The exception here is when using compression, in which case we
+reserve space matching the uncompressed size, as the compression only
+happens at writeback time and in the worst possible case we need that
+amount of space in case the data is not compressible.
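+
+(Illustrative example with made-up sizes: a buffered write of 128K with
+compression enabled reserves the full 128K of data space, even if the
+data later compresses down to a single 4K sector at writeback time.)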
+
+This means that when there's no available space in the corresponding
+space_info object, we may need to allocate a new block group, and then
+that block group might not be used after all. In this case the block
+group is never added to the list of unused block groups and ends up
+never being deleted - except if we unmount and mount the fs again, as
+when reading block groups from disk we add unused ones to the list of
+unused block groups (fs_info->unused_bgs). Otherwise a block group is
+only added to the list of unused block groups when we deallocate the
+last extent from it, so if no extent is ever allocated, the block group
+is kept around forever.
+
+This also means that if we have a bunch of tasks reserving space in
+parallel we can end up allocating many block groups that are never
+used, or that are kept around unused for too long, which can result in
+ENOSPC failures if, for example, we over-allocate metadata block groups
+and then end up in a state without enough unallocated space to allocate
+a new data block group.
+
+This is more likely to happen with metadata reservations as of kernel
+6.7, namely since commit 28270e25c69a ("btrfs: always reserve space for
+delayed refs when starting transaction"), because we started to always
+reserve space for delayed references when starting a transaction handle
+for a non-zero number of items, and also to try to reserve space to fill
+the gap between the delayed refs block reserve's reserved space and its
+size.
+
+So to avoid this, when finishing the creation of a new block group, add the
+block group to the list of unused block groups if it's still unused at
+that time. This way the next time the cleaner kthread runs, it will delete
+the block group if it's still unused and not needed to satisfy existing
+space reservations.
+
+Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
+Link: https://lore.kernel.org/linux-btrfs/9cdbf0ca9cdda1b4c84e15e548af7d7f9f926382.camel@intelfx.name/
+CC: stable@vger.kernel.org # 6.7+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 31 +++++++++++++++++++++++++++++++
+ 1 file changed, 31 insertions(+)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -2757,6 +2757,37 @@ next:
+ btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
+ list_del_init(&block_group->bg_list);
+ clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
++
++ /*
++ * If the block group is still unused, add it to the list of
++ * unused block groups. The block group may have been created in
++ * order to satisfy a space reservation, in which case the
++ * extent allocation only happens later. But often we don't
++ * actually need to allocate space that we previously reserved,
++ * so the block group may become unused for a long time. For
++ * example for metadata we generally reserve space for a worst
++ * possible scenario, but then don't end up allocating all that
++ * space or none at all (due to no need to COW, extent buffers
++ * were already COWed in the current transaction and still
++ * unwritten, tree heights lower than the maximum possible
++ * height, etc). For data we generally reserve the exact amount
++ * of space we are going to allocate later, the exception is
++ * when using compression, as we must reserve space based on the
++ * uncompressed data size, because the compression is only done
++ * when writeback is triggered and we don't know how much space
++ * we are actually going to need, so we reserve the uncompressed
++ * size because the data may not be compressible in the worst case.
++ */
++ if (ret == 0) {
++ bool used;
++
++ spin_lock(&block_group->lock);
++ used = btrfs_is_block_group_used(block_group);
++ spin_unlock(&block_group->lock);
++
++ if (!used)
++ btrfs_mark_bg_unused(block_group);
++ }
+ }
+ btrfs_trans_release_chunk_metadata(trans);
+ }
--- /dev/null
+From e03ee2fe873eb68c1f9ba5112fee70303ebf9dfb Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Sat, 20 Jan 2024 19:41:28 +1030
+Subject: btrfs: do not ASSERT() if the newly created subvolume already got read
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit e03ee2fe873eb68c1f9ba5112fee70303ebf9dfb upstream.
+
+[BUG]
+There is a syzbot crash, triggered by the ASSERT() during subvolume
+creation:
+
+ assertion failed: !anon_dev, in fs/btrfs/disk-io.c:1319
+ ------------[ cut here ]------------
+ kernel BUG at fs/btrfs/disk-io.c:1319!
+ invalid opcode: 0000 [#1] PREEMPT SMP KASAN
+ RIP: 0010:btrfs_get_root_ref.part.0+0x9aa/0xa60
+ <TASK>
+ btrfs_get_new_fs_root+0xd3/0xf0
+ create_subvol+0xd02/0x1650
+ btrfs_mksubvol+0xe95/0x12b0
+ __btrfs_ioctl_snap_create+0x2f9/0x4f0
+ btrfs_ioctl_snap_create+0x16b/0x200
+ btrfs_ioctl+0x35f0/0x5cf0
+ __x64_sys_ioctl+0x19d/0x210
+ do_syscall_64+0x3f/0xe0
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+ ---[ end trace 0000000000000000 ]---
+
+[CAUSE]
+During create_subvol(), after inserting root item for the newly created
+subvolume, we would trigger btrfs_get_new_fs_root() to get the
+btrfs_root of that subvolume.
+
+The idea here is that we have preallocated an anonymous device number
+for the subvolume, thus we can assign it to the new subvolume.
+
+But there is really nothing preventing things like backref walk from
+reading the new subvolume. If that happens before we call
+btrfs_get_new_fs_root(), the subvolume would already be read out, with
+a new anonymous device number assigned.
+
+In that case we trigger the ASSERT(), as we really expect no one to
+read out that subvolume (it is not yet accessible from the fs), yet
+things like backref walk can still trigger the read.
+
+Thus our assumption behind the ASSERT() is not correct in the first
+place.
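+
+A possible interleaving that triggers this (illustrative):
+
+  Task A (create_subvol)          Task B (e.g. backref walk)
+  preallocates anon_dev
+  inserts the root item
+                                  reads the new root, assigning
+                                  it a new anon_dev
+  btrfs_get_new_fs_root()
+    finds the cached root
+    ASSERT(!anon_dev) fires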
+
+[FIX]
+Fix it by removing the ASSERT(), and instead just free the @anon_dev,
+reset it to 0, and continue.
+
+If the subvolume tree was read out by something else, it will already
+have a new anon_dev assigned, thus we only need to free the
+preallocated one.
+
+Reported-by: Chenyuan Yang <chenyuan0y@gmail.com>
+Fixes: 2dfb1e43f57d ("btrfs: preallocate anon block device at first phase of snapshot creation")
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/disk-io.c | 13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -1315,8 +1315,17 @@ static struct btrfs_root *btrfs_get_root
+ again:
+ root = btrfs_lookup_fs_root(fs_info, objectid);
+ if (root) {
+- /* Shouldn't get preallocated anon_dev for cached roots */
+- ASSERT(!anon_dev);
++ /*
++ * Some other caller may have read out the newly inserted
++ * subvolume already (for things like backref walk etc). Not
++ * that common but still possible. In that case, we just need
++ * to free the anon_dev.
++ */
++ if (unlikely(anon_dev)) {
++ free_anon_bdev(anon_dev);
++ anon_dev = 0;
++ }
++
+ if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+ btrfs_put_root(root);
+ return ERR_PTR(-ENOENT);
--- /dev/null
+From f4a9f219411f318ae60d6ff7f129082a75686c6c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 25 Jan 2024 09:53:14 +0000
+Subject: btrfs: do not delete unused block group if it may be used soon
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit f4a9f219411f318ae60d6ff7f129082a75686c6c upstream.
+
+Before deleting a block group that is in the list of unused block groups
+(fs_info->unused_bgs), we check if the block group became used before
+deleting it, as extents from it may have been allocated after it was added
+to the list.
+
+However even if the block group was not yet used, there may be tasks that
+have only reserved space and have not yet allocated extents, and they
+might be relying on the availability of the unused block group in order
+to allocate extents. The reservation works first by increasing the
+"bytes_may_use" field of the corresponding space_info object (which may
+first require flushing delayed items, allocating a new block group, etc),
+and only later a task does the actual allocation of extents.
+
+For metadata we usually don't end up using all reserved space, as we are
+pessimistic and typically account for the worst cases (need to COW every
+single node in a path of a tree at maximum possible height, etc). For
+data we usually reserve the exact amount of space we're going to allocate
+later, except when using compression where we always reserve space based
+on the uncompressed size, as compression is only triggered when writeback
+starts so we don't know in advance how much space we'll actually need, or
+if the data is compressible.
+
+So don't delete an unused block group if the total size of its space_info
+object minus the block group's size is less than the sum of used space and
+space that may be used (space_info->bytes_may_use), as that means we have
+tasks that reserved space and may need to allocate extents from the block
+group. In this case, besides skipping the deletion, re-add the block group
+to the list of unused block groups so that it may be reconsidered later,
+in case the tasks that reserved space end up not needing to allocate
+extents from it.
+
+Allowing the deletion of the block group while we have reserved space can
+result in tasks failing to allocate metadata extents (-ENOSPC) while under
+a transaction handle, resulting in a transaction abort, or failure during
+writeback for the case of data extents.
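+
+As a rough worked example with made-up numbers: if the space_info has a
+total size of 10G, the unused block group is 1G long, and used space
+plus bytes_may_use add up to 9.5G, then 10G - 1G = 9G is less than
+9.5G, so the block group is skipped and re-added to the unused list for
+a later retry.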
+
+CC: stable@vger.kernel.org # 6.0+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Reviewed-by: Boris Burkov <boris@bur.io>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 46 insertions(+)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1467,6 +1467,7 @@ out:
+ */
+ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
+ {
++ LIST_HEAD(retry_list);
+ struct btrfs_block_group *block_group;
+ struct btrfs_space_info *space_info;
+ struct btrfs_trans_handle *trans;
+@@ -1488,6 +1489,7 @@ void btrfs_delete_unused_bgs(struct btrf
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->unused_bgs)) {
++ u64 used;
+ int trimming;
+
+ block_group = list_first_entry(&fs_info->unused_bgs,
+@@ -1523,6 +1525,7 @@ void btrfs_delete_unused_bgs(struct btrf
+ goto next;
+ }
+
++ spin_lock(&space_info->lock);
+ spin_lock(&block_group->lock);
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
+ list_is_singular(&block_group->list)) {
+@@ -1534,10 +1537,49 @@ void btrfs_delete_unused_bgs(struct btrf
+ */
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
++ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
++
++ /*
++ * The block group may be unused but there may be space reservations
++ * relying on the existence of that block group, that is,
++ * space_info->bytes_may_use was incremented by a task but no
++ * space was yet allocated from the block group by the task.
++ * That space may or may not be allocated, as we are generally
++ * pessimistic about space reservation for metadata as well as
++ * for data when using compression (as we reserve space based on
++ * the worst case, when data can't be compressed, and before
++ * actually attempting compression, before starting writeback).
++ *
++ * So check if the total space of the space_info minus the size
++ * of this block group is less than the used space of the
++ * space_info - if that's the case, then it means we have tasks
++ * that might be relying on the block group in order to allocate
++ * extents, and add back the block group to the unused list when
++ * we finish, so that we retry later in case no tasks ended up
++ * needing to allocate extents from the block group.
++ */
++ used = btrfs_space_info_used(space_info, true);
++ if (space_info->total_bytes - block_group->length < used) {
++ /*
++ * Add a reference for the list, compensate for the ref
++ * drop under the "next" label for the
++ * fs_info->unused_bgs list.
++ */
++ btrfs_get_block_group(block_group);
++ list_add_tail(&block_group->bg_list, &retry_list);
++
++ trace_btrfs_skip_unused_block_group(block_group);
++ spin_unlock(&block_group->lock);
++ spin_unlock(&space_info->lock);
++ up_write(&space_info->groups_sem);
++ goto next;
++ }
++
+ spin_unlock(&block_group->lock);
++ spin_unlock(&space_info->lock);
+
+ /* We don't want to force the issue, only flip if it's ok. */
+ ret = inc_block_group_ro(block_group, 0);
+@@ -1661,12 +1703,16 @@ next:
+ btrfs_put_block_group(block_group);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
++ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ return;
+
+ flip_async:
+ btrfs_end_transaction(trans);
++ spin_lock(&fs_info->unused_bgs_lock);
++ list_splice_tail(&retry_list, &fs_info->unused_bgs);
++ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_put_block_group(block_group);
+ btrfs_discard_punt_unused_bgs_list(fs_info);
--- /dev/null
+From 5571e41ec6e56e35f34ae9f5b3a335ef510e0ade Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 31 Jan 2024 14:27:25 -0500
+Subject: btrfs: don't drop extent_map for free space inode on write error
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit 5571e41ec6e56e35f34ae9f5b3a335ef510e0ade upstream.
+
+While running the CI for an unrelated change I hit the following panic
+with generic/648 on btrfs_holes_spacecache.
+
+assertion failed: block_start != EXTENT_MAP_HOLE, in fs/btrfs/extent_io.c:1385
+------------[ cut here ]------------
+kernel BUG at fs/btrfs/extent_io.c:1385!
+invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
+CPU: 1 PID: 2695096 Comm: fsstress Kdump: loaded Tainted: G W 6.8.0-rc2+ #1
+RIP: 0010:__extent_writepage_io.constprop.0+0x4c1/0x5c0
+Call Trace:
+ <TASK>
+ extent_write_cache_pages+0x2ac/0x8f0
+ extent_writepages+0x87/0x110
+ do_writepages+0xd5/0x1f0
+ filemap_fdatawrite_wbc+0x63/0x90
+ __filemap_fdatawrite_range+0x5c/0x80
+ btrfs_fdatawrite_range+0x1f/0x50
+ btrfs_write_out_cache+0x507/0x560
+ btrfs_write_dirty_block_groups+0x32a/0x420
+ commit_cowonly_roots+0x21b/0x290
+ btrfs_commit_transaction+0x813/0x1360
+ btrfs_sync_file+0x51a/0x640
+ __x64_sys_fdatasync+0x52/0x90
+ do_syscall_64+0x9c/0x190
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+This happens because we fail to write out the free space cache in one
+instance, come back around and attempt to write it again. However on
+the second pass through we call btrfs_get_extent() on the inode to get
+the extent mapping. Because this is a new block group, and with the
+free space inode we always search the commit root to avoid deadlocking
+with the tree, we find nothing and return an EXTENT_MAP_HOLE for the
+requested range.
+
+This happens because the first time we try to write the space cache out
+we hit an error, and on an error we drop the extent mapping. This is
+normal for normal files, but the free space cache inode is special. We
+always expect the extent map to be correct. Thus the second time
+through we end up with a bogus extent map.
+
+Since we're deprecating this feature, the most straightforward way to
+fix this is to simply skip dropping the extent map range for this failed
+range.
+
+I shortened the test by using error injection to stress the area to make
+it easier to reproduce. With this patch in place we no longer panic
+with my error injection test.
+
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 19 +++++++++++++++++--
+ 1 file changed, 17 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -3175,8 +3175,23 @@ out:
+ unwritten_start += logical_len;
+ clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
+
+- /* Drop extent maps for the part of the extent we didn't write. */
+- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
++ /*
++ * Drop extent maps for the part of the extent we didn't write.
++ *
++ * We have an exception here for the free_space_inode, this is
++ * because when we do btrfs_get_extent() on the free space inode
++ * we will search the commit root. If this is a new block group
++ * we won't find anything, and we will trip over the assert in
++ * writepage where we do ASSERT(em->block_start !=
++ * EXTENT_MAP_HOLE).
++ *
++ * Theoretically we could also skip this for any NOCOW extent as
++ * we don't mess with the extent map tree in the NOCOW case, but
++ * for now simply skip this if we are the free space inode.
++ */
++ if (!btrfs_is_free_space_inode(inode))
++ btrfs_drop_extent_map_range(inode, unwritten_start,
++ end, false);
+
+ /*
+ * If the ordered extent had an IOERR or something else went
--- /dev/null
+From 2f6397e448e689adf57e6788c90f913abd7e1af8 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 2 Feb 2024 14:32:17 +0000
+Subject: btrfs: don't refill whole delayed refs block reserve when starting transaction
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 2f6397e448e689adf57e6788c90f913abd7e1af8 upstream.
+
+Since commit 28270e25c69a ("btrfs: always reserve space for delayed refs
+when starting transaction") we started not only to reserve metadata space
+for the delayed refs a caller of btrfs_start_transaction() might generate
+but also to try to fully refill the delayed refs block reserve, because
+there are several case where we generate delayed refs and haven't reserved
+space for them, relying on the global block reserve. Relying too much on
+the global block reserve is not always safe, and can result in hitting
+-ENOSPC during transaction commits or worse, in rare cases, being unable
+to mount a filesystem that needs to do orphan cleanup or anything that
+requires modifying the filesystem during mount, and has no more
+unallocated space and the metadata space is nearly full. This was
+explained in detail in that commit's change log.
+
+However the gap between the reserved amount and the size of the delayed
+refs block reserve can be huge, so attempting to reserve space for such
+a gap can result in allocating many metadata block groups that end up
+not being used. After a recent patch, with the subject:
+
+ "btrfs: add new unused block groups to the list of unused block groups"
+
+We started to add new block groups that are unused to the list of unused
+block groups, to avoid having them around for a very long time in case
+they are never used, because a block group is only added to the list of
+unused block groups when we deallocate the last extent or when mounting
+the filesystem and the block group has 0 bytes used. This is not a problem
+introduced by the commit mentioned earlier; it always existed, as our
+metadata space reservations are, most of the time, pessimistic and end up
+not using all the space they reserved, so we can occasionally end up with
+one or two unused metadata block groups for a long period. However after
+that commit mentioned earlier, we are just more pessimistic in the
+metadata space reservations when starting a transaction and therefore the
+issue is more likely to happen.
+
+This however is not always enough because we might create unused metadata
+block groups when reserving metadata space at a high rate if there's
+always a gap in the delayed refs block reserve and the cleaner kthread
+isn't triggered often enough or is busy with other work (running delayed
+iputs, cleaning deleted roots, etc), not to mention the block group's
+allocated space is only usable for a new block group after the transaction
+used to remove it is committed.
+
+A user reported that he's getting a lot of allocated metadata block groups
+but the usage percentage of metadata space was very low compared to the
+total allocated space, especially after running a series of block group
+relocations.
+
+So for now stop trying to refill the gap in the delayed refs block reserve
+and reserve space only for the delayed refs we are expected to generate
+when starting a transaction.
+
+CC: stable@vger.kernel.org # 6.7+
+Reported-by: Ivan Shapovalov <intelfx@intelfx.name>
+Link: https://lore.kernel.org/linux-btrfs/9cdbf0ca9cdda1b4c84e15e548af7d7f9f926382.camel@intelfx.name/
+Link: https://lore.kernel.org/linux-btrfs/CAL3q7H6802ayLHUJFztzZAVzBLJAGdFx=6FHNNy87+obZXXZpQ@mail.gmail.com/
+Tested-by: Ivan Shapovalov <intelfx@intelfx.name>
+Reported-by: Heddxh <g311571057@gmail.com>
+Link: https://lore.kernel.org/linux-btrfs/CAE93xANEby6RezOD=zcofENYZOT-wpYygJyauyUAZkLv6XVFOA@mail.gmail.com/
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/transaction.c | 38 ++------------------------------------
+ 1 file changed, 2 insertions(+), 36 deletions(-)
+
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(
+ u64 num_bytes,
+ u64 *delayed_refs_bytes)
+ {
+- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
+- u64 extra_delayed_refs_bytes = 0;
+- u64 bytes;
++ u64 bytes = num_bytes + *delayed_refs_bytes;
+ int ret;
+
+ /*
+- * If there's a gap between the size of the delayed refs reserve and
+- * its reserved space, than some tasks have added delayed refs or bumped
+- * its size otherwise (due to block group creation or removal, or block
+- * group item update). Also try to allocate that gap in order to prevent
+- * using (and possibly abusing) the global reserve when committing the
+- * transaction.
+- */
+- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+- !btrfs_block_rsv_full(delayed_refs_rsv)) {
+- spin_lock(&delayed_refs_rsv->lock);
+- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
+- extra_delayed_refs_bytes = delayed_refs_rsv->size -
+- delayed_refs_rsv->reserved;
+- spin_unlock(&delayed_refs_rsv->lock);
+- }
+-
+- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
+-
+- /*
+ * We want to reserve all the bytes we may need all at once, so we only
+ * do 1 enospc flushing cycle per transaction start.
+ */
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+- if (ret == 0) {
+- if (extra_delayed_refs_bytes > 0)
+- btrfs_migrate_to_delayed_refs_rsv(fs_info,
+- extra_delayed_refs_bytes);
+- return 0;
+- }
+-
+- if (extra_delayed_refs_bytes > 0) {
+- bytes -= extra_delayed_refs_bytes;
+- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
+- if (ret == 0)
+- return 0;
+- }
+
+ /*
+ * If we are an emergency flush, which can steal from the global block
+ * reserve, then attempt to not reserve space for the delayed refs, as
+ * we will consume space for them from the global block reserve.
+ */
+- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
++ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ bytes -= *delayed_refs_bytes;
+ *delayed_refs_bytes = 0;
+ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
--- /dev/null
+From feefe1f49d26bad9d8997096e3a200280fa7b1c5 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 31 Jan 2024 17:18:04 +0000
+Subject: btrfs: don't reserve space for checksums when writing to nocow files
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit feefe1f49d26bad9d8997096e3a200280fa7b1c5 upstream.
+
+Currently when doing a write to a file we always reserve metadata space
+for inserting data checksums. However we don't need to do it if we have
+a nodatacow file (-o nodatacow mount option or chattr +C) or if checksums
+are disabled (-o nodatasum mount option), as in that case we are only
+adding unnecessary pressure to metadata reservations.
+
+For example on x86_64, with the default node size of 16K, a 4K buffered
+write into a nodatacow file is reserving 655360 bytes of metadata space,
+as it's accounting for checksums. After this change, which stops reserving
+space for checksums if we have a nodatacow file or checksums are disabled,
+we only need to reserve 393216 bytes of metadata.
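+
+Those numbers can be derived as a sanity check (a rough sketch, assuming
+the usual reservation helpers, where inserting an item costs
+nodesize * 8 (max tree height) * 2 and updating the inode item costs
+nodesize * 8):
+
+  before: 2 insert items (extent + csum leaf): 2 * 16384 * 8 * 2 = 524288
+          plus 1 inode update:                     16384 * 8     = 131072
+          total:                                                   655360
+
+  after:  1 insert item (extent only):             16384 * 8 * 2 = 262144
+          plus 1 inode update:                     16384 * 8     = 131072
+          total:                                                   393216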
+
+CC: stable@vger.kernel.org # 6.1+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/delalloc-space.c | 29 +++++++++++++++++++----------
+ 1 file changed, 19 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/delalloc-space.c
++++ b/fs/btrfs/delalloc-space.c
+@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_
+ struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
+ u64 reserve_size = 0;
+ u64 qgroup_rsv_size = 0;
+- u64 csum_leaves;
+ unsigned outstanding_extents;
+
+ lockdep_assert_held(&inode->lock);
+@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_
+ outstanding_extents);
+ reserve_size += btrfs_calc_metadata_size(fs_info, 1);
+ }
+- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
+- inode->csum_bytes);
+- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
+- csum_leaves);
++ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
++ u64 csum_leaves;
++
++ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
++ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
++ }
+ /*
+ * For qgroup rsv, the calculation is very simple:
+ * account one nodesize for each outstanding extent
+@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_
+ spin_unlock(&block_rsv->lock);
+ }
+
+-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
++static void calc_inode_reservations(struct btrfs_inode *inode,
+ u64 num_bytes, u64 disk_num_bytes,
+ u64 *meta_reserve, u64 *qgroup_reserve)
+ {
++ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ u64 nr_extents = count_max_extents(fs_info, num_bytes);
+- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
++ u64 csum_leaves;
+ u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+
++ if (inode->flags & BTRFS_INODE_NODATASUM)
++ csum_leaves = 0;
++ else
++ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
++
+ *meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
+ nr_extents + csum_leaves);
+
+@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(stru
+ * everything out and try again, which is bad. This way we just
+ * over-reserve slightly, and clean up the mess when we are done.
+ */
+- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
++ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
+ &meta_reserve, &qgroup_reserve);
+ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
+ noflush);
+@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(stru
+ nr_extents = count_max_extents(fs_info, num_bytes);
+ spin_lock(&inode->lock);
+ btrfs_mod_outstanding_extents(inode, nr_extents);
+- inode->csum_bytes += disk_num_bytes;
++ if (!(inode->flags & BTRFS_INODE_NODATASUM))
++ inode->csum_bytes += disk_num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
+@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(str
+
+ num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
+ spin_lock(&inode->lock);
+- inode->csum_bytes -= num_bytes;
++ if (!(inode->flags & BTRFS_INODE_NODATASUM))
++ inode->csum_bytes -= num_bytes;
+ btrfs_calculate_inode_block_rsv_size(fs_info, inode);
+ spin_unlock(&inode->lock);
+
--- /dev/null
+From 0c309d66dacddf8ce939b891d9ead4a8e21ad6f0 Mon Sep 17 00:00:00 2001
+From: Boris Burkov <boris@bur.io>
+Date: Wed, 10 Jan 2024 17:51:26 -0800
+Subject: btrfs: forbid creating subvol qgroups
+
+From: Boris Burkov <boris@bur.io>
+
+commit 0c309d66dacddf8ce939b891d9ead4a8e21ad6f0 upstream.
+
+Creating a qgroup 0/subvolid leads to various races and it isn't
+helpful, because you can't specify a subvol id when creating a subvol,
+so you can't be sure it will be the right one. Any requirements on the
+automatic subvol can be satisfied by using a higher level qgroup and the
+inheritance parameters of subvol creation.
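+
+For context, a level 0 qgroup id is simply the subvolume id (the qgroup
+level lives in the upper 16 bits of the u64 id), so, for example, the
+qgroup "0/257" has qgroupid 257 and is_fstree(257) returns true, which
+makes the check added below reject its creation with -EINVAL.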
+
+Fixes: cecbb533b5fc ("btrfs: record simple quota deltas in delayed refs")
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ioctl.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/fs/btrfs/ioctl.c
++++ b/fs/btrfs/ioctl.c
+@@ -3815,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(st
+ goto out;
+ }
+
++ if (sa->create && is_fstree(sa->qgroupid)) {
++ ret = -EINVAL;
++ goto out;
++ }
++
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
--- /dev/null
+From a8df35619948bd8363d330c20a90c9a7fbff28c0 Mon Sep 17 00:00:00 2001
+From: Boris Burkov <boris@bur.io>
+Date: Wed, 10 Jan 2024 17:30:00 -0800
+Subject: btrfs: forbid deleting live subvol qgroup
+
+From: Boris Burkov <boris@bur.io>
+
+commit a8df35619948bd8363d330c20a90c9a7fbff28c0 upstream.
+
+If a subvolume still exists, forbid deleting its qgroup 0/subvolid.
+Deleting it generally leads to incorrect behavior in squotas and
+doesn't serve a legitimate purpose.
+
+Fixes: cecbb533b5fc ("btrfs: record simple quota deltas in delayed refs")
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Boris Burkov <boris@bur.io>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -1736,6 +1736,15 @@ out:
+ return ret;
+ }
+
++static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
++{
++ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
++ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
++ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
++ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
++ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
++}
++
+ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
+ {
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+@@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_tra
+ goto out;
+ }
+
++ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
++ ret = -EBUSY;
++ goto out;
++ }
++
+ /* Check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
--- /dev/null
+From 1bd96c92c6a0a4d43815eb685c15aa4b78879dc9 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Fri, 2 Feb 2024 12:09:22 +0000
+Subject: btrfs: reject encoded write if inode has nodatasum flag set
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1bd96c92c6a0a4d43815eb685c15aa4b78879dc9 upstream.
+
+Currently we allow an encoded write against inodes that have the NODATASUM
+flag set, either because they are NOCOW files or they were created while
+the filesystem was mounted with "-o nodatasum". This results in having
+compressed extents without corresponding checksums, which is a filesystem
+inconsistency reported by 'btrfs check'.
+
+For example, running btrfs/281 with MOUNT_OPTIONS="-o nodatacow" triggers
+this and 'btrfs check' errors out with:
+
+ [1/7] checking root items
+ [2/7] checking extents
+ [3/7] checking free space tree
+ [4/7] checking fs roots
+ root 256 inode 257 errors 1040, bad file extent, some csum missing
+ root 256 inode 258 errors 1040, bad file extent, some csum missing
+ ERROR: errors found in fs roots
+ (...)
+
+So reject encoded writes if the target inode has NODATASUM set.
+
+CC: stable@vger.kernel.org # 6.1+
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/inode.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/fs/btrfs/inode.c
++++ b/fs/btrfs/inode.c
+@@ -10233,6 +10233,13 @@ ssize_t btrfs_do_encoded_write(struct ki
+ if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
+ return -EINVAL;
+
++ /*
++ * Compressed extents should always have checksums, so error out if we
++ * have a NOCOW file or the inode was created while mounted with NODATASUM.
++ */
++ if (inode->flags & BTRFS_INODE_NODATASUM)
++ return -EINVAL;
++
+ orig_count = iov_iter_count(from);
+
+ /* The extent size must be sane. */
--- /dev/null
+From f884a9f9e59206a2d41f265e7e403f080d10b493 Mon Sep 17 00:00:00 2001
+From: David Sterba <dsterba@suse.com>
+Date: Wed, 10 Jan 2024 17:48:44 +0100
+Subject: btrfs: send: return EOPNOTSUPP on unknown flags
+
+From: David Sterba <dsterba@suse.com>
+
+commit f884a9f9e59206a2d41f265e7e403f080d10b493 upstream.
+
+When some ioctl flags are checked we return EOPNOTSUPP, like for
+BTRFS_SCRUB_SUPPORTED_FLAGS, BTRFS_SUBVOL_CREATE_ARGS_MASK or fallocate
+modes. EINVAL is supposed to be for supported but invalid values or
+combinations of options. Fix that when checking send flags so it's
+consistent with the rest.
+
+CC: stable@vger.kernel.org # 4.14+
+Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5rryOLzp3EKq8RTbjMHMHeaJubfpsVLF6H4qJnKCUR1w@mail.gmail.com/
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/send.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -8111,7 +8111,7 @@ long btrfs_ioctl_send(struct inode *inod
+ }
+
+ if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
+- ret = -EINVAL;
++ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
--- /dev/null
+From 7fddac12c38237252431d5b8af7b6d5771b6d125 Mon Sep 17 00:00:00 2001
+From: Saravana Kannan <saravanak@google.com>
+Date: Fri, 2 Feb 2024 01:56:33 -0800
+Subject: driver core: Fix device_link_flag_is_sync_state_only()
+
+From: Saravana Kannan <saravanak@google.com>
+
+commit 7fddac12c38237252431d5b8af7b6d5771b6d125 upstream.
+
+device_link_flag_is_sync_state_only() correctly returns true on the flags
+of an existing device link that only implements sync_state() functionality.
+However, it incorrectly and confusingly returns false if it's called with
+DL_FLAG_SYNC_STATE_ONLY.
+
+This bug doesn't manifest in any of the existing calls to this function,
+but fix this confusing behavior to avoid future bugs.
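+
+Walking through the check shows the fix (illustrative evaluation with
+flags == DL_FLAG_SYNC_STATE_ONLY):
+
+  old: (flags & ~(DL_FLAG_INFERRED | DL_FLAG_CYCLE))
+           == (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED)
+       evaluates DL_FLAG_SYNC_STATE_ONLY ==
+       (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED), which is false
+
+  new: (flags & ~(DL_FLAG_INFERRED | DL_FLAG_CYCLE | DL_FLAG_MANAGED))
+           == DL_FLAG_SYNC_STATE_ONLY
+       evaluates to true, as expected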
+
+Fixes: 67cad5c67019 ("driver core: fw_devlink: Add DL_FLAG_CYCLE support to device links")
+Signed-off-by: Saravana Kannan <saravanak@google.com>
+Tested-by: Xu Yang <xu.yang_2@nxp.com>
+Link: https://lore.kernel.org/r/20240202095636.868578-2-saravanak@google.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/base/core.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/drivers/base/core.c
++++ b/drivers/base/core.c
+@@ -284,10 +284,12 @@ static bool device_is_ancestor(struct de
+ return false;
+ }
+
++#define DL_MARKER_FLAGS (DL_FLAG_INFERRED | \
++ DL_FLAG_CYCLE | \
++ DL_FLAG_MANAGED)
+ static inline bool device_link_flag_is_sync_state_only(u32 flags)
+ {
+- return (flags & ~(DL_FLAG_INFERRED | DL_FLAG_CYCLE)) ==
+- (DL_FLAG_SYNC_STATE_ONLY | DL_FLAG_MANAGED);
++ return (flags & ~DL_MARKER_FLAGS) == DL_FLAG_SYNC_STATE_ONLY;
+ }
+
+ /**
update-workarounds-for-gcc-asm-goto-issue.patch
mm-huge_memory-don-t-force-huge-page-alignment-on-32-bit.patch
mm-mmap-map-map_stack-to-vm_nohugepage.patch
+btrfs-add-and-use-helper-to-check-if-block-group-is-used.patch
+btrfs-do-not-delete-unused-block-group-if-it-may-be-used-soon.patch
+btrfs-add-new-unused-block-groups-to-the-list-of-unused-block-groups.patch
+btrfs-don-t-refill-whole-delayed-refs-block-reserve-when-starting-transaction.patch
+btrfs-forbid-creating-subvol-qgroups.patch
+btrfs-do-not-assert-if-the-newly-created-subvolume-already-got-read.patch
+btrfs-forbid-deleting-live-subvol-qgroup.patch
+btrfs-send-return-eopnotsupp-on-unknown-flags.patch
+btrfs-don-t-reserve-space-for-checksums-when-writing-to-nocow-files.patch
+btrfs-reject-encoded-write-if-inode-has-nodatasum-flag-set.patch
+btrfs-don-t-drop-extent_map-for-free-space-inode-on-write-error.patch
+driver-core-fix-device_link_flag_is_sync_state_only.patch