INIT_LIST_HEAD(&fs_devs->alloc_list);
INIT_LIST_HEAD(&fs_devs->fs_list);
INIT_LIST_HEAD(&fs_devs->seed_list);
+ spin_lock_init(&fs_devs->per_profile_lock);
if (fsid) {
memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
return 0;
}
+/*
+ * Return 0 if we allocated a virtual(*) chunk, and store its size in
+ * @allocated.
+ * Return -ENOSPC if we have no more space to allocate a virtual chunk.
+ *
+ * *: A virtual chunk is a chunk that only exists during the per-profile
+ * available space estimation.
+ * Such chunks never take any on-disk space; they only emulate the chunk
+ * allocator behavior to get an accurate estimation of the available space.
+ *
+ * Another difference is that a virtual chunk has no size limit and does not
+ * care about holes in the device tree, allowing us to exhaust the device
+ * space much faster.
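+ *
+ * For example (illustrative numbers, ignoring the reserved 1MiB at the start
+ * of each device): with RAID1 on two devices that have 100GiB and 50GiB of
+ * unallocated space, the first virtual chunk takes a 50GiB stripe on each
+ * device (bounded by the smaller one) and reports 50GiB of usable space.
+ * The second attempt fails with -ENOSPC as only one device has space left,
+ * so the RAID1 estimation ends up at 50GiB.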
+ */
+static int alloc_virtual_chunk(struct btrfs_fs_info *fs_info,
+ struct btrfs_device_info *devices_info,
+ enum btrfs_raid_types type,
+ u64 *allocated)
+{
+ const struct btrfs_raid_attr *raid_attr = &btrfs_raid_array[type];
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 stripe_size;
+ int ndevs = 0;
+
+ lockdep_assert_held(&fs_info->chunk_mutex);
+
+ /* Go through devices to collect their unallocated space. */
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
+ u64 avail;
+
+ if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
+ &device->dev_state) ||
+ test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
+ continue;
+
+ if (device->total_bytes > device->bytes_used +
+ device->per_profile_allocated)
+ avail = device->total_bytes - device->bytes_used -
+ device->per_profile_allocated;
+ else
+ avail = 0;
+
+ avail = round_down(avail, fs_info->sectorsize);
+
+ /* And exclude the [0, 1M) reserved space. */
+ if (avail > BTRFS_DEVICE_RANGE_RESERVED)
+ avail -= BTRFS_DEVICE_RANGE_RESERVED;
+ else
+ avail = 0;
+
+ /*
+ * Not enough space to support even a single stripe; this device
+ * cannot be utilized for chunk allocation.
+ */
+ if (avail < BTRFS_STRIPE_LEN)
+ continue;
+
+ /*
+ * Unlike chunk allocator, we don't care about stripe or hole
+ * size, so here we use @avail directly.
+ */
+ devices_info[ndevs].dev_offset = 0;
+ devices_info[ndevs].total_avail = avail;
+ devices_info[ndevs].max_avail = avail;
+ devices_info[ndevs].dev = device;
+ ++ndevs;
+ }
+ sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
+ btrfs_cmp_device_info, NULL);
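+
+ /*
+ * Only a multiple of devs_increment devices can be used (e.g. RAID10
+ * allocates devices in pairs, so its devs_increment is 2).
+ */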
+ ndevs = rounddown(ndevs, raid_attr->devs_increment);
+ if (ndevs < raid_attr->devs_min)
+ return -ENOSPC;
+ if (raid_attr->devs_max)
+ ndevs = min(ndevs, (int)raid_attr->devs_max);
+ else
+ ndevs = min(ndevs, (int)BTRFS_MAX_DEVS(fs_info));
+
+ /*
+ * Stripe size will be determined by the device with the least
+ * unallocated space.
+ */
+ stripe_size = devices_info[ndevs - 1].total_avail;
+
+ for (int i = 0; i < ndevs; i++)
+ devices_info[i].dev->per_profile_allocated += stripe_size;
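+
+ /*
+ * The usable space is the per-device stripe size times the number of
+ * data stripes, divided by the number of copies (e.g. for RAID1 with
+ * nparity == 0 and ncopies == 2, only half of the raw space counts).
+ */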
+ *allocated = div_u64(stripe_size * (ndevs - raid_attr->nparity),
+ raid_attr->ncopies);
+ return 0;
+}
+
+static int calc_one_profile_avail(struct btrfs_fs_info *fs_info,
+ enum btrfs_raid_types type,
+ u64 *result_ret)
+{
+ struct btrfs_device_info *devices_info = NULL;
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ struct btrfs_device *device;
+ u64 allocated;
+ u64 result = 0;
+ int ret = 0;
+
+ lockdep_assert_held(&fs_info->chunk_mutex);
+ ASSERT(type >= 0 && type < BTRFS_NR_RAID_TYPES);
+
+ /* Not enough devices for this profile, report 0 available space. */
+ if (fs_devices->rw_devices < btrfs_raid_array[type].devs_min) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
+ GFP_NOFS);
+ if (!devices_info) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ /* Reset the virtual chunk allocation on each device. */
+ list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list)
+ device->per_profile_allocated = 0;
+
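+ /* Keep allocating virtual chunks until we run out of space (-ENOSPC). */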
+ while (!alloc_virtual_chunk(fs_info, devices_info, type, &allocated))
+ result += allocated;
+
+out:
+ kfree(devices_info);
+ if (ret < 0 && ret != -ENOSPC)
+ return ret;
+ *result_ret = result;
+ return 0;
+}
+
+/* Update the per-profile available space array. */
+void btrfs_update_per_profile_avail(struct btrfs_fs_info *fs_info)
+{
+ u64 results[BTRFS_NR_RAID_TYPES];
+ int ret;
+
+ /*
+ * Zoned mode is more complex, as we cannot simply get the amount of
+ * available space for each device.
+ */
+ if (btrfs_is_zoned(fs_info))
+ goto error;
+
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
+ ret = calc_one_profile_avail(fs_info, i, &results[i]);
+ if (ret < 0)
+ goto error;
+ }
+
+ spin_lock(&fs_info->fs_devices->per_profile_lock);
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ fs_info->fs_devices->per_profile_avail[i] = results[i];
+ spin_unlock(&fs_info->fs_devices->per_profile_lock);
+ return;
+error:
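+ /* Mark every estimation as unavailable, see btrfs_get_per_profile_avail(). */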
+ spin_lock(&fs_info->fs_devices->per_profile_lock);
+ for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+ fs_info->fs_devices->per_profile_avail[i] = U64_MAX;
+ spin_unlock(&fs_info->fs_devices->per_profile_lock);
+}
+
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
#include <uapi/linux/btrfs_tree.h>
#include "messages.h"
#include "extent-io-tree.h"
+#include "fs.h"
struct block_device;
struct bdev_handle;
/* Bandwidth limit for scrub, in bytes */
u64 scrub_speed_max;
+
+ /*
+ * Temporary amount of space allocated on this device during the
+ * per-profile available space calculation.
+ * Only accessed with chunk_mutex held.
+ */
+ u64 per_profile_allocated;
};
/*
/* Device to be used for reading in case of RAID1. */
u64 read_devid;
#endif
+
+ /*
+ * Each value indicates the available space for that profile.
+ * U64_MAX means the estimation is unavailable.
+ *
+ * Protected by per_profile_lock.
+ */
+ u64 per_profile_avail[BTRFS_NR_RAID_TYPES];
+ spinlock_t per_profile_lock;
};
#define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
bool btrfs_verify_dev_items(const struct btrfs_fs_info *fs_info);
+void btrfs_update_per_profile_avail(struct btrfs_fs_info *fs_info);
+
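+/*
+ * Get the cached per-profile available space estimation for @profile.
+ *
+ * Return true and set @avail_ret if the estimation is valid.
+ * Return false if the estimation is unavailable (e.g. zoned filesystems or a
+ * previous calculation failure).
+ */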
+static inline bool btrfs_get_per_profile_avail(struct btrfs_fs_info *fs_info,
+ u64 profile, u64 *avail_ret)
+{
+ enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(profile);
+ struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+ bool uptodate = false;
+
+ spin_lock(&fs_devices->per_profile_lock);
+ if (fs_devices->per_profile_avail[index] != U64_MAX) {
+ uptodate = true;
+ *avail_ret = fs_devices->per_profile_avail[index];
+ }
+ spin_unlock(&fs_devices->per_profile_lock);
+ return uptodate;
+}
+
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical);
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);