From b291ad4458df8311626dfa0a089918f6a542d6bc Mon Sep 17 00:00:00 2001 From: jinbaohong Date: Wed, 28 Jan 2026 07:06:41 +0000 Subject: [PATCH] btrfs: fix transaction commit blocking during trim of unallocated space When trimming unallocated space, btrfs_trim_fs() holds the device_list_mutex for the entire duration while iterating through all devices. On large filesystems with significant unallocated space, this operation can take minutes to hours. This causes a problem because btrfs_run_dev_stats(), which is called during transaction commit, also requires device_list_mutex: btrfs_trim_fs() mutex_lock(&fs_devices->device_list_mutex) list_for_each_entry(device, ...) btrfs_trim_free_extents(device) mutex_unlock(&fs_devices->device_list_mutex) commit_transaction() btrfs_run_dev_stats() mutex_lock(&fs_devices->device_list_mutex) // blocked! ... While trim is running, all transaction commits are blocked waiting for the mutex. Fix this by refactoring btrfs_trim_free_extents() to process devices in bounded chunks (up to 2GB per iteration) and release device_list_mutex between chunks. Signed-off-by: robbieko Signed-off-by: jinbaohong Reviewed-by: Filipe Manana Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 156 +++++++++++++++++++++++++++++++++++------ fs/btrfs/fs.h | 6 ++ 2 files changed, 140 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 87fd94449f11b..03cf9f242c700 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6513,10 +6513,12 @@ void btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u6 * it while performing the free space search since we have already * held back allocations. 
*/ -static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) +static int btrfs_trim_free_extents_throttle(struct btrfs_device *device, + u64 *trimmed, u64 pos, u64 *ret_next_pos) { - u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; + u64 start = pos; + u64 trim_len = 0; *trimmed = 0; @@ -6536,15 +6538,20 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) while (1) { struct btrfs_fs_info *fs_info = device->fs_info; + u64 cur_start; + u64 end; + u64 len; u64 bytes; ret = mutex_lock_interruptible(&fs_info->chunk_mutex); if (ret) break; + cur_start = start; btrfs_find_first_clear_extent_bit(&device->alloc_state, start, &start, &end, CHUNK_TRIMMED | CHUNK_ALLOCATED); + start = max(start, cur_start); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { @@ -6570,6 +6577,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) end = min(end, device->total_bytes - 1); len = end - start + 1; + len = min(len, BTRFS_MAX_TRIM_LENGTH); /* We didn't find any extents */ if (!len) { @@ -6590,6 +6598,12 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) start += len; *trimmed += bytes; + trim_len += len; + if (trim_len >= BTRFS_MAX_TRIM_LENGTH) { + *ret_next_pos = start; + ret = -EAGAIN; + break; + } if (btrfs_trim_interrupted()) { ret = -ERESTARTSYS; @@ -6602,6 +6616,122 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) return ret; } +static int btrfs_trim_free_extents(struct btrfs_fs_info *fs_info, u64 *trimmed, + u64 *dev_failed, int *dev_ret) +{ + struct btrfs_device *dev; + struct btrfs_device *working_dev = NULL; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + u8 uuid[BTRFS_UUID_SIZE]; + u64 start = BTRFS_DEVICE_RANGE_RESERVED; + + *trimmed = 0; + *dev_failed = 0; + *dev_ret = 0; + + /* Find the device with the smallest UUID to start. 
*/ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (!working_dev || + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) + working_dev = dev; + } + if (working_dev) + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!working_dev) + return 0; + + while (1) { + u64 group_trimmed = 0; + u64 next_pos = 0; + int ret = 0; + + mutex_lock(&fs_devices->device_list_mutex); + + /* Find and trim the current device. */ + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + if (dev == working_dev) { + ret = btrfs_trim_free_extents_throttle(working_dev, + &group_trimmed, start, &next_pos); + break; + } + } + + /* Throttle: continue the same device from the new position. */ + if (ret == -EAGAIN && next_pos > start) { + mutex_unlock(&fs_devices->device_list_mutex); + *trimmed += group_trimmed; + start = next_pos; + cond_resched(); + continue; + } + + /* User interrupted. */ + if (ret == -ERESTARTSYS || ret == -EINTR) { + mutex_unlock(&fs_devices->device_list_mutex); + *trimmed += group_trimmed; + return ret; + } + + /* + * Device completed (ret == 0), failed, or EAGAIN with no progress. + * Record error if any, then move to next device. + */ + if (ret == -EAGAIN) { + /* No progress - log and skip device. */ + btrfs_warn(fs_info, + "trim throttle: no progress, offset=%llu device %s, skipping", + start, btrfs_dev_name(working_dev)); + (*dev_failed)++; + if (!*dev_ret) + *dev_ret = ret; + } else if (ret) { + /* Device failed with error. */ + (*dev_failed)++; + if (!*dev_ret) + *dev_ret = ret; + } + + /* + * Find next device: smallest UUID larger than current. + * Devices added during trim with smaller UUID will be skipped. 
+ */ + working_dev = NULL; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { + if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) + continue; + /* Must be larger than the current UUID. */ + if (memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE) <= 0) + continue; + /* Find the smallest. */ + if (!working_dev || + memcmp(dev->uuid, working_dev->uuid, BTRFS_UUID_SIZE) < 0) + working_dev = dev; + } + if (working_dev) + memcpy(uuid, working_dev->uuid, BTRFS_UUID_SIZE); + + mutex_unlock(&fs_devices->device_list_mutex); + + *trimmed += group_trimmed; + start = BTRFS_DEVICE_RANGE_RESERVED; + + /* No more devices. */ + if (!working_dev) + break; + + cond_resched(); + } + + return 0; +} + /* * Trim the whole filesystem by: * 1) trimming the free space in each block group @@ -6613,9 +6743,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) */ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_block_group *cache = NULL; - struct btrfs_device *device; u64 group_trimmed; u64 range_end = U64_MAX; u64 start; @@ -6686,24 +6814,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) if (ret == -ERESTARTSYS || ret == -EINTR) return ret; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) - continue; - - ret = btrfs_trim_free_extents(device, &group_trimmed); - - trimmed += group_trimmed; - if (ret == -ERESTARTSYS || ret == -EINTR) - break; - if (ret) { - dev_failed++; - if (!dev_ret) - dev_ret = ret; - continue; - } - } - mutex_unlock(&fs_devices->device_list_mutex); + ret = btrfs_trim_free_extents(fs_info, &group_trimmed, &dev_failed, &dev_ret); + trimmed += group_trimmed; if (dev_failed) btrfs_warn(fs_info, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index d3762fbe7267a..3de3b517810ed 100644 --- a/fs/btrfs/fs.h 
+++ b/fs/btrfs/fs.h @@ -65,6 +65,12 @@ struct btrfs_space_info; #define BTRFS_MAX_EXTENT_SIZE SZ_128M +/* + * Maximum length to trim in a single iteration to avoid holding device list + * mutex for too long. + */ +#define BTRFS_MAX_TRIM_LENGTH SZ_2G + #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 -- 2.47.3