]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
btrfs: zoned: fixup last alloc pointer after extent removal for RAID0/10
author: Naohiro Aota <naohiro.aota@wdc.com>
Fri, 23 Jan 2026 12:41:36 +0000 (21:41 +0900)
committer: David Sterba <dsterba@suse.com>
Tue, 3 Feb 2026 06:56:23 +0000 (07:56 +0100)
When a block group is composed of a sequential write zone and a
conventional zone, we recover the (pseudo) write pointer of the
conventional zone using the end of the last allocated position.

However, if the last extent in a block group is removed, the end of the
last remaining extent will be smaller than the position indicated by the
other, real write pointers. That mismatch of the write pointers then
causes an error.

We can fix up this case by moving the alloc_offset to the corresponding
write pointer position.

Fixes: 568220fa9657 ("btrfs: zoned: support RAID0/1/10 on top of raid stripe tree")
CC: stable@vger.kernel.org # 6.12+
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
fs/btrfs/zoned.c

index a10e1076c881608ffd69b1da28fcadf5262910ac..7fa60a44d71666796ce1ab50155cfc817ae43d68 100644 (file)
@@ -1561,7 +1561,9 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
 {
        struct btrfs_fs_info *fs_info = bg->fs_info;
        u64 stripe_nr = 0, stripe_offset = 0;
+       u64 prev_offset = 0;
        u32 stripe_index = 0;
+       bool has_partial = false, has_conventional = false;
 
        if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1569,6 +1571,35 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
                return -EINVAL;
        }
 
+       /*
+        * When the last extent is removed, last_alloc can be smaller than the other write
+        * pointer. In that case, last_alloc should be moved to the corresponding write
+        * pointer position.
+        */
+       for (int i = 0; i < map->num_stripes; i++) {
+               u64 alloc;
+
+               if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+                   zone_info[i].alloc_offset == WP_CONVENTIONAL)
+                       continue;
+
+               stripe_nr = zone_info[i].alloc_offset >> BTRFS_STRIPE_LEN_SHIFT;
+               stripe_offset = zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK;
+               if (stripe_offset == 0 && stripe_nr > 0) {
+                       stripe_nr--;
+                       stripe_offset = BTRFS_STRIPE_LEN;
+               }
+               alloc = ((stripe_nr * map->num_stripes + i) << BTRFS_STRIPE_LEN_SHIFT) +
+                       stripe_offset;
+               last_alloc = max(last_alloc, alloc);
+
+               /* Partially written stripe found. It should be last. */
+               if (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK)
+                       break;
+       }
+       stripe_nr = 0;
+       stripe_offset = 0;
+
        if (last_alloc) {
                u32 factor = map->num_stripes;
 
@@ -1582,7 +1613,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
                        continue;
 
                if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
-
+                       has_conventional = true;
                        zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
 
                        if (stripe_index > i)
@@ -1591,6 +1622,28 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
                                zone_info[i].alloc_offset += stripe_offset;
                }
 
+               /* Verification */
+               if (i != 0) {
+                       if (unlikely(prev_offset < zone_info[i].alloc_offset)) {
+                               btrfs_err(fs_info,
+                               "zoned: stripe position disorder found in block group %llu",
+                                         bg->start);
+                               return -EIO;
+                       }
+
+                       if (unlikely(has_partial &&
+                                    (zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK))) {
+                               btrfs_err(fs_info,
+                               "zoned: multiple partial written stripe found in block group %llu",
+                                         bg->start);
+                               return -EIO;
+                       }
+               }
+               prev_offset = zone_info[i].alloc_offset;
+
+               if ((zone_info[i].alloc_offset & BTRFS_STRIPE_LEN_MASK) != 0)
+                       has_partial = true;
+
                if (test_bit(0, active) != test_bit(i, active)) {
                        if (unlikely(!btrfs_zone_activate(bg)))
                                return -EIO;
@@ -1602,6 +1655,19 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
                bg->alloc_offset += zone_info[i].alloc_offset;
        }
 
+       /* Check if all devices stay in the same stripe row. */
+       if (unlikely(zone_info[0].alloc_offset -
+                    zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) {
+               btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu", bg->start);
+               return -EIO;
+       }
+
+       if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) {
+               btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu",
+                         bg->alloc_offset, last_alloc);
+               return -EIO;
+       }
+
        return 0;
 }
 
@@ -1612,8 +1678,11 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
                                         u64 last_alloc)
 {
        struct btrfs_fs_info *fs_info = bg->fs_info;
+       u64 AUTO_KFREE(raid0_allocs);
        u64 stripe_nr = 0, stripe_offset = 0;
        u32 stripe_index = 0;
+       bool has_partial = false, has_conventional = false;
+       u64 prev_offset = 0;
 
        if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
                btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
@@ -1621,6 +1690,60 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
                return -EINVAL;
        }
 
+       raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs),
+                              GFP_NOFS);
+       if (!raid0_allocs)
+               return -ENOMEM;
+
+       /*
+        * When the last extent is removed, last_alloc can be smaller than the other write
+        * pointer. In that case, last_alloc should be moved to the corresponding write
+        * pointer position.
+        */
+       for (int i = 0; i < map->num_stripes; i += map->sub_stripes) {
+               u64 alloc = zone_info[i].alloc_offset;
+
+               for (int j = 1; j < map->sub_stripes; j++) {
+                       int idx = i + j;
+
+                       if (zone_info[idx].alloc_offset == WP_MISSING_DEV ||
+                           zone_info[idx].alloc_offset == WP_CONVENTIONAL)
+                               continue;
+                       if (alloc == WP_MISSING_DEV || alloc == WP_CONVENTIONAL) {
+                               alloc = zone_info[idx].alloc_offset;
+                       } else if (unlikely(zone_info[idx].alloc_offset != alloc)) {
+                               btrfs_err(fs_info,
+                               "zoned: write pointer mismatch found in block group %llu",
+                                         bg->start);
+                               return -EIO;
+                       }
+               }
+
+               raid0_allocs[i / map->sub_stripes] = alloc;
+               if (alloc == WP_CONVENTIONAL)
+                       continue;
+               if (unlikely(alloc == WP_MISSING_DEV)) {
+                       btrfs_err(fs_info,
+                       "zoned: cannot recover write pointer of block group %llu due to missing device",
+                                 bg->start);
+                       return -EIO;
+               }
+
+               stripe_nr = alloc >> BTRFS_STRIPE_LEN_SHIFT;
+               stripe_offset = alloc & BTRFS_STRIPE_LEN_MASK;
+               if (stripe_offset == 0 && stripe_nr > 0) {
+                       stripe_nr--;
+                       stripe_offset = BTRFS_STRIPE_LEN;
+               }
+
+               alloc = ((stripe_nr * (map->num_stripes / map->sub_stripes) +
+                         (i / map->sub_stripes)) <<
+                        BTRFS_STRIPE_LEN_SHIFT) + stripe_offset;
+               last_alloc = max(last_alloc, alloc);
+       }
+       stripe_nr = 0;
+       stripe_offset = 0;
+
        if (last_alloc) {
                u32 factor = map->num_stripes / map->sub_stripes;
 
@@ -1630,24 +1753,51 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
        }
 
        for (int i = 0; i < map->num_stripes; i++) {
-               if (zone_info[i].alloc_offset == WP_MISSING_DEV)
-                       continue;
+               int idx = i / map->sub_stripes;
 
-               if (test_bit(0, active) != test_bit(i, active)) {
-                       if (unlikely(!btrfs_zone_activate(bg)))
-                               return -EIO;
-               } else {
-                       if (test_bit(0, active))
-                               set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
+               if (raid0_allocs[idx] == WP_CONVENTIONAL) {
+                       has_conventional = true;
+                       raid0_allocs[idx] = btrfs_stripe_nr_to_offset(stripe_nr);
+
+                       if (stripe_index > idx)
+                               raid0_allocs[idx] += BTRFS_STRIPE_LEN;
+                       else if (stripe_index == idx)
+                               raid0_allocs[idx] += stripe_offset;
                }
 
-               if (zone_info[i].alloc_offset == WP_CONVENTIONAL) {
-                       zone_info[i].alloc_offset = btrfs_stripe_nr_to_offset(stripe_nr);
+               if ((i % map->sub_stripes) == 0) {
+                       /* Verification */
+                       if (i != 0) {
+                               if (unlikely(prev_offset < raid0_allocs[idx])) {
+                                       btrfs_err(fs_info,
+                                       "zoned: stripe position disorder found in block group %llu",
+                                                 bg->start);
+                                       return -EIO;
+                               }
 
-                       if (stripe_index > (i / map->sub_stripes))
-                               zone_info[i].alloc_offset += BTRFS_STRIPE_LEN;
-                       else if (stripe_index == (i / map->sub_stripes))
-                               zone_info[i].alloc_offset += stripe_offset;
+                               if (unlikely(has_partial &&
+                                            (raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK))) {
+                                       btrfs_err(fs_info,
+                                       "zoned: multiple partial written stripe found in block group %llu",
+                                                 bg->start);
+                                       return -EIO;
+                               }
+                       }
+                       prev_offset = raid0_allocs[idx];
+
+                       if ((raid0_allocs[idx] & BTRFS_STRIPE_LEN_MASK) != 0)
+                               has_partial = true;
+               }
+
+               if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
+                   zone_info[i].alloc_offset == WP_CONVENTIONAL)
+                       zone_info[i].alloc_offset = raid0_allocs[idx];
+
+               if (test_bit(0, active) != test_bit(i, active)) {
+                       if (unlikely(!btrfs_zone_activate(bg)))
+                               return -EIO;
+               } else if (test_bit(0, active)) {
+                       set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
                }
 
                if ((i % map->sub_stripes) == 0) {
@@ -1656,6 +1806,20 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
                }
        }
 
+       /* Check if all devices stay in the same stripe row. */
+       if (unlikely(zone_info[0].alloc_offset -
+                    zone_info[map->num_stripes - 1].alloc_offset > BTRFS_STRIPE_LEN)) {
+               btrfs_err(fs_info, "zoned: stripe gap too large in block group %llu",
+                         bg->start);
+               return -EIO;
+       }
+
+       if (unlikely(has_conventional && bg->alloc_offset < last_alloc)) {
+               btrfs_err(fs_info, "zoned: allocated extent stays beyond write pointers %llu %llu",
+                         bg->alloc_offset, last_alloc);
+               return -EIO;
+       }
+
        return 0;
 }