// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "rcu-string.h"
#include "block-group.h"
#include "dev-replace.h"
#include "space-info.h"
#include "accessors.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES		4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV			((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL			((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES		2

/*
 * Minimum of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)

/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M

#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
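/*
 * Illustrative sizing note (not from the original source): with
 * BTRFS_SUPER_INFO_SIZE == 4096 and SECTOR_SHIFT == 9, SUPER_INFO_SECTORS
 * evaluates to 8, i.e. each superblock copy occupies eight 512-byte sectors
 * of its log zone.
 */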
static void wait_eb_writebacks(struct btrfs_block_group *block_group);
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written);
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}
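/*
 * Illustrative example (assumed numbers): a zone also counts as full when
 * fewer than SUPER_INFO_SECTORS remain. E.g. if wp is only 4 sectors short
 * of start + capacity, 4 < 8 leaves no room for another superblock copy,
 * so the zone is treated as full even though its condition is not
 * BLK_ZONE_COND_FULL.
 */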
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;
	int i;

	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0        1
	 * In use[1]        x          x        1
	 * Full[1]          0          0        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 zone_end = (zones[i].start + zones[i].capacity) << SECTOR_SHIFT;
			u64 bytenr = ALIGN_DOWN(zone_end, BTRFS_SUPER_INFO_SIZE) -
				     BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
						      bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (btrfs_super_generation(super[0]) >
		    btrfs_super_generation(super[1]))
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;

	return 0;
}
/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone = 0;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}
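/*
 * Worked example (illustrative): BTRFS_SB_LOG_FIRST_OFFSET is 512G, so
 * BTRFS_SB_LOG_FIRST_SHIFT == 39; BTRFS_SB_LOG_SECOND_OFFSET is 4T, so
 * BTRFS_SB_LOG_SECOND_SHIFT == 42. On a device with 1GiB zones
 * (shift == 30), mirror 1 starts at zone 1ULL << (39 - 30) == 512 and
 * mirror 2 at zone 1ULL << (42 - 30) == 4096, matching the byte offsets
 * listed above.
 */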
static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}
/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}
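/*
 * Illustrative example (assumed geometry): a 10GiB non-zoned device with a
 * 256MiB emulated zone size reports 40 conventional zones. Each fake zone
 * has capacity == len and wp == start + len with BLK_ZONE_COND_NOT_WP,
 * since a conventional zone has no write pointer to track.
 */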
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	/* Check cache */
	if (zinfo->zone_cache) {
		unsigned int i;
		u32 zno;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		/*
		 * We cannot report zones beyond the zone end. So, it is OK to
		 * cap *nr_zones at the zone end.
		 */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];
			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	/* Populate cache */
	if (zinfo->zone_cache) {
		u32 zno = pos >> zinfo->zone_size_shift;

		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);
	}

	return 0;
}
/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device, true);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	unsigned int max_active_zones;
	unsigned int nactive;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * be set yet.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	ASSERT(is_power_of_two_u64(zone_sectors));
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu smaller than supported minimum %u",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = bdev_max_active_zones(bdev);
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
				 rcu_str_deref(device->name), max_active_zones,
				 BTRFS_MIN_ACTIVE_ZONES);
		ret = -EINVAL;
		goto out;
	}
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	 * use the cache.
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vcalloc(zone_info->nr_zones,
						sizeof(struct blk_zone));
		if (!zone_info->zone_cache) {
			btrfs_err_in_rcu(device->fs_info,
				"zoned: failed to allocate zone cache for %s",
				rcu_str_deref(device->name));
			ret = -ENOMEM;
			goto out;
		}
	}

	/* Get zones type */
	nactive = 0;
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
				break;
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
				nactive++;
				break;
			}
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	if (max_active_zones) {
		if (nactive > max_active_zones) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: %u active zones on %s exceeds max_active_zones %u",
					 nactive, rcu_str_deref(device->name),
					 max_active_zones);
			ret = -EIO;
			goto out;
		}
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
		set_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags);
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kvfree(zones);

	if (bdev_is_zoned(bdev)) {
		model = "host-managed zoned";
		emulated = "";
	} else {
		model = "regular";
		emulated = "emulated ";
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kvfree(zones);
	btrfs_destroy_dev_zone_info(device);
	return ret;
}
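/*
 * Illustrative example (assumed geometry): a 1TiB zoned device with 256MiB
 * zones yields nr_zones == 4096; a device whose size is not zone aligned
 * gets the trailing runt zone counted via the IS_ALIGNED check above. A
 * device advertising fewer than BTRFS_MIN_ACTIVE_ZONES
 * (BTRFS_SUPER_MIRROR_MAX + 5 == 8) active zones is rejected.
 */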
void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);
	kfree(zone_info);
	device->zone_info = NULL;
}
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev)
{
	struct btrfs_zoned_device_info *zone_info;

	zone_info = kmemdup(orig_dev->zone_info, sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return NULL;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones)
		goto out;
	bitmap_copy(zone_info->seq_zones, orig_dev->zone_info->seq_zones,
		    zone_info->nr_zones);

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones)
		goto out;
	bitmap_copy(zone_info->empty_zones, orig_dev->zone_info->empty_zones,
		    zone_info->nr_zones);

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones)
		goto out;
	bitmap_copy(zone_info->active_zones, orig_dev->zone_info->active_zones,
		    zone_info->nr_zones);
	zone_info->zone_cache = NULL;

	return zone_info;

out:
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->active_zones);
	kfree(zone_info);
	return NULL;
}
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}
static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *device;

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		if (device->bdev && bdev_is_zoned(device->bdev)) {
			btrfs_err(fs_info,
				  "zoned: mode not enabled but zoned device found: %pg",
				  device->bdev);
			return -EINVAL;
		}
	}

	return 0;
}
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct queue_limits *lim = &fs_info->limits;
	struct btrfs_device *device;
	u64 zone_size = 0;
	int ret;

	/*
	 * Host-Managed devices can't be used without the ZONED flag. With the
	 * ZONED flag, all devices can be used, using zone emulation if required.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return btrfs_check_for_zoned_device(fs_info);

	blk_set_stacking_limits(lim);

	list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
		struct btrfs_zoned_device_info *zone_info = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zone_size) {
			zone_size = zone_info->zone_size;
		} else if (zone_info->zone_size != zone_size) {
			btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
				  zone_info->zone_size, zone_size);
			return -EINVAL;
		}

		/*
		 * With zone emulation, we can have a non-zoned device in zoned
		 * mode. In this case, we don't have a valid max zone
		 * append size.
		 */
		if (bdev_is_zoned(device->bdev)) {
			blk_stack_limits(lim,
					 &bdev_get_queue(device->bdev)->limits,
					 0);
		}
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		return -EINVAL;
	}

	fs_info->zone_size = zone_size;
	/*
	 * Also limit max_zone_append_size by max_segments * PAGE_SIZE.
	 * Technically, we can have multiple pages per segment. But, since
	 * we add the pages one by one to a bio, and cannot increase the
	 * metadata reservation even if it increases the number of extents, it
	 * is safe to stick with the limit.
	 */
	fs_info->max_zone_append_size = ALIGN_DOWN(
		min3((u64)lim->max_zone_append_sectors << SECTOR_SHIFT,
		     (u64)lim->max_sectors << SECTOR_SHIFT,
		     (u64)lim->max_segments << PAGE_SHIFT),
		fs_info->sectorsize);
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
	if (fs_info->max_zone_append_size < fs_info->max_extent_size)
		fs_info->max_extent_size = fs_info->max_zone_append_size;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
	if (ret)
		return ret;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
	return 0;
}
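/*
 * Worked example for the clamping above (assumed limits, for illustration
 * only): with max_zone_append_sectors == 1024 (512KiB), max_sectors == 2560
 * (1280KiB) and max_segments == 64 (256KiB with 4KiB pages), min3() picks
 * 256KiB, which ALIGN_DOWN() keeps as-is for a 4KiB sectorsize.
 */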
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
		btrfs_info(info,
			   "zoned: async discard ignored and disabled for zoned mode");
		btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
	}

	return 0;
}
static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(sb_zone_is_full(reset));

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/*
		 * For READ, we want the previous one. Move the write pointer to
		 * the end of a zone, if it is at the head of a zone.
		 */
		u64 zone_end = 0;

		if (wp == zones[0].start << SECTOR_SHIFT)
			zone_end = zones[1].start + zones[1].capacity;
		else if (wp == zones[1].start << SECTOR_SHIFT)
			zone_end = zones[0].start + zones[0].capacity;

		if (zone_end)
			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
					BTRFS_SUPER_INFO_SIZE);

		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;

	return 0;
}

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as for a regular filesystem. Doing so, the
	 * super block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}
int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;
	int i;

	if (!is_sb_log_zone(zinfo, mirror))
		return 0;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		/* Advance the next zone */
		if (zone->cond == BLK_ZONE_COND_FULL) {
			zone++;
			continue;
		}

		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += SUPER_INFO_SECTORS;

		if (sb_zone_is_full(zone)) {
			/*
			 * No room left to write new superblock. Since
			 * superblock is written with REQ_SYNC, it is safe to
			 * finish the zone now.
			 *
			 * If the write pointer is exactly at the capacity,
			 * explicit ZONE_FINISH is not necessary.
			 */
			if (zone->wp != zone->start + zone->capacity) {
				int ret;

				ret = blkdev_zone_mgmt(device->bdev,
						REQ_OP_ZONE_FINISH, zone->start,
						zone->len, GFP_NOFS);
				if (ret)
					return ret;
			}

			zone->wp = zone->start + zone->len;
			zone->cond = BLK_ZONE_COND_FULL;
		}

		return 0;
	}

	/* All the zones are FULL. Should not reach here. */
	ASSERT(0);
	return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				zone_start_sector(sb_zone, bdev),
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
/*
 * Find allocatable zones within a given region.
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole to allocate the region
 * @num_bytes:	size of wanted region
 * @hole_end:	the end of the hole
 * @return:	position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    !bitmap_test_range_all_set(zinfo->empty_zones, begin, nzones)) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}
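/*
 * Illustrative example (assumed geometry): with 1GiB zones, a candidate
 * region overlapping superblock mirror 1 (zones 512 and 513, see
 * sb_zone_number() above) is pushed to the start of zone 514 by the first
 * check, while a region overlapping a regular superblock position is
 * pushed past it and re-aligned to the zone size by the second check.
 */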
static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return true;

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
			return false;
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);
		}
	}

	return true;
}
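/*
 * Illustrative accounting example (not from the original source): if a
 * device reports max_active_zones == 14 and 4 zones are already active at
 * mount, active_zones_left starts at 10. Each successful
 * btrfs_dev_set_active_zone() decrements it and each
 * btrfs_dev_clear_active_zone() below increments it; losing the
 * test_and_set_bit() race simply returns the reservation.
 */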
static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return;

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long nbits = size >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (begin + nbits > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (bitmap_test_range_all_zero(zinfo->seq_zones, begin, nbits))
		return 0;

	/* All the zones are sequential and empty */
	if (bitmap_test_range_all_set(zinfo->seq_zones, begin, nbits) &&
	    bitmap_test_range_all_set(zinfo->empty_zones, begin, nbits))
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
				  rcu_str_deref(device->name), device->devid,
				  pos >> shift);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}
/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the
 * end of the highest addressed extent in the block group as an allocation
 * offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	/*
	 * Avoid tree lookups for a new block group, there's no use for it.
	 * It must always be 0.
	 *
	 * Also, we have a lock chain of extent buffer lock -> chunk mutex.
	 * For a new block group, this function is called from
	 * btrfs_make_block_group() which is already taking the chunk mutex.
	 * Thus, we cannot call calculate_alloc_pointer() which takes extent
	 * buffer locks to avoid deadlock.
	 */
	if (new) {
		*offset_ret = 0;
		return 0;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	root = btrfs_extent_root(fs_info, key.objectid);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
struct zone_info {
	u64 physical;
	u64 capacity;
	u64 alloc_offset;
};

static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
				struct zone_info *info, unsigned long *active,
				struct btrfs_chunk_map *map)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	struct btrfs_device *device = map->stripes[zone_idx].dev;
	int dev_replace_is_ongoing = 0;
	unsigned int nofs_flag;
	struct blk_zone zone;
	int ret;

	info->physical = map->stripes[zone_idx].physical;

	if (!device->bdev) {
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	/* Consider a zone as active if we can allow any number of active zones. */
	if (!device->zone_info->max_active_zones)
		__set_bit(zone_idx, active);

	if (!btrfs_dev_is_sequential(device, info->physical)) {
		info->alloc_offset = WP_CONVENTIONAL;
		return 0;
	}

	/* This zone will be used for allocation, so mark this zone non-empty. */
	btrfs_dev_clear_zone_empty(device, info->physical);

	down_read(&dev_replace->rwsem);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
		btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical);
	up_read(&dev_replace->rwsem);

	/*
	 * The group is mapped to a sequential zone. Get the zone write pointer
	 * to determine the allocation offset within the zone.
	 */
	WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size));
	nofs_flag = memalloc_nofs_save();
	ret = btrfs_get_dev_zone(device, info->physical, &zone);
	memalloc_nofs_restore(nofs_flag);
	if (ret) {
		if (ret != -EIO && ret != -EOPNOTSUPP)
			return ret;
		info->alloc_offset = WP_MISSING_DEV;
		return 0;
	}

	if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
		btrfs_err_in_rcu(fs_info,
		"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
			zone.start << SECTOR_SHIFT, rcu_str_deref(device->name),
			device->devid);
		return -EIO;
	}

	info->capacity = (zone.capacity << SECTOR_SHIFT);

	switch (zone.cond) {
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
		btrfs_err_in_rcu(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
			(info->physical >> device->zone_info->zone_size_shift),
			rcu_str_deref(device->name), device->devid);
		info->alloc_offset = WP_MISSING_DEV;
		break;
	case BLK_ZONE_COND_EMPTY:
		info->alloc_offset = 0;
		break;
	case BLK_ZONE_COND_FULL:
		info->alloc_offset = info->capacity;
		break;
	default:
		/* Partially used zone. */
		info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT);
		__set_bit(zone_idx, active);
		break;
	}

	return 0;
}
static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
					 struct zone_info *info,
					 unsigned long *active)
{
	if (info->alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  info->physical);
		return -EIO;
	}

	bg->alloc_offset = info->alloc_offset;
	bg->zone_capacity = info->capacity;
	if (test_bit(0, active))
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	return 0;
}
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
				      struct btrfs_chunk_map *map,
				      struct zone_info *zone_info,
				      unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree");
		return -EINVAL;
	}

	if (zone_info[0].alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[0].physical);
		return -EIO;
	}
	if (zone_info[1].alloc_offset == WP_MISSING_DEV) {
		btrfs_err(bg->fs_info,
			  "zoned: cannot recover write pointer for zone %llu",
			  zone_info[1].physical);
		return -EIO;
	}
	if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) {
		btrfs_err(bg->fs_info,
			  "zoned: write pointer offset mismatch of zones in DUP profile");
		return -EIO;
	}

	if (test_bit(0, active) != test_bit(1, active)) {
		if (!btrfs_zone_activate(bg))
			return -EIO;
	} else if (test_bit(0, active)) {
		set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags);
	}

	bg->alloc_offset = zone_info[0].alloc_offset;
	bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity);
	return 0;
}
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;
	int i;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
			continue;

		if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) &&
		    !btrfs_test_opt(fs_info, DEGRADED)) {
			btrfs_err(fs_info,
			"zoned: write pointer offset mismatch of zones in %s profile",
				  btrfs_bg_type_to_raid_name(map->type));
			return -EIO;
		}
		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_test_opt(fs_info, DEGRADED) &&
			    !btrfs_zone_activate(bg)) {
				return -EIO;
			}
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					&bg->runtime_flags);
		}
		/* In case a device is missing we have a cap of 0, so don't use it. */
		bg->zone_capacity = min_not_zero(zone_info[0].capacity,
						 zone_info[1].capacity);
	}

	if (zone_info[0].alloc_offset != WP_MISSING_DEV)
		bg->alloc_offset = zone_info[0].alloc_offset;
	else
		bg->alloc_offset = zone_info[i - 1].alloc_offset;

	return 0;
}
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
					struct btrfs_chunk_map *map,
					struct zone_info *zone_info,
					unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
			continue;

		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_zone_activate(bg))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					&bg->runtime_flags);
		}
		bg->zone_capacity += zone_info[i].capacity;
		bg->alloc_offset += zone_info[i].alloc_offset;
	}

	return 0;
}
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
					 struct btrfs_chunk_map *map,
					 struct zone_info *zone_info,
					 unsigned long *active)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		return -EINVAL;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		if (zone_info[i].alloc_offset == WP_MISSING_DEV ||
		    zone_info[i].alloc_offset == WP_CONVENTIONAL)
			continue;

		if (test_bit(0, active) != test_bit(i, active)) {
			if (!btrfs_zone_activate(bg))
				return -EIO;
		} else {
			if (test_bit(0, active))
				set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					&bg->runtime_flags);
		}

		if ((i % map->sub_stripes) == 0) {
			bg->zone_capacity += zone_info[i].capacity;
			bg->alloc_offset += zone_info[i].alloc_offset;
		}
	}

	return 0;
}
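/*
 * Illustrative example (assumed layout): for RAID10 with num_stripes == 4
 * and sub_stripes == 2, only stripes 0 and 2 contribute to zone_capacity
 * and alloc_offset above, since stripes 1 and 3 mirror them.
 */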
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_chunk_map *map;
	u64 logical = cache->start;
	u64 length = cache->length;
	struct zone_info *zone_info = NULL;
	int ret;
	int i;
	unsigned long *active = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	map = btrfs_find_chunk_map(fs_info, logical, length);
	if (!map)
		return -EINVAL;

	cache->physical_map = map;

	zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS);
	if (!zone_info) {
		ret = -ENOMEM;
		goto out;
	}

	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
	if (!active) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map);
		if (ret)
			goto out;

		if (zone_info[i].alloc_offset == WP_CONVENTIONAL)
			num_conventional++;
		else
			num_sequential++;
	}

	if (num_sequential > 0)
		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);

	if (num_conventional > 0) {
		/* Zone capacity is always zone size in emulation */
		cache->zone_capacity = cache->length;
		ret = calculate_alloc_pointer(cache, &last_alloc, new);
		if (ret) {
			btrfs_err(fs_info,
			"zoned: failed to determine allocation offset of bg %llu",
				  cache->start);
			goto out;
		} else if (map->num_stripes == num_conventional) {
			cache->alloc_offset = last_alloc;
			set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
				&cache->runtime_flags);
			goto out;
		}
	}

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case 0: /* single */
		ret = btrfs_load_block_group_single(cache, &zone_info[0], active);
		break;
	case BTRFS_BLOCK_GROUP_DUP:
		ret = btrfs_load_block_group_dup(cache, map, zone_info, active);
		break;
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID1C3:
	case BTRFS_BLOCK_GROUP_RAID1C4:
		ret = btrfs_load_block_group_raid1(cache, map, zone_info, active);
		break;
	case BTRFS_BLOCK_GROUP_RAID0:
		ret = btrfs_load_block_group_raid0(cache, map, zone_info, active);
		break;
	case BTRFS_BLOCK_GROUP_RAID10:
		ret = btrfs_load_block_group_raid10(cache, map, zone_info, active);
		break;
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

	/* Reject non SINGLE data profiles without RST */
	if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
	    (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
	    !fs_info->stripe_root) {
		btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

	if (cache->alloc_offset > cache->zone_capacity) {
		btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
			  cache->alloc_offset, cache->zone_capacity,
			  cache->start);
		ret = -EIO;
	}

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

out:
	if (!ret) {
		cache->meta_write_pointer = cache->alloc_offset + cache->start;
		if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags)) {
			btrfs_get_block_group(cache);
			spin_lock(&fs_info->zone_active_bgs_lock);
			list_add_tail(&cache->active_bg_list,
				      &fs_info->zone_active_bgs);
			spin_unlock(&fs_info->zone_active_bgs_lock);
		}
	} else {
		btrfs_free_chunk_map(cache->physical_map);
		cache->physical_map = NULL;
	}
	bitmap_free(active);
	kfree(zone_info);

	return ret;
}
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = (cache->alloc_offset - cache->used) +
		   (cache->length - cache->zone_capacity);
	free = cache->zone_capacity - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;
}
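/*
 * Worked example (illustrative numbers): for a block group with
 * length == 256MiB, zone_capacity == 236MiB, alloc_offset == 100MiB and
 * used == 80MiB, unusable == (100 - 80) + (256 - 236) == 40MiB and
 * free == 236 - 100 == 136MiB.
 */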
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
	u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
	struct btrfs_inode *inode = bbio->inode;
	struct btrfs_fs_info *fs_info = bbio->fs_info;
	struct btrfs_block_group *cache;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!inode || !is_data_inode(&inode->vfs_inode))
		return false;

	if (btrfs_op(&bbio->bio) != BTRFS_MAP_WRITE)
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside own block-group from which only the
	 * relocation "process" can allocate and make sure only one process at a
	 * time can add pages to an extent that gets relocated, so it's safe to
	 * use regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
	btrfs_put_block_group(cache);

	return ret;
}
*bbio
)
1739 const u64 physical
= bbio
->bio
.bi_iter
.bi_sector
<< SECTOR_SHIFT
;
1740 struct btrfs_ordered_sum
*sum
= bbio
->sums
;
1742 if (physical
< bbio
->orig_physical
)
1743 sum
->logical
-= bbio
->orig_physical
- physical
;
1745 sum
->logical
+= physical
- bbio
->orig_physical
;
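/*
 * Illustrative example (assumed numbers): if the bio was submitted against
 * orig_physical == 100MiB but the ZONE_APPEND completion reports
 * physical == 104MiB, sum->logical is shifted up by 4MiB so the checksum
 * records match the logical address that the extent will actually get.
 */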
static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered,
					u64 logical)
{
	struct extent_map_tree *em_tree = &BTRFS_I(ordered->inode)->extent_tree;
	struct extent_map *em;

	ordered->disk_bytenr = logical;

	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);
}
static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered,
				      u64 logical, u64 len)
{
	struct btrfs_ordered_extent *new;

	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
	    split_extent_map(BTRFS_I(ordered->inode), ordered->file_offset,
			     ordered->num_bytes, len, logical))
		return false;

	new = btrfs_split_ordered_extent(ordered, len);
	if (IS_ERR(new))
		return false;
	new->disk_bytenr = logical;
	btrfs_finish_one_ordered(new);
	return true;
}
void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_ordered_sum *sum;
	u64 logical, len;

	/*
	 * Write to pre-allocated region is for the data relocation, and so
	 * it should use WRITE operation. No split/rewrite are necessary.
	 */
	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
		return;

	ASSERT(!list_empty(&ordered->list));
	/* The ordered->list can be empty in the above pre-alloc case. */
	sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list);
	logical = sum->logical;
	len = sum->len;

	while (len < ordered->disk_num_bytes) {
		sum = list_next_entry(sum, list);
		if (sum->logical == logical + len) {
			len += sum->len;
			continue;
		}
		if (!btrfs_zoned_split_ordered(ordered, logical, len)) {
			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
			btrfs_err(fs_info, "failed to split ordered extent");
			goto out;
		}
		logical = sum->logical;
		len = sum->len;
	}

	if (ordered->disk_bytenr != logical)
		btrfs_rewrite_logical_zoned(ordered, logical);

out:
	/*
	 * If we end up here for nodatasum I/O, the btrfs_ordered_sum structures
	 * were allocated by btrfs_alloc_dummy_sum only to record the logical
	 * addresses and don't contain actual checksums. We thus must free them
	 * here so that we don't attempt to log the csums later.
	 */
	if ((inode->flags & BTRFS_INODE_NODATASUM) ||
	    test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) {
		while ((sum = list_first_entry_or_null(&ordered->list,
						       typeof(*sum), list))) {
			list_del(&sum->list);
			kfree(sum);
		}
	}
}
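/*
 * Illustrative example (assumed numbers): two btrfs_ordered_sum entries at
 * logical 100MiB (len 1MiB) and 104MiB (len 1MiB) are not contiguous, so
 * the loop above splits the ordered extent after the first 1MiB and
 * continues from logical == 104MiB.
 */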
static bool check_bg_is_active(struct btrfs_eb_write_context *ctx,
			       struct btrfs_block_group **active_bg)
{
	const struct writeback_control *wbc = ctx->wbc;
	struct btrfs_block_group *block_group = ctx->zoned_bg;
	struct btrfs_fs_info *fs_info = block_group->fs_info;

	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
		return true;

	if (fs_info->treelog_bg == block_group->start) {
		if (!btrfs_zone_activate(block_group)) {
			int ret_fin = btrfs_zone_finish_one_bg(fs_info);

			if (ret_fin != 1 || !btrfs_zone_activate(block_group))
				return false;
		}
	} else if (*active_bg != block_group) {
		struct btrfs_block_group *tgt = *active_bg;

		/* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */
		lockdep_assert_held(&fs_info->zoned_meta_io_lock);

		if (tgt) {
			/*
			 * If there is an unsent IO left in the allocated area,
			 * we cannot wait for them as it may cause a deadlock.
			 */
			if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) {
				if (wbc->sync_mode == WB_SYNC_NONE ||
				    (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync))
					return false;
			}

			/* Pivot active metadata/system block group. */
			btrfs_zoned_meta_io_unlock(fs_info);
			wait_eb_writebacks(tgt);
			do_zone_finish(tgt, true);
			btrfs_zoned_meta_io_lock(fs_info);
			if (*active_bg == tgt) {
				btrfs_put_block_group(tgt);
				*active_bg = NULL;
			}
		}
		if (!btrfs_zone_activate(block_group))
			return false;
		if (*active_bg != block_group) {
			ASSERT(*active_bg == NULL);
			*active_bg = block_group;
			btrfs_get_block_group(block_group);
		}
	}

	return true;
}
/*
 * Check if @ctx->eb is aligned to the write pointer.
 *
 * Return:
 *   0:        @ctx->eb is at the write pointer. You can write it.
 *   -EAGAIN:  There is a hole. The caller should handle the case.
 *   -EBUSY:   There is a hole, but the caller can just bail out.
 */
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				   struct btrfs_eb_write_context *ctx)
{
	const struct writeback_control *wbc = ctx->wbc;
	const struct extent_buffer *eb = ctx->eb;
	struct btrfs_block_group *block_group = ctx->zoned_bg;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	if (block_group) {
		if (block_group->start > eb->start ||
		    block_group->start + block_group->length <= eb->start) {
			btrfs_put_block_group(block_group);
			block_group = NULL;
			ctx->zoned_bg = NULL;
		}
	}

	if (!block_group) {
		block_group = btrfs_lookup_block_group(fs_info, eb->start);
		if (!block_group)
			return 0;
		ctx->zoned_bg = block_group;
	}

	if (block_group->meta_write_pointer == eb->start) {
		struct btrfs_block_group **tgt;

		if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
			return 0;

		if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)
			tgt = &fs_info->active_system_bg;
		else
			tgt = &fs_info->active_meta_bg;
		if (check_bg_is_active(ctx, tgt))
			return 0;
	}

	/*
	 * Since we may release fs_info->zoned_meta_io_lock, someone can already
	 * start writing this eb. In that case, we can just bail out.
	 */
	if (block_group->meta_write_pointer > eb->start)
		return -EBUSY;

	/* If for_sync, this hole will be filled with transaction commit. */
	if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
		return -EAGAIN;
	return -EBUSY;
}
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_io_context *bioc = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			      &mapped_length, &bioc, NULL, NULL);
	if (ret || !bioc || mapped_length < PAGE_SIZE) {
		ret = -EIO;
		goto out_put_bioc;
	}

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EINVAL;
		goto out_put_bioc;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bioc->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_device *dev = bioc->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
out_put_bioc:
	btrfs_put_bioc(bioc);
	return ret;
}
/*
 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
 * filling zeros between @physical_pos to a write pointer of the dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}
/*
 * Activate block group and underlying device zones
 *
 * @block_group: the block group to activate
 *
 * Return: true on success, false otherwise
 */
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct btrfs_chunk_map *map;
	struct btrfs_device *device;
	u64 physical;
	const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
	bool ret;
	int i;

	if (!btrfs_is_zoned(block_group->fs_info))
		return true;

	map = block_group->physical_map;

	spin_lock(&fs_info->zone_active_bgs_lock);
	spin_lock(&block_group->lock);
	if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		ret = true;
		goto out_unlock;
	}

	/* No space left */
	if (btrfs_zoned_bg_is_full(block_group)) {
		ret = false;
		goto out_unlock;
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_zoned_device_info *zinfo;
		int reserved = 0;

		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;
		zinfo = device->zone_info;

		if (zinfo->max_active_zones == 0)
			continue;

		if (is_data)
			reserved = zinfo->reserved_active_zones;
		/*
		 * For the data block group, leave active zones for one
		 * metadata block group and one system block group.
		 */
		if (atomic_read(&zinfo->active_zones_left) <= reserved) {
			ret = false;
			goto out_unlock;
		}

		if (!btrfs_dev_set_active_zone(device, physical)) {
			/* Cannot activate the zone */
			ret = false;
			goto out_unlock;
		}
		if (!is_data)
			zinfo->reserved_active_zones--;
	}

	/* Successfully activated all the zones */
	set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	spin_unlock(&block_group->lock);

	/* For the active block group list */
	btrfs_get_block_group(block_group);
	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	return true;

out_unlock:
	spin_unlock(&block_group->lock);
	spin_unlock(&fs_info->zone_active_bgs_lock);
	return ret;
}
static void wait_eb_writebacks(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	const u64 end = block_group->start + block_group->length;
	struct radix_tree_iter iter;
	struct extent_buffer *eb;
	void __rcu **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter,
				 block_group->start >> fs_info->sectorsize_bits) {
		eb = radix_tree_deref_slot(slot);
		if (!eb)
			continue;
		if (radix_tree_deref_retry(eb)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (eb->start < block_group->start)
			continue;
		if (eb->start >= end)
			break;

		slot = radix_tree_iter_resume(slot, &iter);
		rcu_read_unlock();
		wait_on_extent_buffer_writeback(eb);
		rcu_read_lock();
	}
	rcu_read_unlock();
}
2154 static int do_zone_finish(struct btrfs_block_group
*block_group
, bool fully_written
)
2156 struct btrfs_fs_info
*fs_info
= block_group
->fs_info
;
	struct btrfs_chunk_map *map;
	const bool is_metadata = (block_group->flags &
			(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if (is_metadata &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * the sequential IO submissions using the ZONE_APPEND command for data
	 * and block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			return -EAGAIN;
		}
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
					 block_group->length);
		/* Wait for extent buffers to be written. */
		if (is_metadata)
			wait_eb_writebacks(block_group);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
			      &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			     &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
	block_group->alloc_offset = block_group->zone_capacity;
	if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))
		block_group->meta_write_pointer = block_group->start +
						  block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	down_read(&dev_replace->rwsem);
	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;
		struct btrfs_zoned_device_info *zinfo = device->zone_info;

		if (zinfo->max_active_zones == 0)
			continue;

		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       zinfo->zone_size >> SECTOR_SHIFT,
				       GFP_NOFS);
		if (ret) {
			up_read(&dev_replace->rwsem);
			return ret;
		}

		if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
			zinfo->reserved_active_zones++;
		btrfs_dev_clear_active_zone(device, physical);
	}
	up_read(&dev_replace->rwsem);

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	clear_and_wake_up_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return 0;
}
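
/*
 * Finish the zones of @block_group. A thin wrapper around do_zone_finish()
 * with fully_written == false; a no-op on non-zoned filesystems. May return
 * -EAGAIN while the block group still has outstanding allocations or IO.
 */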
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if (!btrfs_is_zoned(block_group->fs_info))
		return 0;

	return do_zone_finish(block_group, false);
}
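
/*
 * Check whether one more zone can be activated for a block group with the
 * given profile @flags. A device without an active zone limit always
 * qualifies; otherwise SINGLE needs one and DUP needs two active zones left,
 * on top of the zones reserved for metadata/system use when activating a
 * DATA block group. Sets BTRFS_FS_NEED_ZONE_FINISH when no device qualifies.
 */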
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return true;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_info->chunk_mutex);
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;
		int reserved = 0;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones) {
			ret = true;
			break;
		}

		if (flags & BTRFS_BLOCK_GROUP_DATA)
			reserved = zinfo->reserved_active_zones;

		switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
		case 0: /* single */
			ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved));
			break;
		case BTRFS_BLOCK_GROUP_DUP:
			ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved));
			break;
		}
		if (ret)
			break;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
	mutex_unlock(&fs_info->chunk_mutex);

	if (!ret)
		set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);

	return ret;
}
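
/*
 * Called when a write at @logical + @length completes: if not even a
 * minimal allocation (one sector for data, one node for metadata) fits
 * between the end of the write and the zone capacity, the block group is
 * effectively full and its zones are finished right away.
 */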
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	u64 min_alloc_bytes;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	ASSERT(block_group);

	/* No MIXED_BG on zoned btrfs. */
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		min_alloc_bytes = fs_info->sectorsize;
	else
		min_alloc_bytes = fs_info->nodesize;

	/* Bail out if we can allocate more data from this block group. */
	if (logical + length + min_alloc_bytes <=
	    block_group->start + block_group->zone_capacity)
		goto out;

	do_zone_finish(block_group, true);

out:
	btrfs_put_block_group(block_group);
}
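
/*
 * Deferred counterpart of the above for metadata: wait for the last extent
 * buffer of the block group to finish writeback before finishing the zones,
 * then drop the references taken at scheduling time.
 */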
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
	struct btrfs_block_group *bg =
		container_of(work, struct btrfs_block_group, zone_finish_work);

	wait_on_extent_buffer_writeback(bg->last_eb);
	free_extent_buffer(bg->last_eb);
	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
	btrfs_put_block_group(bg);
}
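
/*
 * Schedule the work above when @eb is one of the last tree blocks that fits
 * in @bg, i.e. there is no room for another block of the same size behind
 * it within the zone capacity. Takes a reference on both @bg and @eb for
 * the work item.
 */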
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb)
{
	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
		return;

	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
			  bg->start);
		return;
	}

	/* For the work */
	btrfs_get_block_group(bg);
	atomic_inc(&eb->refs);
	bg->last_eb = eb;
	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
	queue_work(system_unbound_wq, &bg->zone_finish_work);
}
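
/*
 * Forget @bg as the current data relocation block group, if it is the one
 * recorded in fs_info->data_reloc_bg.
 */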
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}
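
/*
 * Drop the per-device zone report caches. The cache is presumably only
 * useful while zones are queried repeatedly (e.g. during mount), so it can
 * be freed afterwards to save memory.
 */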
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}
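
/*
 * Report whether block group reclaim should run: true once the percentage
 * of bytes used across all devices reaches fs_info->bg_reclaim_threshold.
 */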
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 used = 0;
	u64 total = 0;
	u64 factor;

	ASSERT(btrfs_is_zoned(fs_info));

	if (fs_info->bg_reclaim_threshold == 0)
		return false;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		total += device->disk_total_bytes;
		used += device->bytes_used;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	factor = div64_u64(used * 100, total);
	return factor >= fs_info->bg_reclaim_threshold;
}
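
/*
 * Once relocation has written everything it allocated from its dedicated
 * block group (the write at @logical + @length ends exactly at the
 * allocation pointer), clear the ZONED_DATA_RELOC bit so the block group
 * can be allocated from and zone-finished normally again.
 */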
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* It should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/*
		 * Now, release this block group for further allocations and
		 * zone finish.
		 */
		clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
			  &block_group->runtime_flags);
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}
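
/*
 * Finish the active block group with the least remaining capacity, skipping
 * SYSTEM, reserved, empty, and data-relocation block groups. Returns 1 if a
 * block group was finished, 0 if none was eligible, or a negative errno.
 */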
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_block_group *min_bg = NULL;
	u64 min_avail = U64_MAX;
	int ret;

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs,
			    active_bg_list) {
		u64 avail;

		spin_lock(&block_group->lock);
		if (block_group->reserved || block_group->alloc_offset == 0 ||
		    (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) ||
		    test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
			spin_unlock(&block_group->lock);
			continue;
		}

		avail = block_group->zone_capacity - block_group->alloc_offset;
		if (min_avail > avail) {
			if (min_bg)
				btrfs_put_block_group(min_bg);
			min_bg = block_group;
			min_avail = avail;
			btrfs_get_block_group(min_bg);
		}
		spin_unlock(&block_group->lock);
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);

	if (!min_bg)
		return 0;

	ret = btrfs_zone_finish(min_bg);
	btrfs_put_block_group(min_bg);

	return ret < 0 ? ret : 1;
}
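
/*
 * Try to activate one metadata/system block group of @space_info. If
 * nothing can be activated and @do_finish is set, finish the fullest active
 * block group to release active zones, then retry. Returns 1 on activation,
 * 0 when nothing could be (or needed to be) done, or a negative errno.
 */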
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				bool do_finish)
{
	struct btrfs_block_group *bg;
	int index;

	if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	for (;;) {
		int ret;
		bool need_finish = false;

		down_read(&space_info->groups_sem);
		for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) {
			list_for_each_entry(bg, &space_info->block_groups[index],
					    list) {
				if (!spin_trylock(&bg->lock))
					continue;
				if (btrfs_zoned_bg_is_full(bg) ||
				    test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
					     &bg->runtime_flags)) {
					spin_unlock(&bg->lock);
					continue;
				}
				spin_unlock(&bg->lock);

				if (btrfs_zone_activate(bg)) {
					up_read(&space_info->groups_sem);
					return 1;
				}

				need_finish = true;
			}
		}
		up_read(&space_info->groups_sem);

		if (!do_finish || !need_finish)
			break;

		ret = btrfs_zone_finish_one_bg(fs_info);
		if (ret == 0)
			break;
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * Reserve zones for one metadata block group, one tree-log block group, and one
 * system block group.
 */
void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_block_group *block_group;
	struct btrfs_device *device;
	/* Reserve zones for normal SINGLE metadata and tree-log block group. */
	unsigned int metadata_reserve = 2;
	/* Reserve a zone for SINGLE system block group. */
	unsigned int system_reserve = 1;

	if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags))
		return;

	/*
	 * This function is called from the mount context. So, there is no
	 * parallel process touching the bits. No need for read_seqretry().
	 */
	if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		metadata_reserve = 4;
	if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP)
		system_reserve = 2;

	/* Apply the reservation on all the devices. */
	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		device->zone_info->reserved_active_zones =
			metadata_reserve + system_reserve;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	/* Release reservation for currently active block groups. */
	spin_lock(&fs_info->zone_active_bgs_lock);
	list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
		struct btrfs_chunk_map *map = block_group->physical_map;

		if (!(block_group->flags &
		      (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
			continue;

		for (int i = 0; i < map->num_stripes; i++)
			map->stripes[i].dev->zone_info->reserved_active_zones--;
	}
	spin_unlock(&fs_info->zone_active_bgs_lock);
}