// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include <linux/vmalloc.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES   4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)
/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock: 0B (zone 0)
 * - first copy: 512G (zone starting at that offset)
 * - second copy: 4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET	(0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET	(512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET	(4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT	const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT	const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES	2
/*
 * Minimum of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES		(BTRFS_SUPER_MIRROR_MAX + 5)
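/* With BTRFS_SUPER_MIRROR_MAX == 3, the minimum above works out to 8 active zones. */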
/*
 * Minimum / maximum supported zone size. Currently, SMR disks have a zone
 * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
 * We do not expect the zone size to become larger than 8GiB or smaller than
 * 4MiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE		SZ_8G
#define BTRFS_MIN_ZONE_SIZE		SZ_4M
#define SUPER_INFO_SECTORS	((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
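/* BTRFS_SUPER_INFO_SIZE is 4KiB, so one superblock copy spans 8 512-byte sectors. */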
static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
	return (zone->cond == BLK_ZONE_COND_FULL) ||
		(zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct blk_zone *zones = data;

	memcpy(&zones[idx], zone, sizeof(*zone));

	return 0;
}
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;
	int i;

	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
		empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
		full[i] = sb_zone_is_full(&zones[i]);
	}

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          0          1
	 * In use[1]        x          x          1
	 * Full[1]          0          0          C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Unexpected state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}

	*wp_ret = sector << SECTOR_SHIFT;

	return 0;
}
/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	u64 zone = U64_MAX;

	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
	switch (mirror) {
	case 0: zone = 0; break;
	case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
	case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
	}

	ASSERT(zone <= U32_MAX);

	return (u32)zone;
}
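/*
 * Worked example: BTRFS_SB_LOG_FIRST_SHIFT == 39 and BTRFS_SB_LOG_SECOND_SHIFT
 * == 42, so with a 256MiB zone size (shift == 28) mirror 1 lands in zone
 * 1ULL << (39 - 28) == 2048 (512GiB / 256MiB) and mirror 2 in zone
 * 1ULL << (42 - 28) == 16384 (4TiB / 256MiB), matching the offsets above.
 */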
static inline sector_t zone_start_sector(u32 zone_number,
					 struct block_device *bdev)
{
	return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}
static inline u64 zone_start_physical(u32 zone_number,
				      struct btrfs_zoned_device_info *zone_info)
{
	return (u64)zone_number << zone_info->zone_size_shift;
}
/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into static sized chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
				struct blk_zone *zones, unsigned int nr_zones)
{
	const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
	sector_t bdev_size = bdev_nr_sectors(device->bdev);
	unsigned int i;

	pos >>= SECTOR_SHIFT;
	for (i = 0; i < nr_zones; i++) {
		zones[i].start = i * zone_sectors + pos;
		zones[i].len = zone_sectors;
		zones[i].capacity = zone_sectors;
		zones[i].wp = zones[i].start + zone_sectors;
		zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
		zones[i].cond = BLK_ZONE_COND_NOT_WP;

		if (zones[i].wp >= bdev_size) {
			i++;
			break;
		}
	}

	return i;
}
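/*
 * For example, assuming a 256MiB emulated zone size, a 10GiB non-zoned device
 * is reported here as 40 conventional BLK_ZONE_COND_NOT_WP zones, so the rest
 * of the zoned code can treat regular devices uniformly while still allowing
 * random writes to them.
 */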
static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
			       struct blk_zone *zones, unsigned int *nr_zones)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zno;
	int ret;

	if (!*nr_zones)
		return 0;

	if (!bdev_is_zoned(device->bdev)) {
		ret = emulate_report_zones(device, pos, zones, *nr_zones);
		*nr_zones = ret;
		return 0;
	}

	/* Check cache */
	if (zinfo->zone_cache) {
		unsigned int i;

		ASSERT(IS_ALIGNED(pos, zinfo->zone_size));
		zno = pos >> zinfo->zone_size_shift;
		/*
		 * We cannot report zones beyond the zone end. So, it is OK to
		 * cap *nr_zones at the zone end.
		 */
		*nr_zones = min_t(u32, *nr_zones, zinfo->nr_zones - zno);

		for (i = 0; i < *nr_zones; i++) {
			struct blk_zone *zone_info;

			zone_info = &zinfo->zone_cache[zno + i];
			if (!zone_info->len)
				break;
		}

		if (i == *nr_zones) {
			/* Cache hit on all the zones */
			memcpy(zones, zinfo->zone_cache + zno,
			       sizeof(*zinfo->zone_cache) * *nr_zones);
			return 0;
		}
	}

	ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
				  copy_zone_info_cb, zones);
	if (ret < 0) {
		btrfs_err_in_rcu(device->fs_info,
				 "zoned: failed to read zone %llu on %s (devid %llu)",
				 pos, rcu_str_deref(device->name),
				 device->devid);
		return ret;
	}
	*nr_zones = ret;
	if (!ret)
		return -EIO;

	/* Populate cache */
	if (zinfo->zone_cache)
		memcpy(zinfo->zone_cache + zno, zones,
		       sizeof(*zinfo->zone_cache) * *nr_zones);

	return 0;
}
/* The emulated zone size is determined from the size of device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_dev_extent *dext;
	int ret = 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}

	leaf = path->nodes[0];
	dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
	fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
	ret = 0;

out:
	btrfs_free_path(path);

	return ret;
}
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	int ret = 0;

	/* fs_info->zone_size might not be set yet. Use the incompat flag here. */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		/* We can skip reading of zone info for missing devices */
		if (!device->bdev)
			continue;

		ret = btrfs_get_dev_zone_info(device, true);
		if (ret)
			break;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	unsigned int max_active_zones;
	unsigned int nactive;
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	sector_t zone_sectors;
	char *model, *emulated;
	int ret;

	/*
	 * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
	 * be set yet.
	 */
	if (!btrfs_fs_incompat(fs_info, ZONED))
		return 0;

	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	device->zone_info = zone_info;

	if (!bdev_is_zoned(bdev)) {
		if (!fs_info->zone_size) {
			ret = calculate_emulated_zone_size(fs_info);
			if (ret)
				goto out;
		}

		ASSERT(fs_info->zone_size);
		zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
	} else {
		zone_sectors = bdev_zone_sectors(bdev);
	}

	/* Check if it's power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

	/* We reject devices with a zone size larger than 8GB */
	if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu larger than supported maximum %llu",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
		btrfs_err_in_rcu(fs_info,
		"zoned: %s: zone size %llu smaller than supported minimum %u",
				 rcu_str_deref(device->name),
				 zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
		ret = -EINVAL;
		goto out;
	}

	nr_sectors = bdev_nr_sectors(bdev);
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	max_active_zones = bdev_max_active_zones(bdev);
	if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
		btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
				 rcu_str_deref(device->name), max_active_zones,
				 BTRFS_MIN_ACTIVE_ZONES);
		ret = -EINVAL;
		goto out;
	}
	zone_info->max_active_zones = max_active_zones;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->active_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Enable zone cache only for a zoned device. On a non-zoned device, we
	 * fill the zone info with emulated CONVENTIONAL zones, so no need to
	 * use the cache.
	 */
	if (populate_cache && bdev_is_zoned(device->bdev)) {
		zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) *
						zone_info->nr_zones);
		if (!zone_info->zone_cache) {
			btrfs_err_in_rcu(device->fs_info,
				"zoned: failed to allocate zone cache for %s",
				rcu_str_deref(device->name));
			ret = -ENOMEM;
			goto out;
		}
	}

	/* Get zone types */
	nactive = 0;
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			switch (zones[i].cond) {
			case BLK_ZONE_COND_EMPTY:
				__set_bit(nreported, zone_info->empty_zones);
				break;
			case BLK_ZONE_COND_IMP_OPEN:
			case BLK_ZONE_COND_EXP_OPEN:
			case BLK_ZONE_COND_CLOSED:
				__set_bit(nreported, zone_info->active_zones);
				nactive++;
				break;
			}
			nreported++;
		}
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	if (max_active_zones) {
		if (nactive > max_active_zones) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: %u active zones on %s exceeds max_active_zones %u",
					 nactive, rcu_str_deref(device->name),
					 max_active_zones);
			ret = -EIO;
			goto out;
		}
		atomic_set(&zone_info->active_zones_left,
			   max_active_zones - nactive);
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		ret = btrfs_get_dev_zones(device,
					  zone_start_physical(sb_zone, zone_info),
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	switch (bdev_zoned_model(bdev)) {
	case BLK_ZONED_HM:
		model = "host-managed zoned";
		emulated = "";
		break;
	case BLK_ZONED_HA:
		model = "host-aware zoned";
		emulated = "";
		break;
	case BLK_ZONED_NONE:
		model = "regular";
		emulated = "emulated ";
		break;
	default:
		/* Just in case */
		btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
				 bdev_zoned_model(bdev),
				 rcu_str_deref(device->name));
		ret = -EOPNOTSUPP;
		goto out_free_zone_info;
	}

	btrfs_info_in_rcu(fs_info,
		"%s block device %s, %u %szones of %llu bytes",
		model, rcu_str_deref(device->name), zone_info->nr_zones,
		emulated, zone_info->zone_size);

	return 0;

out:
	kfree(zones);
out_free_zone_info:
	btrfs_destroy_dev_zone_info(device);

	return ret;
}
void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;

	if (!zone_info)
		return;

	bitmap_free(zone_info->active_zones);
	bitmap_free(zone_info->seq_zones);
	bitmap_free(zone_info->empty_zones);
	vfree(zone_info->zone_cache);
	kfree(zone_info);
	device->zone_info = NULL;
}
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
		       struct blk_zone *zone)
{
	unsigned int nr_zones = 1;
	int ret;

	ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
	if (ret != 0 || !nr_zones)
		return ret ? ret : -EIO;

	return 0;
}
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * A Host-Managed zoned device must be used as a zoned device.
		 * A Host-Aware zoned device and a non-zoned device can be
		 * treated as a zoned device, if ZONED flag is enabled in the
		 * superblock.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned) ||
		    (model == BLK_ZONED_NONE && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = device->zone_info;
			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
		}
		nr_devices++;
	}

	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * btrfs_create_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

	/*
	 * Check mount options here, because we might change fs_info->zoned
	 * from fs_info->zone_size.
	 */
	ret = btrfs_check_mountopts_zoned(fs_info);
	if (ret)
		goto out;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	if (!btrfs_is_zoned(info))
		return 0;

	/*
	 * Space cache writing is not COWed. Disable that to avoid write errors
	 * in sequential zones.
	 */
	if (btrfs_test_opt(info, SPACE_CACHE)) {
		btrfs_err(info, "zoned: space cache v1 is not supported");
		return -EINVAL;
	}

	if (btrfs_test_opt(info, NODATACOW)) {
		btrfs_err(info, "zoned: NODATACOW not supported");
		return -EINVAL;
	}

	return 0;
}
static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(sb_zone_is_full(reset));

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/*
		 * For READ, we want the previous one. Move write pointer to
		 * the end of a zone, if it is at the head of a zone.
		 */
		u64 zone_end = 0;

		if (wp == zones[0].start << SECTOR_SHIFT)
			zone_end = zones[1].start + zones[1].capacity;
		else if (wp == zones[1].start << SECTOR_SHIFT)
			zone_end = zones[0].start + zones[0].capacity;
		if (zone_end)
			wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
					BTRFS_SUPER_INFO_SIZE);

		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;

	return 0;
}
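/*
 * Summary of the logic above: for WRITE the returned bytenr is the next free
 * slot, and a stale zone that is about to be reused gets reset first; for READ
 * the position is stepped back by one superblock so the caller reads the most
 * recently written copy.
 */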
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	sector_t zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
			  u64 *bytenr_ret)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	u32 zone_num;

	/*
	 * For a zoned filesystem on a non-zoned block device, use the same
	 * super block locations as regular filesystem. Doing so, the super
	 * block can always be retrieved and the zoned flag of the volume
	 * detected from the super block information.
	 */
	if (!bdev_is_zoned(device->bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return -ENOENT;

	return sb_log_location(device->bdev,
			       &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
			       rw, bytenr_ret);
}
static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
				  int mirror)
{
	u32 zone_num;

	if (!zinfo)
		return false;

	zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
	if (zone_num + 1 >= zinfo->nr_zones)
		return false;

	if (!test_bit(zone_num, zinfo->seq_zones))
		return false;

	return true;
}
int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	struct blk_zone *zone;
	int i;

	if (!is_sb_log_zone(zinfo, mirror))
		return 0;

	zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
	for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
		/* Advance the next zone */
		if (zone->cond == BLK_ZONE_COND_FULL) {
			zone++;
			continue;
		}

		if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->cond = BLK_ZONE_COND_IMP_OPEN;

		zone->wp += SUPER_INFO_SECTORS;

		if (sb_zone_is_full(zone)) {
			/*
			 * No room left to write new superblock. Since
			 * superblock is written with REQ_SYNC, it is safe to
			 * finish the zone now.
			 *
			 * If the write pointer is exactly at the capacity,
			 * explicit ZONE_FINISH is not necessary.
			 */
			if (zone->wp != zone->start + zone->capacity) {
				int ret;

				ret = blkdev_zone_mgmt(device->bdev,
						REQ_OP_ZONE_FINISH, zone->start,
						zone->len, GFP_NOFS);
				if (ret)
					return ret;
			}

			zone->wp = zone->start + zone->len;
			zone->cond = BLK_ZONE_COND_FULL;
		}

		return 0;
	}

	/* All the zones are FULL. Should not reach here. */
	ASSERT(0);
	return -EIO;
}
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	sector_t zone_sectors;
	sector_t nr_sectors;
	u8 zone_sectors_shift;
	u32 sb_zone;
	u32 nr_zones;

	zone_sectors = bdev_zone_sectors(bdev);
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
				zone_start_sector(sb_zone, bdev),
				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}
/*
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:	the device to allocate a region on
 * @hole_start:	the position of the hole to allocate the region
 * @num_bytes:	size of wanted region
 * @hole_end:	the end of the hole
 * @return:	position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
				 u64 hole_end, u64 num_bytes)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	u64 nzones = num_bytes >> shift;
	u64 pos = hole_start;
	u64 begin, end;
	bool have_sb;
	int i;

	ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

	while (pos < hole_end) {
		begin = pos >> shift;
		end = begin + nzones;

		if (end > zinfo->nr_zones)
			return hole_end;

		/* Check if zones in the region are all empty */
		if (btrfs_dev_is_sequential(device, pos) &&
		    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
			pos += zinfo->zone_size;
			continue;
		}

		have_sb = false;
		for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
			u32 sb_zone;
			u64 sb_pos;

			sb_zone = sb_zone_number(shift, i);
			if (!(end <= sb_zone ||
			      sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
				have_sb = true;
				pos = zone_start_physical(
					sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
				break;
			}

			/* We also need to exclude regular superblock positions */
			sb_pos = btrfs_sb_offset(i);
			if (!(pos + num_bytes <= sb_pos ||
			      sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
				have_sb = true;
				pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
					    zinfo->zone_size);
				break;
			}
		}
		if (!have_sb)
			break;
	}

	return pos;
}
static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return true;

	if (!test_bit(zno, zone_info->active_zones)) {
		/* Active zone left? */
		if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
			return false;
		if (test_and_set_bit(zno, zone_info->active_zones)) {
			/* Someone already set the bit */
			atomic_inc(&zone_info->active_zones_left);
		}
	}

	return true;
}
static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zone_info = device->zone_info;
	unsigned int zno = (pos >> zone_info->zone_size_shift);

	/* We can use any number of zones */
	if (zone_info->max_active_zones == 0)
		return;

	if (test_and_clear_bit(zno, zone_info->active_zones))
		atomic_inc(&zone_info->active_zones_left);
}
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			    u64 length, u64 *bytes)
{
	int ret;

	*bytes = 0;
	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
			       GFP_NOFS);
	if (ret)
		return ret;

	*bytes = length;
	while (length) {
		btrfs_dev_set_zone_empty(device, physical);
		btrfs_dev_clear_active_zone(device, physical);
		physical += device->zone_info->zone_size;
		length -= device->zone_info->zone_size;
	}

	return 0;
}
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;
	const u8 shift = zinfo->zone_size_shift;
	unsigned long begin = start >> shift;
	unsigned long end = (start + size) >> shift;
	u64 pos;
	int ret;

	ASSERT(IS_ALIGNED(start, zinfo->zone_size));
	ASSERT(IS_ALIGNED(size, zinfo->zone_size));

	if (end > zinfo->nr_zones)
		return -ERANGE;

	/* All the zones are conventional */
	if (find_next_bit(zinfo->seq_zones, begin, end) == end)
		return 0;

	/* All the zones are sequential and empty */
	if (find_next_zero_bit(zinfo->seq_zones, begin, end) == end &&
	    find_next_zero_bit(zinfo->empty_zones, begin, end) == end)
		return 0;

	for (pos = start; pos < start + size; pos += zinfo->zone_size) {
		u64 reset_bytes;

		if (!btrfs_dev_is_sequential(device, pos) ||
		    btrfs_dev_is_empty_zone(device, pos))
			continue;

		/* Free regions should be empty */
		btrfs_warn_in_rcu(
			device->fs_info,
		"zoned: resetting device %s (devid %llu) zone %llu for allocation",
			rcu_str_deref(device->name), device->devid, pos >> shift);
		WARN_ON_ONCE(1);

		ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
					      &reset_bytes);
		if (ret)
			return ret;
	}

	return 0;
}
/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the
 * end of the highest addressed extent in the block group as an allocation
 * offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
				   u64 *offset_ret)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	u64 length;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = cache->start + cache->length;
	key.type = 0;
	key.offset = 0;

	root = btrfs_extent_root(fs_info, key.objectid);
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	/* We should not find the exact match */
	if (!ret)
		ret = -EUCLEAN;
	if (ret < 0)
		goto out;

	ret = btrfs_previous_extent_item(root, path, cache->start);
	if (ret) {
		if (ret == 1) {
			ret = 0;
			*offset_ret = 0;
		}
		goto out;
	}

	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

	if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
		length = found_key.offset;
	else
		length = fs_info->nodesize;

	if (!(found_key.objectid >= cache->start &&
	      found_key.objectid + length <= cache->start + cache->length)) {
		ret = -EUCLEAN;
		goto out;
	}
	*offset_ret = found_key.objectid + length - cache->start;
	ret = 0;

out:
	btrfs_free_path(path);
	return ret;
}
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	struct btrfs_fs_info *fs_info = cache->fs_info;
	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
	struct extent_map *em;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 logical = cache->start;
	u64 length = cache->length;
	int ret = 0;
	int i;
	unsigned int nofs_flag;
	u64 *alloc_offsets = NULL;
	u64 *caps = NULL;
	u64 *physical = NULL;
	unsigned long *active = NULL;
	u64 last_alloc = 0;
	u32 num_sequential = 0, num_conventional = 0;

	if (!btrfs_is_zoned(fs_info))
		return 0;

	/* Sanity check */
	if (!IS_ALIGNED(length, fs_info->zone_size)) {
		btrfs_err(fs_info,
		"zoned: block group %llu len %llu unaligned to zone size %llu",
			  logical, length, fs_info->zone_size);
		return -EIO;
	}

	/* Get the chunk mapping */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, logical, length);
	read_unlock(&em_tree->lock);

	if (!em)
		return -EINVAL;

	map = em->map_lookup;

	cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
	if (!cache->physical_map) {
		ret = -ENOMEM;
		goto out;
	}

	alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
	if (!alloc_offsets) {
		ret = -ENOMEM;
		goto out;
	}

	caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
	if (!caps) {
		ret = -ENOMEM;
		goto out;
	}

	physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS);
	if (!physical) {
		ret = -ENOMEM;
		goto out;
	}

	active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
	if (!active) {
		ret = -ENOMEM;
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		bool is_sequential;
		struct blk_zone zone;
		struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
		int dev_replace_is_ongoing = 0;

		device = map->stripes[i].dev;
		physical[i] = map->stripes[i].physical;

		if (device->bdev == NULL) {
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		}

		is_sequential = btrfs_dev_is_sequential(device, physical[i]);
		if (is_sequential)
			num_sequential++;
		else
			num_conventional++;

		if (!is_sequential) {
			alloc_offsets[i] = WP_CONVENTIONAL;
			continue;
		}

		/*
		 * This zone will be used for allocation, so mark this zone
		 * non-empty.
		 */
		btrfs_dev_clear_zone_empty(device, physical[i]);

		down_read(&dev_replace->rwsem);
		dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
		if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
			btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]);
		up_read(&dev_replace->rwsem);

		/*
		 * The group is mapped to a sequential zone. Get the zone write
		 * pointer to determine the allocation offset within the zone.
		 */
		WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size));
		nofs_flag = memalloc_nofs_save();
		ret = btrfs_get_dev_zone(device, physical[i], &zone);
		memalloc_nofs_restore(nofs_flag);
		if (ret == -EIO || ret == -EOPNOTSUPP) {
			ret = 0;
			alloc_offsets[i] = WP_MISSING_DEV;
			continue;
		} else if (ret) {
			goto out;
		}

		if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
			btrfs_err_in_rcu(fs_info,
	"zoned: unexpected conventional zone %llu on device %s (devid %llu)",
				zone.start << SECTOR_SHIFT,
				rcu_str_deref(device->name), device->devid);
			ret = -EIO;
			goto out;
		}

		caps[i] = (zone.capacity << SECTOR_SHIFT);

		switch (zone.cond) {
		case BLK_ZONE_COND_OFFLINE:
		case BLK_ZONE_COND_READONLY:
			btrfs_err(fs_info,
		"zoned: offline/readonly zone %llu on device %s (devid %llu)",
				  physical[i] >> device->zone_info->zone_size_shift,
				  rcu_str_deref(device->name), device->devid);
			alloc_offsets[i] = WP_MISSING_DEV;
			break;
		case BLK_ZONE_COND_EMPTY:
			alloc_offsets[i] = 0;
			break;
		case BLK_ZONE_COND_FULL:
			alloc_offsets[i] = caps[i];
			break;
		default:
			/* Partially used zone */
			alloc_offsets[i] =
					((zone.wp - zone.start) << SECTOR_SHIFT);
			__set_bit(i, active);
			break;
		}

		/*
		 * Consider a zone as active if we can allow any number of
		 * active zones.
		 */
		if (!device->zone_info->max_active_zones)
			__set_bit(i, active);
	}

	if (num_sequential > 0)
		cache->seq_zone = true;

	if (num_conventional > 0) {
		/*
		 * Avoid calling calculate_alloc_pointer() for new BG. It
		 * is no use for new BG. It must always be 0.
		 *
		 * Also, we have a lock chain of extent buffer lock ->
		 * chunk mutex. For new BG, this function is called from
		 * btrfs_make_block_group() which is already taking the
		 * chunk mutex. Thus, we cannot call
		 * calculate_alloc_pointer() which takes extent buffer
		 * locks to avoid deadlock.
		 */

		/* Zone capacity is always zone size in emulation */
		cache->zone_capacity = cache->length;
		if (new) {
			cache->alloc_offset = 0;
			goto out;
		}
		ret = calculate_alloc_pointer(cache, &last_alloc);
		if (ret || map->num_stripes == num_conventional) {
			if (!ret)
				cache->alloc_offset = last_alloc;
			else
				btrfs_err(fs_info,
			"zoned: failed to determine allocation offset of bg %llu",
					  cache->start);
			goto out;
		}
	}

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case 0: /* single */
		if (alloc_offsets[0] == WP_MISSING_DEV) {
			btrfs_err(fs_info,
			"zoned: cannot recover write pointer for zone %llu",
				  physical[0]);
			ret = -EIO;
			goto out;
		}
		cache->alloc_offset = alloc_offsets[0];
		cache->zone_capacity = caps[0];
		cache->zone_is_active = test_bit(0, active);
		break;
	case BTRFS_BLOCK_GROUP_DUP:
		if (map->type & BTRFS_BLOCK_GROUP_DATA) {
			btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg");
			ret = -EINVAL;
			goto out;
		}
		if (alloc_offsets[0] == WP_MISSING_DEV) {
			btrfs_err(fs_info,
			"zoned: cannot recover write pointer for zone %llu",
				  physical[0]);
			ret = -EIO;
			goto out;
		}
		if (alloc_offsets[1] == WP_MISSING_DEV) {
			btrfs_err(fs_info,
			"zoned: cannot recover write pointer for zone %llu",
				  physical[1]);
			ret = -EIO;
			goto out;
		}
		if (alloc_offsets[0] != alloc_offsets[1]) {
			btrfs_err(fs_info,
		"zoned: write pointer offset mismatch of zones in DUP profile");
			ret = -EIO;
			goto out;
		}
		if (test_bit(0, active) != test_bit(1, active)) {
			if (!btrfs_zone_activate(cache)) {
				ret = -EIO;
				goto out;
			}
		} else {
			cache->zone_is_active = test_bit(0, active);
		}
		cache->alloc_offset = alloc_offsets[0];
		cache->zone_capacity = min(caps[0], caps[1]);
		break;
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID0:
	case BTRFS_BLOCK_GROUP_RAID10:
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		/* non-single profiles are not supported yet */
	default:
		btrfs_err(fs_info, "zoned: profile %s not yet supported",
			  btrfs_bg_type_to_raid_name(map->type));
		ret = -EINVAL;
		goto out;
	}

	if (cache->zone_is_active) {
		btrfs_get_block_group(cache);
		spin_lock(&fs_info->zone_active_bgs_lock);
		list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
		spin_unlock(&fs_info->zone_active_bgs_lock);
	}

out:
	if (cache->alloc_offset > fs_info->zone_size) {
		btrfs_err(fs_info,
			"zoned: invalid write pointer %llu in block group %llu",
			cache->alloc_offset, cache->start);
		ret = -EIO;
	}

	if (cache->alloc_offset > cache->zone_capacity) {
		btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
			  cache->alloc_offset, cache->zone_capacity,
			  cache->start);
		ret = -EIO;
	}

	/* An extent is allocated after the write pointer */
	if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
		btrfs_err(fs_info,
			  "zoned: got wrong write pointer in BG %llu: %llu > %llu",
			  logical, last_alloc, cache->alloc_offset);
		ret = -EIO;
	}

	if (!ret)
		cache->meta_write_pointer = cache->alloc_offset + cache->start;

	if (ret) {
		kfree(cache->physical_map);
		cache->physical_map = NULL;
	}
	bitmap_free(active);
	kfree(physical);
	kfree(caps);
	kfree(alloc_offsets);
	free_extent_map(em);

	return ret;
}
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
	u64 unusable, free;

	if (!btrfs_is_zoned(cache->fs_info))
		return;

	WARN_ON(cache->bytes_super != 0);
	unusable = (cache->alloc_offset - cache->used) +
		   (cache->length - cache->zone_capacity);
	free = cache->zone_capacity - cache->alloc_offset;

	/* We only need ->free_space in ALLOC_SEQ block groups */
	cache->last_byte_to_unpin = (u64)-1;
	cache->cached = BTRFS_CACHE_FINISHED;
	cache->free_space_ctl->free_space = free;
	cache->zone_unusable = unusable;
}
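/*
 * For example (illustrative numbers): a block group with length 1GiB,
 * zone_capacity 900MiB, alloc_offset 600MiB and used 500MiB ends up with
 * unusable = (600 - 500) + (1024 - 900) = 224MiB and free = 900 - 600 = 300MiB.
 */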
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
			    struct extent_buffer *eb)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;

	if (!btrfs_is_zoned(fs_info) ||
	    btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
	    !list_empty(&eb->release_list))
		return;

	set_extent_buffer_dirty(eb);
	set_extent_bits_nowait(&trans->dirty_pages, eb->start,
			       eb->start + eb->len - 1, EXTENT_DIRTY);
	memzero_extent_buffer(eb, 0, eb->len);
	set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);

	spin_lock(&trans->releasing_ebs_lock);
	list_add_tail(&eb->release_list, &trans->releasing_ebs);
	spin_unlock(&trans->releasing_ebs_lock);
	atomic_inc(&eb->refs);
}
void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
	spin_lock(&trans->releasing_ebs_lock);
	while (!list_empty(&trans->releasing_ebs)) {
		struct extent_buffer *eb;

		eb = list_first_entry(&trans->releasing_ebs,
				      struct extent_buffer, release_list);
		list_del_init(&eb->release_list);
		free_extent_buffer(eb);
	}
	spin_unlock(&trans->releasing_ebs_lock);
}
bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_group *cache;
	bool ret;

	if (!btrfs_is_zoned(fs_info))
		return false;

	if (!is_data_inode(&inode->vfs_inode))
		return false;

	/*
	 * Using REQ_OP_ZONE_APPEND for relocation can break assumptions on the
	 * extent layout the relocation code has.
	 * Furthermore we have set aside our own block-group from which only the
	 * relocation "process" can allocate and make sure only one process at a
	 * time can add pages to an extent that gets relocated, so it's safe to
	 * use regular REQ_OP_WRITE for this special case.
	 */
	if (btrfs_is_data_reloc_root(inode->root))
		return false;

	cache = btrfs_lookup_block_group(fs_info, start);
	ASSERT(cache);
	if (!cache)
		return false;

	ret = cache->seq_zone;
	btrfs_put_block_group(cache);

	return ret;
}
void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
				 struct bio *bio)
{
	struct btrfs_ordered_extent *ordered;
	const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;

	if (bio_op(bio) != REQ_OP_ZONE_APPEND)
		return;

	ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
	if (WARN_ON(!ordered))
		return;

	ordered->physical = physical;
	ordered->bdev = bio->bi_bdev;

	btrfs_put_ordered_extent(ordered);
}
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
	struct btrfs_inode *inode = BTRFS_I(ordered->inode);
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	struct btrfs_ordered_sum *sum;
	u64 orig_logical = ordered->disk_bytenr;
	u64 *logical = NULL;
	int nr, stripe_len;

	/* Zoned devices should not have partitions. So, we can assume it is 0 */
	ASSERT(!bdev_is_partition(ordered->bdev));
	if (WARN_ON(!ordered->bdev))
		return;

	if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
				     ordered->physical, &logical, &nr,
				     &stripe_len)))
		goto out;

	if (orig_logical == *logical)
		goto out;

	ordered->disk_bytenr = *logical;

	em_tree = &inode->extent_tree;
	write_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, ordered->file_offset,
				   ordered->num_bytes);
	em->block_start = *logical;
	free_extent_map(em);
	write_unlock(&em_tree->lock);

	list_for_each_entry(sum, &ordered->list, list) {
		if (*logical < orig_logical)
			sum->bytenr -= orig_logical - *logical;
		else
			sum->bytenr += *logical - orig_logical;
	}

out:
	kfree(logical);
}
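/*
 * Background note: with REQ_OP_ZONE_APPEND the device chooses the final write
 * location within the zone and reports it only on bio completion, which is why
 * the ordered extent, the cached extent map and the checksum bytenrs recorded
 * at submission time are rewritten above to match the physical location that
 * was actually used.
 */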
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb,
				    struct btrfs_block_group **cache_ret)
{
	struct btrfs_block_group *cache;
	bool ret = true;

	if (!btrfs_is_zoned(fs_info))
		return true;

	cache = btrfs_lookup_block_group(fs_info, eb->start);
	if (!cache)
		return true;

	if (cache->meta_write_pointer != eb->start) {
		btrfs_put_block_group(cache);
		cache = NULL;
		ret = false;
	} else {
		cache->meta_write_pointer = eb->start + eb->len;
	}

	*cache_ret = cache;

	return ret;
}

void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				     struct extent_buffer *eb)
{
	if (!btrfs_is_zoned(eb->fs_info) || !cache)
		return;

	ASSERT(cache->meta_write_pointer == eb->start + eb->len);
	cache->meta_write_pointer = eb->start;
}
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
{
	if (!btrfs_dev_is_sequential(device, physical))
		return -EOPNOTSUPP;

	return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
				    length >> SECTOR_SHIFT, GFP_NOFS, 0);
}
static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
			  struct blk_zone *zone)
{
	struct btrfs_io_context *bioc = NULL;
	u64 mapped_length = PAGE_SIZE;
	unsigned int nofs_flag;
	int nmirrors;
	int i, ret;

	ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
			       &mapped_length, &bioc);
	if (ret || !bioc || mapped_length < PAGE_SIZE) {
		ret = -EIO;
		goto out_put_bioc;
	}

	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EINVAL;
		goto out_put_bioc;
	}

	nofs_flag = memalloc_nofs_save();
	nmirrors = (int)bioc->num_stripes;
	for (i = 0; i < nmirrors; i++) {
		u64 physical = bioc->stripes[i].physical;
		struct btrfs_device *dev = bioc->stripes[i].dev;

		/* Missing device */
		if (!dev->bdev)
			continue;

		ret = btrfs_get_dev_zone(dev, physical, zone);
		/* Failing device */
		if (ret == -EIO || ret == -EOPNOTSUPP)
			continue;
		break;
	}
	memalloc_nofs_restore(nofs_flag);
out_put_bioc:
	btrfs_put_bioc(bioc);

	return ret;
}
/*
 * Synchronize write pointer in a zone at @physical_start on @tgt_dev, by
 * filling zeros between @physical_pos to a write pointer of dev-replace
 * source device.
 */
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
				  u64 physical_start, u64 physical_pos)
{
	struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
	struct blk_zone zone;
	u64 length;
	u64 wp;
	int ret;

	if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
		return 0;

	ret = read_zone_info(fs_info, logical, &zone);
	if (ret)
		return ret;

	wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);

	if (physical_pos == wp)
		return 0;

	if (physical_pos > wp)
		return -EUCLEAN;

	length = wp - physical_pos;
	return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
}
struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
					    u64 logical, u64 length)
{
	struct btrfs_device *device;
	struct extent_map *em;
	struct map_lookup *map;

	em = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(em))
		return ERR_CAST(em);

	map = em->map_lookup;
	/* We only support single profile for now */
	device = map->stripes[0].dev;

	free_extent_map(em);

	return device;
}
/*
 * Activate block group and underlying device zones
 *
 * @block_group: the block group to activate
 *
 * Return: true on success, false otherwise
 */
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct map_lookup *map;
	struct btrfs_device *device;
	u64 physical;
	bool ret;
	int i;

	if (!btrfs_is_zoned(block_group->fs_info))
		return true;

	map = block_group->physical_map;

	spin_lock(&block_group->lock);
	if (block_group->zone_is_active) {
		ret = true;
		goto out_unlock;
	}

	/* No space left */
	if (btrfs_zoned_bg_is_full(block_group)) {
		ret = false;
		goto out_unlock;
	}

	for (i = 0; i < map->num_stripes; i++) {
		device = map->stripes[i].dev;
		physical = map->stripes[i].physical;

		if (device->zone_info->max_active_zones == 0)
			continue;

		if (!btrfs_dev_set_active_zone(device, physical)) {
			/* Cannot activate the zone */
			ret = false;
			goto out_unlock;
		}
	}

	/* Successfully activated all the zones */
	block_group->zone_is_active = 1;
	spin_unlock(&block_group->lock);

	/* For the active block group list */
	btrfs_get_block_group(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	return true;

out_unlock:
	spin_unlock(&block_group->lock);
	return ret;
}
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct map_lookup *map;
	int ret = 0;
	int i;

	spin_lock(&block_group->lock);
	if (!block_group->zone_is_active) {
		spin_unlock(&block_group->lock);
		return 0;
	}

	/* Check if we have unwritten allocated space */
	if ((block_group->flags &
	     (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
	    block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
		spin_unlock(&block_group->lock);
		return -EAGAIN;
	}

	/*
	 * If we are sure that the block group is full (= no more room left for
	 * new allocation) and the IO for the last usable block is completed, we
	 * don't need to wait for the other IOs. This holds because we ensure
	 * the sequential IO submissions using the ZONE_APPEND command for data
	 * and block_group->meta_write_pointer for metadata.
	 */
	if (!fully_written) {
		spin_unlock(&block_group->lock);

		ret = btrfs_inc_block_group_ro(block_group, false);
		if (ret)
			return ret;

		/* Ensure all writes in this block group finish */
		btrfs_wait_block_group_reservations(block_group);
		/* No need to wait for NOCOW writers. Zoned mode does not allow that */
		btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
					 block_group->length);

		spin_lock(&block_group->lock);

		/*
		 * Bail out if someone already deactivated the block group, or
		 * allocated space is left in the block group.
		 */
		if (!block_group->zone_is_active) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return 0;
		}

		if (block_group->reserved) {
			spin_unlock(&block_group->lock);
			btrfs_dec_block_group_ro(block_group);
			return -EAGAIN;
		}
	}

	block_group->zone_is_active = 0;
	block_group->alloc_offset = block_group->zone_capacity;
	block_group->free_space_ctl->free_space = 0;
	btrfs_clear_treelog_bg(block_group);
	btrfs_clear_data_reloc_bg(block_group);
	spin_unlock(&block_group->lock);

	map = block_group->physical_map;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 physical = map->stripes[i].physical;

		if (device->zone_info->max_active_zones == 0)
			continue;

		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
				       physical >> SECTOR_SHIFT,
				       device->zone_info->zone_size >> SECTOR_SHIFT,
				       GFP_NOFS);

		if (ret)
			return ret;

		btrfs_dev_clear_active_zone(device, physical);
	}

	if (!fully_written)
		btrfs_dec_block_group_ro(block_group);

	spin_lock(&fs_info->zone_active_bgs_lock);
	ASSERT(!list_empty(&block_group->active_bg_list));
	list_del_init(&block_group->active_bg_list);
	spin_unlock(&fs_info->zone_active_bgs_lock);

	/* For active_bg_list */
	btrfs_put_block_group(block_group);

	return 0;
}
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	if (!btrfs_is_zoned(block_group->fs_info))
		return 0;

	return do_zone_finish(block_group, false);
}
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
	struct btrfs_fs_info *fs_info = fs_devices->fs_info;
	struct btrfs_device *device;
	bool ret = false;

	if (!btrfs_is_zoned(fs_info))
		return true;

	/* Check if there is a device with active zones left */
	mutex_lock(&fs_info->chunk_mutex);
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		struct btrfs_zoned_device_info *zinfo = device->zone_info;

		if (!device->bdev)
			continue;

		if (!zinfo->max_active_zones ||
		    atomic_read(&zinfo->active_zones_left)) {
			ret = true;
			break;
		}
	}
	mutex_unlock(&fs_info->chunk_mutex);

	return ret;
}
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
	struct btrfs_block_group *block_group;
	u64 min_alloc_bytes;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	ASSERT(block_group);

	/* No MIXED_BG on zoned btrfs. */
	if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
		min_alloc_bytes = fs_info->sectorsize;
	else
		min_alloc_bytes = fs_info->nodesize;

	/* Bail out if we can allocate more data from this block group. */
	if (logical + length + min_alloc_bytes <=
	    block_group->start + block_group->zone_capacity)
		goto out;

	do_zone_finish(block_group, true);

out:
	btrfs_put_block_group(block_group);
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
	struct btrfs_block_group *bg =
		container_of(work, struct btrfs_block_group, zone_finish_work);

	wait_on_extent_buffer_writeback(bg->last_eb);
	free_extent_buffer(bg->last_eb);
	btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
	btrfs_put_block_group(bg);
}
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb)
{
	if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
		return;

	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
		btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
			  bg->start);
		return;
	}

	/* For the work */
	btrfs_get_block_group(bg);
	atomic_inc(&eb->refs);
	bg->last_eb = eb;
	INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
	queue_work(system_unbound_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = bg->fs_info;

	spin_lock(&fs_info->relocation_bg_lock);
	if (fs_info->data_reloc_bg == bg->start)
		fs_info->data_reloc_bg = 0;
	spin_unlock(&fs_info->relocation_bg_lock);
}
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;

	if (!btrfs_is_zoned(fs_info))
		return;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (device->zone_info) {
			vfree(device->zone_info->zone_cache);
			device->zone_info->zone_cache = NULL;
		}
	}
	mutex_unlock(&fs_devices->device_list_mutex);
}
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 used = 0;
	u64 total = 0;
	u64 factor;

	ASSERT(btrfs_is_zoned(fs_info));

	if (fs_info->bg_reclaim_threshold == 0)
		return false;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		if (!device->bdev)
			continue;

		total += device->disk_total_bytes;
		used += device->bytes_used;
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	factor = div64_u64(used * 100, total);
	return factor >= fs_info->bg_reclaim_threshold;
}
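/*
 * For example: with 75GiB used out of 100GiB of disk_total_bytes, factor is 75,
 * so reclaim is suggested once bg_reclaim_threshold is 75 or lower (but not 0).
 */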
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 length)
{
	struct btrfs_block_group *block_group;

	if (!btrfs_is_zoned(fs_info))
		return;

	block_group = btrfs_lookup_block_group(fs_info, logical);
	/* It should be called on a previous data relocation block group. */
	ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA));

	spin_lock(&block_group->lock);
	if (!block_group->zoned_data_reloc_ongoing)
		goto out;

	/* All relocation extents are written. */
	if (block_group->start + block_group->alloc_offset == logical + length) {
		/* Now, release this block group for further allocations. */
		block_group->zoned_data_reloc_ongoing = 0;
	}

out:
	spin_unlock(&block_group->lock);
	btrfs_put_block_group(block_group);
}