]> git.ipfire.org Git - people/ms/linux.git/blame - fs/btrfs/zoned.c
btrfs: zoned: defer loading zone info after opening trees
[people/ms/linux.git] / fs / btrfs / zoned.c
CommitLineData
5b316468
NA
1// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/slab.h>
4#include <linux/blkdev.h>
5#include "ctree.h"
6#include "volumes.h"
7#include "zoned.h"
8#include "rcu-string.h"
9
10/* Maximum number of zones to report per blkdev_report_zones() call */
11#define BTRFS_REPORT_NR_ZONES 4096
12
12659251
NA
13/* Number of superblock log zones */
14#define BTRFS_NR_SB_LOG_ZONES 2
15
5b316468
NA
16static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
17{
18 struct blk_zone *zones = data;
19
20 memcpy(&zones[idx], zone, sizeof(*zone));
21
22 return 0;
23}
24
12659251
NA
/*
 * Determine the byte position of the write pointer within the two-zone
 * superblock log described by @zones.
 *
 * Returns 0 and sets *wp_ret on success, -ENOENT when both zones are empty
 * (no superblock has been written yet; *wp_ret is set to the start of
 * zones[0]), or -EUCLEAN when the zone states are inconsistent.
 */
static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
			    u64 *wp_ret)
{
	bool empty[BTRFS_NR_SB_LOG_ZONES];
	bool full[BTRFS_NR_SB_LOG_ZONES];
	sector_t sector;

	/* Conventional zones must be handled by the caller, not here. */
	ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
	       zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);

	empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
	empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
	full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
	full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);

	/*
	 * Possible states of log buffer zones
	 *
	 *           Empty[0]  In use[0]  Full[0]
	 * Empty[1]         *          x        0
	 * In use[1]        0          x        0
	 * Full[1]          1          1        C
	 *
	 * Log position:
	 *   *: Special case, no superblock is written
	 *   0: Use write pointer of zones[0]
	 *   1: Use write pointer of zones[1]
	 *   C: Compare super blocks from zones[0] and zones[1], use the latest
	 *      one determined by generation
	 *   x: Invalid state
	 */

	if (empty[0] && empty[1]) {
		/* Special case to distinguish no superblock to read */
		*wp_ret = zones[0].start << SECTOR_SHIFT;
		return -ENOENT;
	} else if (full[0] && full[1]) {
		/* Compare two super blocks */
		struct address_space *mapping = bdev->bd_inode->i_mapping;
		struct page *page[BTRFS_NR_SB_LOG_ZONES];
		struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
		int i;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
			u64 bytenr;

			/* The last superblock sits at the very end of each zone. */
			bytenr = ((zones[i].start + zones[i].len)
				   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

			page[i] = read_cache_page_gfp(mapping,
					bytenr >> PAGE_SHIFT, GFP_NOFS);
			if (IS_ERR(page[i])) {
				/* Release the page already read before bailing out. */
				if (i == 1)
					btrfs_release_disk_super(super[0]);
				return PTR_ERR(page[i]);
			}
			super[i] = page_address(page[i]);
		}

		/*
		 * Point at the start of the zone holding the older generation;
		 * the caller derives both the newest superblock location and
		 * the next write target from it.
		 */
		if (super[0]->generation > super[1]->generation)
			sector = zones[1].start;
		else
			sector = zones[0].start;

		for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
			btrfs_release_disk_super(super[i]);
	} else if (!full[0] && (empty[1] || full[1])) {
		sector = zones[0].wp;
	} else if (full[0]) {
		sector = zones[1].wp;
	} else {
		return -EUCLEAN;
	}
	*wp_ret = sector << SECTOR_SHIFT;
	return 0;
}
101
/*
 * The following zones are reserved as the circular buffer on ZONED btrfs.
 *  - The primary superblock: zones 0 and 1
 *  - The first copy: zones 16 and 17
 *  - The second copy: the zone at 256GB, capped at zone number 1024,
 *    and the following one
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
	ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);

	switch (mirror) {
	case 0: return 0;
	case 1: return 16;
	case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
	}

	/* Not reached for valid mirrors (see ASSERT above). */
	return 0;
}
121
5b316468
NA
122static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
123 struct blk_zone *zones, unsigned int *nr_zones)
124{
125 int ret;
126
127 if (!*nr_zones)
128 return 0;
129
130 ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
131 copy_zone_info_cb, zones);
132 if (ret < 0) {
133 btrfs_err_in_rcu(device->fs_info,
134 "zoned: failed to read zone %llu on %s (devid %llu)",
135 pos, rcu_str_deref(device->name),
136 device->devid);
137 return ret;
138 }
139 *nr_zones = ret;
140 if (!ret)
141 return -EIO;
142
143 return 0;
144}
145
73651042
NA
146int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
147{
148 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
149 struct btrfs_device *device;
150 int ret = 0;
151
152 /* fs_info->zone_size might not set yet. Use the incomapt flag here. */
153 if (!btrfs_fs_incompat(fs_info, ZONED))
154 return 0;
155
156 mutex_lock(&fs_devices->device_list_mutex);
157 list_for_each_entry(device, &fs_devices->devices, dev_list) {
158 /* We can skip reading of zone info for missing devices */
159 if (!device->bdev)
160 continue;
161
162 ret = btrfs_get_dev_zone_info(device);
163 if (ret)
164 break;
165 }
166 mutex_unlock(&fs_devices->device_list_mutex);
167
168 return ret;
169}
170
5b316468
NA
/*
 * Load the zone information of @device into a freshly allocated
 * btrfs_zoned_device_info: zone size and count, per-zone sequential/empty
 * bitmaps, and the cached zones of the superblock log areas.
 *
 * Returns 0 on success (including for non-zoned devices, which need no
 * zone info) or a negative errno on failure.
 */
int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
	struct btrfs_zoned_device_info *zone_info = NULL;
	struct block_device *bdev = device->bdev;
	struct request_queue *queue = bdev_get_queue(bdev);
	sector_t nr_sectors;
	sector_t sector = 0;
	struct blk_zone *zones = NULL;
	unsigned int i, nreported = 0, nr_zones;
	unsigned int zone_sectors;
	int ret;

	if (!bdev_is_zoned(bdev))
		return 0;

	/* Zone info was already loaded earlier; nothing to do. */
	if (device->zone_info)
		return 0;

	zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
	if (!zone_info)
		return -ENOMEM;

	nr_sectors = bdev_nr_sectors(bdev);
	zone_sectors = bdev_zone_sectors(bdev);
	/* Check if it's power of 2 (see is_power_of_2) */
	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
	zone_info->zone_size_shift = ilog2(zone_info->zone_size);
	zone_info->max_zone_append_size =
		(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
	zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
	/* A trailing partial zone still counts as a zone. */
	if (!IS_ALIGNED(nr_sectors, zone_sectors))
		zone_info->nr_zones++;

	zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->seq_zones) {
		ret = -ENOMEM;
		goto out;
	}

	zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
	if (!zone_info->empty_zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Scratch buffer for batched zone reports. */
	zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
	if (!zones) {
		ret = -ENOMEM;
		goto out;
	}

	/* Get zones type */
	while (sector < nr_sectors) {
		nr_zones = BTRFS_REPORT_NR_ZONES;
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
					  &nr_zones);
		if (ret)
			goto out;

		for (i = 0; i < nr_zones; i++) {
			if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
				__set_bit(nreported, zone_info->seq_zones);
			if (zones[i].cond == BLK_ZONE_COND_EMPTY)
				__set_bit(nreported, zone_info->empty_zones);
			nreported++;
		}
		/* Continue right after the last zone of this batch. */
		sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
	}

	if (nreported != zone_info->nr_zones) {
		btrfs_err_in_rcu(device->fs_info,
				 "inconsistent number of zones on %s (%u/%u)",
				 rcu_str_deref(device->name), nreported,
				 zone_info->nr_zones);
		ret = -EIO;
		goto out;
	}

	/* Validate superblock log */
	nr_zones = BTRFS_NR_SB_LOG_ZONES;
	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
		u32 sb_zone;
		u64 sb_wp;
		int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

		sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
		/* Skip mirrors that fall beyond the end of the device. */
		if (sb_zone + 1 >= zone_info->nr_zones)
			continue;

		sector = sb_zone << (zone_info->zone_size_shift - SECTOR_SHIFT);
		ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT,
					  &zone_info->sb_zones[sb_pos],
					  &nr_zones);
		if (ret)
			goto out;

		if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
			btrfs_err_in_rcu(device->fs_info,
	"zoned: failed to read super block log zone info at devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}

		/*
		 * If zones[0] is conventional, always use the beginning of the
		 * zone to record superblock. No need to validate in that case.
		 */
		if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
		    BLK_ZONE_TYPE_CONVENTIONAL)
			continue;

		ret = sb_write_pointer(device->bdev,
				       &zone_info->sb_zones[sb_pos], &sb_wp);
		/* -ENOENT only means no superblock is written yet; that's fine. */
		if (ret != -ENOENT && ret) {
			btrfs_err_in_rcu(device->fs_info,
			"zoned: super block log zone corrupted devid %llu zone %u",
					 device->devid, sb_zone);
			ret = -EUCLEAN;
			goto out;
		}
	}

	kfree(zones);

	device->zone_info = zone_info;

	/* device->fs_info is not safe to use for printing messages */
	btrfs_info_in_rcu(NULL,
			"host-%s zoned block device %s, %u zones of %llu bytes",
			bdev_zoned_model(bdev) == BLK_ZONED_HM ? "managed" : "aware",
			rcu_str_deref(device->name), zone_info->nr_zones,
			zone_info->zone_size);

	return 0;

out:
	kfree(zones);
	bitmap_free(zone_info->empty_zones);
	bitmap_free(zone_info->seq_zones);
	kfree(zone_info);

	return ret;
}
317
318void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
319{
320 struct btrfs_zoned_device_info *zone_info = device->zone_info;
321
322 if (!zone_info)
323 return;
324
325 bitmap_free(zone_info->seq_zones);
326 bitmap_free(zone_info->empty_zones);
327 kfree(zone_info);
328 device->zone_info = NULL;
329}
330
331int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
332 struct blk_zone *zone)
333{
334 unsigned int nr_zones = 1;
335 int ret;
336
337 ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
338 if (ret != 0 || !nr_zones)
339 return ret ? ret : -EIO;
340
341 return 0;
342}
b70f5097
NA
343
/*
 * Verify that the devices of @fs_info form a consistent (non-)zoned
 * filesystem: either no device is zoned and the ZONED incompat flag is
 * clear, or every device is zoned with one common zone size and the flag
 * is set. On success, record zone_size and max_zone_append_size in
 * fs_info. Returns 0 on success, -EINVAL on any mismatch.
 */
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
	u64 zoned_devices = 0;
	u64 nr_devices = 0;
	u64 zone_size = 0;
	u64 max_zone_append_size = 0;
	const bool incompat_zoned = btrfs_is_zoned(fs_info);
	int ret = 0;

	/* Count zoned devices */
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		enum blk_zoned_model model;

		if (!device->bdev)
			continue;

		model = bdev_zoned_model(device->bdev);
		/*
		 * Host-managed devices always count as zoned; host-aware
		 * devices count as zoned only on a ZONED filesystem.
		 */
		if (model == BLK_ZONED_HM ||
		    (model == BLK_ZONED_HA && incompat_zoned)) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = device->zone_info;
			zoned_devices++;
			if (!zone_size) {
				zone_size = zone_info->zone_size;
			} else if (zone_info->zone_size != zone_size) {
				/* All zoned devices must share one zone size. */
				btrfs_err(fs_info,
		"zoned: unequal block device zone sizes: have %llu found %llu",
					  device->zone_info->zone_size,
					  zone_size);
				ret = -EINVAL;
				goto out;
			}
			/* Track the smallest non-zero max append size. */
			if (!max_zone_append_size ||
			    (zone_info->max_zone_append_size &&
			     zone_info->max_zone_append_size < max_zone_append_size))
				max_zone_append_size =
					zone_info->max_zone_append_size;
		}
		nr_devices++;
	}

	/* Plain filesystem on plain devices: nothing more to check. */
	if (!zoned_devices && !incompat_zoned)
		goto out;

	if (!zoned_devices && incompat_zoned) {
		/* No zoned block device found on ZONED filesystem */
		btrfs_err(fs_info,
			  "zoned: no zoned devices found on a zoned filesystem");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices && !incompat_zoned) {
		btrfs_err(fs_info,
			  "zoned: mode not enabled but zoned device found");
		ret = -EINVAL;
		goto out;
	}

	if (zoned_devices != nr_devices) {
		btrfs_err(fs_info,
			  "zoned: cannot mix zoned and regular devices");
		ret = -EINVAL;
		goto out;
	}

	/*
	 * stripe_size is always aligned to BTRFS_STRIPE_LEN in
	 * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
	 * check the alignment here.
	 */
	if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
		btrfs_err(fs_info,
			  "zoned: zone size %llu not aligned to stripe %u",
			  zone_size, BTRFS_STRIPE_LEN);
		ret = -EINVAL;
		goto out;
	}

	if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
		btrfs_err(fs_info, "zoned: mixed block groups not supported");
		ret = -EINVAL;
		goto out;
	}

	fs_info->zone_size = zone_size;
	fs_info->max_zone_append_size = max_zone_append_size;

	btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
	return ret;
}
5d1ab66c
NA
439
440int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
441{
442 if (!btrfs_is_zoned(info))
443 return 0;
444
445 /*
446 * Space cache writing is not COWed. Disable that to avoid write errors
447 * in sequential zones.
448 */
449 if (btrfs_test_opt(info, SPACE_CACHE)) {
450 btrfs_err(info, "zoned: space cache v1 is not supported");
451 return -EINVAL;
452 }
453
d206e9c9
NA
454 if (btrfs_test_opt(info, NODATACOW)) {
455 btrfs_err(info, "zoned: NODATACOW not supported");
456 return -EINVAL;
457 }
458
5d1ab66c
NA
459 return 0;
460}
12659251
NA
461
/*
 * Compute the byte location for reading or writing a superblock within
 * the two-zone log described by @zones.
 *
 * For WRITE, if the write pointer sits at the start of a FULL zone, that
 * zone is reset before the location is returned, and the cached zone
 * state is updated to match. For READ, the location of the most recently
 * written superblock is returned: one superblock before the write
 * pointer, wrapping to the end of zones[1] when the pointer is at the
 * start of zones[0].
 */
static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
			   int rw, u64 *bytenr_ret)
{
	u64 wp;
	int ret;

	/* A conventional zone is written in place at its start. */
	if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
		*bytenr_ret = zones[0].start << SECTOR_SHIFT;
		return 0;
	}

	ret = sb_write_pointer(bdev, zones, &wp);
	if (ret != -ENOENT && ret < 0)
		return ret;

	if (rw == WRITE) {
		struct blk_zone *reset = NULL;

		if (wp == zones[0].start << SECTOR_SHIFT)
			reset = &zones[0];
		else if (wp == zones[1].start << SECTOR_SHIFT)
			reset = &zones[1];

		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
			ASSERT(reset->cond == BLK_ZONE_COND_FULL);

			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
					       reset->start, reset->len,
					       GFP_NOFS);
			if (ret)
				return ret;

			/* Keep the cached zone state in sync with the device. */
			reset->cond = BLK_ZONE_COND_EMPTY;
			reset->wp = reset->start;
		}
	} else if (ret != -ENOENT) {
		/* For READ, we want the previous one */
		if (wp == zones[0].start << SECTOR_SHIFT)
			wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
		wp -= BTRFS_SUPER_INFO_SIZE;
	}

	*bytenr_ret = wp;
	return 0;
}
508
/*
 * Locate the superblock log for @mirror directly from a block device
 * (no struct btrfs_device attached). Reports the two log zones from the
 * device and delegates to sb_log_location().
 *
 * Returns 0 and sets *bytenr_ret, -ENOENT when the device is too small
 * to hold this mirror, -EINVAL for a non-power-of-two zone size, -EIO
 * when fewer than both log zones were reported, or a negative errno.
 */
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
			       u64 *bytenr_ret)
{
	struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
	unsigned int zone_sectors;
	u32 sb_zone;
	int ret;
	u8 zone_sectors_shift;
	sector_t nr_sectors;
	u32 nr_zones;

	/* Regular device: superblocks live at fixed offsets. */
	if (!bdev_is_zoned(bdev)) {
		*bytenr_ret = btrfs_sb_offset(mirror);
		return 0;
	}

	ASSERT(rw == READ || rw == WRITE);

	zone_sectors = bdev_zone_sectors(bdev);
	if (!is_power_of_2(zone_sectors))
		return -EINVAL;
	zone_sectors_shift = ilog2(zone_sectors);
	nr_sectors = bdev_nr_sectors(bdev);
	nr_zones = nr_sectors >> zone_sectors_shift;

	/* Both log zones must exist on the device. */
	sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
	if (sb_zone + 1 >= nr_zones)
		return -ENOENT;

	ret = blkdev_report_zones(bdev, sb_zone << zone_sectors_shift,
				  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
				  zones);
	if (ret < 0)
		return ret;
	if (ret != BTRFS_NR_SB_LOG_ZONES)
		return -EIO;

	return sb_log_location(bdev, zones, rw, bytenr_ret);
}
548
549int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
550 u64 *bytenr_ret)
551{
552 struct btrfs_zoned_device_info *zinfo = device->zone_info;
553 u32 zone_num;
554
555 if (!zinfo) {
556 *bytenr_ret = btrfs_sb_offset(mirror);
557 return 0;
558 }
559
560 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
561 if (zone_num + 1 >= zinfo->nr_zones)
562 return -ENOENT;
563
564 return sb_log_location(device->bdev,
565 &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
566 rw, bytenr_ret);
567}
568
569static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
570 int mirror)
571{
572 u32 zone_num;
573
574 if (!zinfo)
575 return false;
576
577 zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
578 if (zone_num + 1 >= zinfo->nr_zones)
579 return false;
580
581 if (!test_bit(zone_num, zinfo->seq_zones))
582 return false;
583
584 return true;
585}
586
587void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
588{
589 struct btrfs_zoned_device_info *zinfo = device->zone_info;
590 struct blk_zone *zone;
591
592 if (!is_sb_log_zone(zinfo, mirror))
593 return;
594
595 zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
596 if (zone->cond != BLK_ZONE_COND_FULL) {
597 if (zone->cond == BLK_ZONE_COND_EMPTY)
598 zone->cond = BLK_ZONE_COND_IMP_OPEN;
599
600 zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
601
602 if (zone->wp == zone->start + zone->len)
603 zone->cond = BLK_ZONE_COND_FULL;
604
605 return;
606 }
607
608 zone++;
609 ASSERT(zone->cond != BLK_ZONE_COND_FULL);
610 if (zone->cond == BLK_ZONE_COND_EMPTY)
611 zone->cond = BLK_ZONE_COND_IMP_OPEN;
612
613 zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
614
615 if (zone->wp == zone->start + zone->len)
616 zone->cond = BLK_ZONE_COND_FULL;
617}
618
619int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
620{
621 sector_t zone_sectors;
622 sector_t nr_sectors;
623 u8 zone_sectors_shift;
624 u32 sb_zone;
625 u32 nr_zones;
626
627 zone_sectors = bdev_zone_sectors(bdev);
628 zone_sectors_shift = ilog2(zone_sectors);
ac7ac461 629 nr_sectors = bdev_nr_sectors(bdev);
12659251
NA
630 nr_zones = nr_sectors >> zone_sectors_shift;
631
632 sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
633 if (sb_zone + 1 >= nr_zones)
634 return -ENOENT;
635
636 return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
637 sb_zone << zone_sectors_shift,
638 zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
639}