// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 * Copyright (c) 2024, Western Digital Corporation or its affiliates.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>
#include <linux/spinlock.h>
#include <linux/atomic.h>
#include <linux/mempool.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-mq-debugfs.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/*
 * Per-zone write plug.
 * @node: hlist_node structure for managing the plug using a hash table.
 * @link: To list the plug in the zone write plug error list of the disk.
 * @ref: Zone write plug reference counter. A zone write plug reference is
 *       always at least 1 when the plug is hashed in the disk plug hash table.
 *       The reference is incremented whenever a new BIO needing plugging is
 *       submitted and when a function needs to manipulate a plug. The
 *       reference count is decremented whenever a plugged BIO completes and
 *       when a function that referenced the plug returns. The initial
 *       reference is dropped whenever the zone of the zone write plug is reset,
 *       finished and when the zone becomes full (last write BIO to the zone
 *       completes).
 * @lock: Spinlock to atomically manipulate the plug.
 * @flags: Flags indicating the plug state.
 * @zone_no: The number of the zone the plug is managing.
 * @wp_offset: The zone write pointer location relative to the start of the zone
 *             as a number of 512B sectors.
 * @bio_list: The list of BIOs that are currently plugged.
 * @bio_work: Work struct to handle issuing of plugged BIOs.
 * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
 * @disk: The gendisk the plug belongs to.
 */
struct blk_zone_wplug {
	struct hlist_node	node;
	struct list_head	link;
	atomic_t		ref;
	spinlock_t		lock;
	unsigned int		flags;
	unsigned int		zone_no;
	unsigned int		wp_offset;
	struct bio_list		bio_list;
	struct work_struct	bio_work;
	struct rcu_head		rcu_head;
	struct gendisk		*disk;
};

/*
 * Zone write plug flags bits:
 * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged,
 *   that is, that write BIOs are being throttled due to a write BIO already
 *   being executed or the zone write plug bio list is not empty.
 * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be
 *   recovered with a report zone to update the zone write pointer offset.
 * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed
 *   from the disk hash table and that the initial reference to the zone
 *   write plug set when the plug was first added to the hash table has been
 *   dropped. This flag is set when a zone is reset, finished or becomes full,
 *   to prevent new references to the zone write plug from being taken for
 *   newly incoming BIOs. A zone write plug flagged with this flag will be
 *   freed once all remaining references from BIOs or functions are dropped.
 */
#define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
#define BLK_ZONE_WPLUG_ERROR		(1U << 1)
#define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)

#define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);

/**
 * bdev_nr_zones - Get number of zones
 * @bdev: Target device
 *
 * Return the total number of zones of a zoned block device. For a block
 * device without zone capabilities, the number of zones is always 0.
 */
unsigned int bdev_nr_zones(struct block_device *bdev)
{
	sector_t zone_sectors = bdev_zone_sectors(bdev);

	if (!bdev_is_zoned(bdev))
		return 0;
	return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
		ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(bdev_nr_zones);

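/*
 * Worked example (illustrative only, not part of the original file): with
 * 524288-sector (256 MiB) zones, a 1048576-sector device has
 * bdev_nr_zones() == 2, while a 1048577-sector device has
 * bdev_nr_zones() == 3, since the round-up above also counts a possibly
 * smaller last zone.
 */
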
/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);

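/*
 * Illustrative sketch (not part of the upstream file): a minimal
 * report_zones_cb callback that counts zones having a write pointer, to show
 * the callback contract documented above. The helper name
 * blk_count_seq_zones_cb is hypothetical and intentionally unused here.
 */
static int __maybe_unused blk_count_seq_zones_cb(struct blk_zone *zone,
						 unsigned int idx, void *data)
{
	unsigned int *nr_seq = data;

	/* Conventional zones have no write pointer; everything else does. */
	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
		(*nr_seq)++;
	return 0;
}
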
static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}

static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	/*
	 * For an all-zones reset, ignore conventional, empty, read-only
	 * and offline zones.
	 */
	switch (zone->cond) {
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_EMPTY:
	case BLK_ZONE_COND_READONLY:
	case BLK_ZONE_COND_OFFLINE:
		return 0;
	default:
		set_bit(idx, (unsigned long *)data);
		return 0;
	}
}

static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	unsigned long *need_reset;
	struct bio *bio = NULL;
	sector_t sector = 0;
	int ret;

	need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
	if (!need_reset)
		return -ENOMEM;

	ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
				       blk_zone_need_reset_cb, need_reset);
	if (ret < 0)
		goto out_free_need_reset;

	ret = 0;
	while (sector < capacity) {
		if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
			sector += zone_sectors;
			continue;
		}

		bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
				   GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	if (bio) {
		ret = submit_bio_wait(bio);
		bio_put(bio);
	}

out_free_need_reset:
	kfree(need_reset);
	return ret;
}

static int blkdev_zone_reset_all(struct block_device *bdev)
{
	struct bio bio;

	bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
	return submit_bio_wait(&bio);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone and
 *		must be zone size aligned.
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
		     sector_t sector, sector_t nr_sectors)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t zone_sectors = bdev_zone_sectors(bdev);
	sector_t capacity = bdev_nr_sectors(bdev);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret = 0;

	if (!bdev_is_zoned(bdev))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle eventual smaller last zone) */
	if (!bdev_is_zone_start(bdev, sector))
		return -EINVAL;

	if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
		return -EINVAL;

	/*
	 * In the case of a zone reset operation over all zones,
	 * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
	 * command. For other devices, we emulate this command behavior by
	 * identifying the zones needing a reset.
	 */
	if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
		if (!blk_queue_zone_resetall(q))
			return blkdev_zone_reset_all_emulated(bdev);
		return blkdev_zone_reset_all(bdev);
	}

	while (sector < end_sector) {
		bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
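
/*
 * Illustrative sketch (not part of the upstream file): an in-kernel caller
 * resetting the first zone of a zoned block device through
 * blkdev_zone_mgmt(). The helper name blk_zone_example_reset_first_zone is
 * hypothetical and intentionally unused here.
 */
static int __maybe_unused blk_zone_example_reset_first_zone(struct block_device *bdev)
{
	/* Reset exactly one zone, starting at sector 0 of the device. */
	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 0,
				bdev_zone_sectors(bdev));
}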

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
			      unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}

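/*
 * Illustrative userspace sketch (not part of this file, shown as a comment
 * since it does not belong in kernel code): calling the BLKREPORTZONE ioctl
 * handled above. The buffer is a struct blk_zone_report immediately followed
 * by the array of struct blk_zone entries.
 *
 *	struct blk_zone_report *rep;
 *	size_t bufsz = sizeof(*rep) + 16 * sizeof(struct blk_zone);
 *
 *	rep = calloc(1, bufsz);
 *	rep->sector = 0;
 *	rep->nr_zones = 16;
 *	if (ioctl(fd, BLKREPORTZONE, rep) == 0)
 *		printf("%u zones reported\n", rep->nr_zones);
 */
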
static int blkdev_truncate_zone_range(struct block_device *bdev,
		blk_mode_t mode, const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct blk_zone_range zrange;
	enum req_op op;
	int ret;

	if (!argp)
		return -EINVAL;

	if (!bdev_is_zoned(bdev))
		return -ENOTTY;

	if (!(mode & BLK_OPEN_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		filemap_invalidate_lock(bdev->bd_inode->i_mapping);
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			goto fail;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);

fail:
	if (cmd == BLKRESETZONE)
		filemap_invalidate_unlock(bdev->bd_inode->i_mapping);

	return ret;
}
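
/*
 * Illustrative userspace sketch (not part of this file, shown as a comment
 * since it does not belong in kernel code): resetting the first zone of a
 * zoned device through the BLKRESETZONE ioctl handled above, assuming
 * zone_sectors was obtained beforehand (e.g. with the BLKGETZONESZ ioctl).
 *
 *	struct blk_zone_range zrange = {
 *		.sector = 0,
 *		.nr_sectors = zone_sectors,
 *	};
 *
 *	if (ioctl(fd, BLKRESETZONE, &zrange) < 0)
 *		perror("BLKRESETZONE");
 */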

static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector)
{
	if (!disk->conv_zones_bitmap)
		return false;
	return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap);
}

static bool disk_insert_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	struct blk_zone_wplug *zwplg;
	unsigned long flags;
	unsigned int idx =
		hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits);

	/*
	 * Add the new zone write plug to the hash table, but carefully as we
	 * are racing with other submission contexts, so we may already have a
	 * zone write plug for the same zone.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) {
		if (zwplg->zone_no == zwplug->zone_no) {
			spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
			return false;
		}
	}
	hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

	return true;
}

static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
						  sector_t sector)
{
	unsigned int zno = disk_zone_no(disk, sector);
	unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits);
	struct blk_zone_wplug *zwplug;

	rcu_read_lock();

	hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) {
		if (zwplug->zone_no == zno &&
		    atomic_inc_not_zero(&zwplug->ref)) {
			rcu_read_unlock();
			return zwplug;
		}
	}

	rcu_read_unlock();

	return NULL;
}

static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
{
	struct blk_zone_wplug *zwplug =
		container_of(rcu_head, struct blk_zone_wplug, rcu_head);

	mempool_free(zwplug, zwplug->disk->zone_wplugs_pool);
}

static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
{
	if (atomic_dec_and_test(&zwplug->ref)) {
		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
		WARN_ON_ONCE(!list_empty(&zwplug->link));
		WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED));

		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
	}
}

static inline bool disk_should_remove_zone_wplug(struct gendisk *disk,
						 struct blk_zone_wplug *zwplug)
{
	/* If the zone write plug was already removed, we are done. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return false;

	/* If the zone write plug is still busy, it cannot be removed. */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		return false;

	/*
	 * Completions of BIOs with blk_zone_write_plug_bio_endio() may
	 * happen after handling a request completion with
	 * blk_zone_write_plug_finish_request() (e.g. with split BIOs
	 * that are chained). In such case, disk_zone_wplug_unplug_bio()
	 * should not attempt to remove the zone write plug until all BIO
	 * completions are seen. Check by looking at the zone write plug
	 * reference count, which is 2 when the plug is unused (one reference
	 * taken when the plug was allocated and another reference taken by the
	 * caller context).
	 */
	if (atomic_read(&zwplug->ref) > 2)
		return false;

	/* We can remove zone write plugs for zones that are empty or full. */
	return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity;
}

static void disk_remove_zone_wplug(struct gendisk *disk,
				   struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	/* If the zone write plug was already removed, we have nothing to do. */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)
		return;

	/*
	 * Mark the zone write plug as unhashed and drop the extra reference we
	 * took when the plug was inserted in the hash table.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED;
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	hlist_del_init_rcu(&zwplug->node);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work);

/*
 * Get a reference on the write plug for the zone containing @sector.
 * If the plug does not exist, it is allocated and hashed.
 * Return a pointer to the zone write plug with the plug spinlock held.
 */
static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
					sector_t sector, gfp_t gfp_mask,
					unsigned long *flags)
{
	unsigned int zno = disk_zone_no(disk, sector);
	struct blk_zone_wplug *zwplug;

again:
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		/*
		 * Check that a BIO completion or a zone reset or finish
		 * operation has not already removed the zone write plug from
		 * the hash table and dropped its reference count. In such case,
		 * we need to get a new plug so start over from the beginning.
		 */
		spin_lock_irqsave(&zwplug->lock, *flags);
		if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
			spin_unlock_irqrestore(&zwplug->lock, *flags);
			disk_put_zone_wplug(zwplug);
			goto again;
		}
		return zwplug;
	}

	/*
	 * Allocate and initialize a zone write plug with an extra reference
	 * so that it is not freed when the zone write plug becomes idle without
	 * the zone being full.
	 */
	zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask);
	if (!zwplug)
		return NULL;

	INIT_HLIST_NODE(&zwplug->node);
	INIT_LIST_HEAD(&zwplug->link);
	atomic_set(&zwplug->ref, 2);
	spin_lock_init(&zwplug->lock);
	zwplug->flags = 0;
	zwplug->zone_no = zno;
	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
	bio_list_init(&zwplug->bio_list);
	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
	zwplug->disk = disk;

	spin_lock_irqsave(&zwplug->lock, *flags);

	/*
	 * Insert the new zone write plug in the hash table. This can fail only
	 * if another context already inserted a plug. Retry from the beginning
	 * in such case.
	 */
	if (!disk_insert_zone_wplug(disk, zwplug)) {
		spin_unlock_irqrestore(&zwplug->lock, *flags);
		mempool_free(zwplug, disk->zone_wplugs_pool);
		goto again;
	}

	return zwplug;
}

static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug,
					       struct bio *bio)
{
	struct request_queue *q = zwplug->disk->queue;

	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);
	bio_io_error(bio);
	disk_put_zone_wplug(zwplug);
	blk_queue_exit(q);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug.
 */
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list)))
		blk_zone_wplug_bio_io_error(zwplug, bio);
}

/*
 * Abort (fail) all plugged BIOs of a zone write plug that are not aligned
 * with the assumed write pointer location of the zone when the BIO will
 * be unplugged.
 */
static void disk_zone_wplug_abort_unaligned(struct gendisk *disk,
					    struct blk_zone_wplug *zwplug)
{
	unsigned int zone_capacity = disk->zone_capacity;
	unsigned int wp_offset = zwplug->wp_offset;
	struct bio_list bl = BIO_EMPTY_LIST;
	struct bio *bio;

	while ((bio = bio_list_pop(&zwplug->bio_list))) {
		if (wp_offset >= zone_capacity ||
		    (bio_op(bio) != REQ_OP_ZONE_APPEND &&
		     bio_offset_from_zone_start(bio) != wp_offset)) {
			blk_zone_wplug_bio_io_error(zwplug, bio);
			continue;
		}

		wp_offset += bio_sectors(bio);
		bio_list_add(&bl, bio);
	}

	bio_list_merge(&zwplug->bio_list, &bl);
}

static inline void disk_zone_wplug_set_error(struct gendisk *disk,
					     struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR)
		return;

	/*
	 * At this point, we already have a reference on the zone write plug.
	 * However, since we are going to add the plug to the disk zone write
	 * plugs work list, increase its reference count. This reference will
	 * be dropped in disk_zone_wplugs_work() once the error state is
	 * handled, or in disk_zone_wplug_clear_error() if the zone is reset or
	 * finished.
	 */
	zwplug->flags |= BLK_ZONE_WPLUG_ERROR;
	atomic_inc(&zwplug->ref);

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list);
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline void disk_zone_wplug_clear_error(struct gendisk *disk,
					       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		return;

	/*
	 * We are racing with the error handling work which drops the reference
	 * on the zone write plug after handling the error state. So remove the
	 * plug from the error list and drop its reference count only if the
	 * error handling has not yet started, that is, if the zone write plug
	 * is still listed.
	 */
	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	if (!list_empty(&zwplug->link)) {
		list_del_init(&zwplug->link);
		zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;
		disk_put_zone_wplug(zwplug);
	}
	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

/*
 * Set a zone write plug write pointer offset to either 0 (zone reset case)
 * or to the zone size (zone finish case). This aborts all plugged BIOs, which
 * is fine to do as doing a zone reset or zone finish while writes are in-flight
 * is a mistake from the user which will most likely cause all plugged BIOs to
 * fail anyway.
 */
static void disk_zone_wplug_set_wp_offset(struct gendisk *disk,
					  struct blk_zone_wplug *zwplug,
					  unsigned int wp_offset)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * Make sure that a BIO completion or another zone reset or finish
	 * operation has not already removed the plug from the hash table.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	/* Update the zone write pointer and abort all plugged BIOs. */
	zwplug->wp_offset = wp_offset;
	disk_zone_wplug_abort(zwplug);

	/*
	 * Updating the write pointer offset puts the zone back in a good
	 * state. So clear the error flag and decrement the error count if we
	 * were in error state.
	 */
	disk_zone_wplug_clear_error(disk, zwplug);

	/*
	 * The zone write plug now has no BIO plugged: remove it from the
	 * hash table so that it cannot be seen. The plug will be freed
	 * when the last reference is dropped.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio,
						  unsigned int wp_offset)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;

	/* Conventional zones cannot be reset nor finished. */
	if (disk_zone_is_conv(disk, sector)) {
		bio_io_error(bio);
		return true;
	}

	/*
	 * If we have a zone write plug, set its write pointer offset to 0
	 * (reset case) or to the zone size (finish case). This will abort all
	 * BIOs plugged for the target zone. It is fine as resetting or
	 * finishing zones while writes are still in-flight will result in the
	 * writes failing anyway.
	 */
	zwplug = disk_get_zone_wplug(disk, sector);
	if (zwplug) {
		disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset);
		disk_put_zone_wplug(zwplug);
	}

	return false;
}

static bool blk_zone_wplug_handle_reset_all(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug;
	sector_t sector;

	/*
	 * Set the write pointer offset of all zone write plugs to 0. This will
	 * abort all plugged BIOs. It is fine as resetting zones while writes
	 * are still in-flight will result in the writes failing anyway.
	 */
	for (sector = 0; sector < get_capacity(disk);
	     sector += disk->queue->limits.chunk_sectors) {
		zwplug = disk_get_zone_wplug(disk, sector);
		if (zwplug) {
			disk_zone_wplug_set_wp_offset(disk, zwplug, 0);
			disk_put_zone_wplug(zwplug);
		}
	}

	return false;
}

static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug,
					  struct bio *bio, unsigned int nr_segs)
{
	/*
	 * Grab an extra reference on the BIO request queue usage counter.
	 * This reference will be reused to submit a request for the BIO for
	 * blk-mq devices and dropped when the BIO is failed and after
	 * it is issued in the case of BIO-based devices.
	 */
	percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter);

	/*
	 * The BIO is being plugged and thus will have to wait for the on-going
	 * write and for all other writes already plugged. So polling makes
	 * no sense.
	 */
	bio_clear_polled(bio);

	/*
	 * Reuse the poll cookie field to store the number of segments when
	 * split to the hardware limits.
	 */
	bio->__bi_nr_segments = nr_segs;

	/*
	 * We always receive BIOs after they are split and ready to be issued.
	 * The block layer passes the parts of a split BIO in order, and the
	 * user must also issue writes sequentially. So simply add the new BIO
	 * at the tail of the list to preserve the sequential write order.
	 */
	bio_list_add(&zwplug->bio_list, bio);
}

/*
 * Called from bio_attempt_back_merge() when a BIO was merged with a request.
 */
void blk_zone_write_plug_bio_merged(struct bio *bio)
{
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	/*
	 * If the BIO was already plugged, then we were called through
	 * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge().
	 * For this case, we already hold a reference on the zone write plug for
	 * the BIO and blk_zone_write_plug_init_request() will handle the
	 * zone write pointer offset update.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return;

	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * Get a reference on the zone write plug of the target zone and advance
	 * the zone write pointer offset. Given that this is a merge, we already
	 * have at least one request and one BIO referencing the zone write
	 * plug. So this should not fail.
	 */
	zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk,
				     bio->bi_iter.bi_sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	spin_lock_irqsave(&zwplug->lock, flags);
	zwplug->wp_offset += bio_sectors(bio);
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Attempt to merge plugged BIOs with a newly prepared request for a BIO that
 * already went through zone write plugging (either a new BIO or one that was
 * unplugged).
 */
void blk_zone_write_plug_init_request(struct request *req)
{
	sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req);
	struct request_queue *q = req->q;
	struct gendisk *disk = q->disk;
	unsigned int zone_capacity = disk->zone_capacity;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, blk_rq_pos(req));
	unsigned long flags;
	struct bio *bio;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/*
	 * Indicate that completion of this request needs to be handled with
	 * blk_zone_write_plug_finish_request(), which will drop the reference
	 * on the zone write plug we took above on entry to this function.
	 */
	req->rq_flags |= RQF_ZONE_WRITE_PLUGGING;

	if (blk_queue_nomerges(q))
		return;

	/*
	 * Walk through the list of plugged BIOs to check if they can be merged
	 * into the back of the request.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);
	while (zwplug->wp_offset < zone_capacity) {
		bio = bio_list_peek(&zwplug->bio_list);
		if (!bio)
			break;

		if (bio->bi_iter.bi_sector != req_back_sector ||
		    !blk_rq_merge_ok(req, bio))
			break;

		WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES &&
			     !bio->__bi_nr_segments);

		bio_list_pop(&zwplug->bio_list);
		if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) !=
		    BIO_MERGE_OK) {
			bio_list_add_head(&zwplug->bio_list, bio);
			break;
		}

		/*
		 * Drop the extra reference on the queue usage we got when
		 * plugging the BIO and advance the write pointer offset.
		 */
		blk_queue_exit(q);
		zwplug->wp_offset += bio_sectors(bio);

		req_back_sector += bio_sectors(bio);
	}
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

/*
 * Check and prepare a BIO for submission by incrementing the write pointer
 * offset of its zone write plug and changing zone append operations into
 * regular writes when zone append emulation is needed.
 */
static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug,
				       struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;

	/*
	 * Check that the user is not attempting to write to a full zone.
	 * We know such BIO will fail, and that would potentially overflow our
	 * write pointer offset beyond the end of the zone.
	 */
	if (zwplug->wp_offset >= disk->zone_capacity)
		goto err;

	if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
		/*
		 * Use a regular write starting at the current write pointer.
		 * Similarly to native zone append operations, do not allow
		 * merging.
		 */
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE;
		bio->bi_iter.bi_sector += zwplug->wp_offset;

		/*
		 * Remember that this BIO is in fact a zone append operation
		 * so that we can restore its operation code on completion.
		 */
		bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND);
	} else {
		/*
		 * Check for non-sequential writes early because we avoid a
		 * whole lot of error handling trouble if we don't send it off
		 * to the driver.
		 */
		if (bio_offset_from_zone_start(bio) != zwplug->wp_offset)
			goto err;
	}

	/* Advance the zone write pointer offset. */
	zwplug->wp_offset += bio_sectors(bio);

	return true;

err:
	/* We detected an invalid write BIO: schedule error recovery. */
	disk_zone_wplug_set_error(disk, zwplug);
	kblockd_schedule_work(&disk->zone_wplugs_work);
	return false;
}

static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	sector_t sector = bio->bi_iter.bi_sector;
	struct blk_zone_wplug *zwplug;
	gfp_t gfp_mask = GFP_NOIO;
	unsigned long flags;

	/*
	 * BIOs must be fully contained within a zone so that we use the correct
	 * zone write plug for the entire BIO. For blk-mq devices, the block
	 * layer should already have done any splitting required to ensure this
	 * and this BIO should thus not be straddling zone boundaries. For
	 * BIO-based devices, it is the responsibility of the driver to split
	 * the bio before submitting it.
	 */
	if (WARN_ON_ONCE(bio_straddles_zones(bio))) {
		bio_io_error(bio);
		return true;
	}

	/* Conventional zones do not need write plugging. */
	if (disk_zone_is_conv(disk, sector)) {
		/* Zone append to conventional zones is not allowed. */
		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
			bio_io_error(bio);
			return true;
		}
		return false;
	}

	if (bio->bi_opf & REQ_NOWAIT)
		gfp_mask = GFP_NOWAIT;

	zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags);
	if (!zwplug) {
		bio_io_error(bio);
		return true;
	}

	/* Indicate that this BIO is being handled using zone write plugging. */
	bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If the zone is already plugged or has a pending error, add the BIO
	 * to the plug BIO list. Otherwise, plug and let the BIO execute.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_BUSY)
		goto plug;

	/*
	 * If an error is detected when preparing the BIO, add it to the BIO
	 * list so that error recovery can deal with it.
	 */
	if (!blk_zone_wplug_prepare_bio(zwplug, bio))
		goto plug;

	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return false;

plug:
	zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
	blk_zone_wplug_add_bio(zwplug, bio, nr_segs);

	spin_unlock_irqrestore(&zwplug->lock, flags);

	return true;
}

/**
 * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging
 * @bio: The BIO being submitted
 * @nr_segs: The number of physical segments of @bio
 *
 * Handle write, write zeroes and zone append operations requiring emulation
 * using zone write plugging.
 *
 * Return true whenever @bio execution needs to be delayed through the zone
 * write plug. Otherwise, return false to let the submission path process
 * @bio normally.
 */
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
{
	struct block_device *bdev = bio->bi_bdev;

	if (!bdev->bd_disk->zone_wplugs_hash)
		return false;

	/*
	 * If the BIO already has the plugging flag set, then it was already
	 * handled through this path and this is a submission from the zone
	 * plug bio submit work.
	 */
	if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
		return false;

	/*
	 * We do not need to do anything special for empty flush BIOs, e.g.
	 * BIOs such as those issued by blkdev_issue_flush(). This is because
	 * it is the responsibility of the user to first wait for the completion
	 * of write operations for flush to have any effect on the persistence
	 * of the written data.
	 */
	if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
		return false;

	/*
	 * Regular writes and write zeroes need to be handled through the target
	 * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH
	 * which may need to go through the flush machinery depending on the
	 * target device capabilities. Plugging such writes is fine as the flush
	 * machinery operates at the request level, below the plug, and
	 * completion of the flush sequence will go through the regular BIO
	 * completion, which will handle zone write plugging.
	 * Zone append operations for devices that requested emulation must
	 * also be plugged so that these BIOs can be changed into regular
	 * write BIOs.
	 * Zone reset, reset all and finish commands need special treatment
	 * to correctly track the write pointer offset of zones. These commands
	 * are not plugged as we do not need serialization with write
	 * operations. It is the responsibility of the user to not issue reset
	 * and finish commands when write operations are in flight.
	 */
	switch (bio_op(bio)) {
	case REQ_OP_ZONE_APPEND:
		if (!bdev_emulates_zone_append(bdev))
			return false;
		fallthrough;
	case REQ_OP_WRITE:
	case REQ_OP_WRITE_ZEROES:
		return blk_zone_wplug_handle_write(bio, nr_segs);
	case REQ_OP_ZONE_RESET:
		return blk_zone_wplug_handle_reset_or_finish(bio, 0);
	case REQ_OP_ZONE_FINISH:
		return blk_zone_wplug_handle_reset_or_finish(bio,
						bdev_zone_sectors(bdev));
	case REQ_OP_ZONE_RESET_ALL:
		return blk_zone_wplug_handle_reset_all(bio);
	default:
		return false;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_zone_plug_bio);
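
/*
 * Illustrative sketch (not part of the upstream file) of how a BIO-based
 * driver's ->submit_bio handler typically uses blk_zone_plug_bio(): zoned
 * device mapper targets, for instance, call it with a zero segment count
 * before remapping a BIO. The function below is hypothetical and unused.
 */
static void __maybe_unused blk_zone_example_submit_bio(struct bio *bio)
{
	/*
	 * If the BIO is a zoned write that must wait behind other writes to
	 * the same zone, it is plugged here and resubmitted later by
	 * blk_zone_wplug_bio_work(); the driver must not touch it anymore.
	 */
	if (blk_zone_plug_bio(bio, 0))
		return;

	/* Placeholder for remapping and issuing @bio to the real device. */
	bio_endio(bio);
}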

static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
					      struct blk_zone_wplug *zwplug)
{
	/*
	 * Take a reference on the zone write plug and schedule the submission
	 * of the next plugged BIO. blk_zone_wplug_bio_work() will release the
	 * reference we take here.
	 */
	WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
	atomic_inc(&zwplug->ref);
	queue_work(disk->zone_wplugs_wq, &zwplug->bio_work);
}

static void disk_zone_wplug_unplug_bio(struct gendisk *disk,
				       struct blk_zone_wplug *zwplug)
{
	unsigned long flags;

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * If we had an error, schedule error recovery. The recovery work
	 * will restart submission of plugged BIOs.
	 */
	if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) {
		spin_unlock_irqrestore(&zwplug->lock, flags);
		kblockd_schedule_work(&disk->zone_wplugs_work);
		return;
	}

	/* Schedule submission of the next plugged BIO if we have one. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		return;
	}

	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;

	/*
	 * If the zone is full (it was fully written or finished) or empty
	 * (it was reset), remove its zone write plug from the hash table.
	 */
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

	spin_unlock_irqrestore(&zwplug->lock, flags);
}

void blk_zone_write_plug_bio_endio(struct bio *bio)
{
	struct gendisk *disk = bio->bi_bdev->bd_disk;
	struct blk_zone_wplug *zwplug =
		disk_get_zone_wplug(disk, bio->bi_iter.bi_sector);
	unsigned long flags;

	if (WARN_ON_ONCE(!zwplug))
		return;

	/* Make sure we do not see this BIO again by clearing the plug flag. */
	bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING);

	/*
	 * If this is a regular write emulating a zone append operation,
	 * restore the original operation code.
	 */
	if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) {
		bio->bi_opf &= ~REQ_OP_MASK;
		bio->bi_opf |= REQ_OP_ZONE_APPEND;
	}

	/*
	 * If the BIO failed, mark the plug as having an error to trigger
	 * recovery.
	 */
	if (bio->bi_status != BLK_STS_OK) {
		spin_lock_irqsave(&zwplug->lock, flags);
		disk_zone_wplug_set_error(disk, zwplug);
		spin_unlock_irqrestore(&zwplug->lock, flags);
	}

	/* Drop the reference we took when the BIO was issued. */
	disk_put_zone_wplug(zwplug);

	/*
	 * For BIO-based devices, blk_zone_write_plug_finish_request()
	 * is not called. So we need to schedule execution of the next
	 * plugged BIO here.
	 */
	if (bio->bi_bdev->bd_has_submit_bio)
		disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

void blk_zone_write_plug_finish_request(struct request *req)
{
	struct gendisk *disk = req->q->disk;
	struct blk_zone_wplug *zwplug;

	zwplug = disk_get_zone_wplug(disk, req->__sector);
	if (WARN_ON_ONCE(!zwplug))
		return;

	req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING;

	/*
	 * Drop the reference we took when the request was initialized in
	 * blk_zone_write_plug_init_request().
	 */
	disk_put_zone_wplug(zwplug);

	disk_zone_wplug_unplug_bio(disk, zwplug);

	/* Drop the reference we took when entering this function. */
	disk_put_zone_wplug(zwplug);
}

static void blk_zone_wplug_bio_work(struct work_struct *work)
{
	struct blk_zone_wplug *zwplug =
		container_of(work, struct blk_zone_wplug, bio_work);
	struct block_device *bdev;
	unsigned long flags;
	struct bio *bio;

	/*
	 * Submit the next plugged BIO. If we do not have any, clear
	 * the plugged flag.
	 */
	spin_lock_irqsave(&zwplug->lock, flags);

	bio = bio_list_pop(&zwplug->bio_list);
	if (!bio) {
		zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	if (!blk_zone_wplug_prepare_bio(zwplug, bio)) {
		/* Error recovery will decide what to do with the BIO. */
		bio_list_add_head(&zwplug->bio_list, bio);
		spin_unlock_irqrestore(&zwplug->lock, flags);
		goto put_zwplug;
	}

	spin_unlock_irqrestore(&zwplug->lock, flags);

	bdev = bio->bi_bdev;
	submit_bio_noacct_nocheck(bio);

	/*
	 * blk-mq devices will reuse the extra reference on the request queue
	 * usage counter we took when the BIO was plugged, but the submission
	 * path for BIO-based devices will not do that. So drop this extra
	 * reference here.
	 */
	if (bdev->bd_has_submit_bio)
		blk_queue_exit(bdev->bd_disk->queue);

put_zwplug:
	/* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
	disk_put_zone_wplug(zwplug);
}

static unsigned int blk_zone_wp_offset(struct blk_zone *zone)
{
	switch (zone->cond) {
	case BLK_ZONE_COND_IMP_OPEN:
	case BLK_ZONE_COND_EXP_OPEN:
	case BLK_ZONE_COND_CLOSED:
		return zone->wp - zone->start;
	case BLK_ZONE_COND_FULL:
		return zone->len;
	case BLK_ZONE_COND_EMPTY:
		return 0;
	case BLK_ZONE_COND_NOT_WP:
	case BLK_ZONE_COND_OFFLINE:
	case BLK_ZONE_COND_READONLY:
	default:
		/*
		 * Conventional, offline and read-only zones do not have a valid
		 * write pointer.
		 */
		return UINT_MAX;
	}
}

static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone,
					 unsigned int idx, void *data)
{
	struct blk_zone *zonep = data;

	*zonep = *zone;
	return 0;
}

static void disk_zone_wplug_handle_error(struct gendisk *disk,
					 struct blk_zone_wplug *zwplug)
{
	sector_t zone_start_sector =
		bdev_zone_sectors(disk->part0) * zwplug->zone_no;
	unsigned int noio_flag;
	struct blk_zone zone;
	unsigned long flags;
	int ret;

	/* Get the current zone information from the device. */
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, zone_start_sector, 1,
				       blk_zone_wplug_report_zone_cb, &zone);
	memalloc_noio_restore(noio_flag);

	spin_lock_irqsave(&zwplug->lock, flags);

	/*
	 * A zone reset or finish may have cleared the error already. In such
	 * case, do nothing as the report zones may have seen the "old" write
	 * pointer value before the reset/finish operation completed.
	 */
	if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR))
		goto unlock;

	zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR;

	if (ret != 1) {
		/*
		 * We failed to get the zone information, meaning that something
		 * is likely really wrong with the device. Abort all remaining
		 * plugged BIOs as otherwise we could end up waiting forever on
		 * plugged BIOs to complete if there is a queue freeze on-going.
		 */
		disk_zone_wplug_abort(zwplug);
		goto unplug;
	}

	/* Update the zone write pointer offset. */
	zwplug->wp_offset = blk_zone_wp_offset(&zone);
	disk_zone_wplug_abort_unaligned(disk, zwplug);

	/* Restart BIO submission if we still have any BIO left. */
	if (!bio_list_empty(&zwplug->bio_list)) {
		disk_zone_wplug_schedule_bio_work(disk, zwplug);
		goto unlock;
	}

unplug:
	zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
	if (disk_should_remove_zone_wplug(disk, zwplug))
		disk_remove_zone_wplug(disk, zwplug);

unlock:
	spin_unlock_irqrestore(&zwplug->lock, flags);
}

static void disk_zone_wplugs_work(struct work_struct *work)
{
	struct gendisk *disk =
		container_of(work, struct gendisk, zone_wplugs_work);
	struct blk_zone_wplug *zwplug;
	unsigned long flags;

	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);

	while (!list_empty(&disk->zone_wplugs_err_list)) {
		zwplug = list_first_entry(&disk->zone_wplugs_err_list,
					  struct blk_zone_wplug, link);
		list_del_init(&zwplug->link);
		spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);

		disk_zone_wplug_handle_error(disk, zwplug);
		disk_put_zone_wplug(zwplug);

		spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
	}

	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
}

static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
{
	return 1U << disk->zone_wplugs_hash_bits;
}

void disk_init_zone_resources(struct gendisk *disk)
{
	spin_lock_init(&disk->zone_wplugs_lock);
	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
}

/*
 * For the size of a disk zone write plug hash table, use the size of the
 * zone write plug mempool, which is the maximum of the disk open zones and
 * active zones limits. But do not exceed 4KB (512 hlist head entries), that is,
 * 9 bits. For a disk that has no limits, mempool size defaults to 128.
 */
#define BLK_ZONE_WPLUG_MAX_HASH_BITS		9
#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE	128
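
/*
 * Sizing example (derived from disk_alloc_zone_resources() below, added for
 * illustration): a device reporting max_open_zones = 128 gets
 * ilog2(128) + 1 = 8 hash bits, i.e. 256 hlist heads; any pool size of 256
 * or more is capped at 9 bits, i.e. 512 heads, which is 4KB of hlist_head
 * entries on 64-bit systems.
 */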

static int disk_alloc_zone_resources(struct gendisk *disk,
				     unsigned int pool_size)
{
	unsigned int i;

	disk->zone_wplugs_hash_bits =
		min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS);

	disk->zone_wplugs_hash =
		kcalloc(disk_zone_wplugs_hash_size(disk),
			sizeof(struct hlist_head), GFP_KERNEL);
	if (!disk->zone_wplugs_hash)
		return -ENOMEM;

	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);

	disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size,
						sizeof(struct blk_zone_wplug));
	if (!disk->zone_wplugs_pool)
		goto free_hash;

	disk->zone_wplugs_wq =
		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI,
				pool_size, disk->disk_name);
	if (!disk->zone_wplugs_wq)
		goto destroy_pool;

	return 0;

destroy_pool:
	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;
free_hash:
	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
	return -ENOMEM;
}

static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
{
	struct blk_zone_wplug *zwplug;
	unsigned int i;

	if (!disk->zone_wplugs_hash)
		return;

	/* Free all the zone write plugs we have. */
	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
		while (!hlist_empty(&disk->zone_wplugs_hash[i])) {
			zwplug = hlist_entry(disk->zone_wplugs_hash[i].first,
					     struct blk_zone_wplug, node);
			atomic_inc(&zwplug->ref);
			disk_remove_zone_wplug(disk, zwplug);
			disk_put_zone_wplug(zwplug);
		}
	}

	kfree(disk->zone_wplugs_hash);
	disk->zone_wplugs_hash = NULL;
	disk->zone_wplugs_hash_bits = 0;
}

void disk_free_zone_resources(struct gendisk *disk)
{
	cancel_work_sync(&disk->zone_wplugs_work);

	if (disk->zone_wplugs_wq) {
		destroy_workqueue(disk->zone_wplugs_wq);
		disk->zone_wplugs_wq = NULL;
	}

	disk_destroy_zone_wplugs_hash_table(disk);

	/*
	 * Wait for the zone write plugs to be RCU-freed before
	 * destroying the mempool.
	 */
	rcu_barrier();

	mempool_destroy(disk->zone_wplugs_pool);
	disk->zone_wplugs_pool = NULL;

	kfree(disk->conv_zones_bitmap);
	disk->conv_zones_bitmap = NULL;
	disk->zone_capacity = 0;
	disk->nr_zones = 0;
}

static inline bool disk_need_zone_resources(struct gendisk *disk)
{
	/*
	 * All mq zoned devices need zone resources so that the block layer
	 * can automatically handle write BIO plugging. BIO-based device drivers
	 * (e.g. DM devices) are normally responsible for handling zone write
	 * ordering and do not need zone resources, unless the driver requires
	 * zone append emulation.
	 */
	return queue_is_mq(disk->queue) ||
		queue_emulates_zone_append(disk->queue);
}

static int disk_revalidate_zone_resources(struct gendisk *disk,
					  unsigned int nr_zones)
{
	struct queue_limits *lim = &disk->queue->limits;
	unsigned int pool_size;

	if (!disk_need_zone_resources(disk))
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE.
	 */
	pool_size = max(lim->max_open_zones, lim->max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);

	if (!disk->zone_wplugs_hash)
		return disk_alloc_zone_resources(disk, pool_size);

	return 0;
}

struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned int	nr_zones;
	unsigned int	zone_capacity;
	sector_t	sector;
};

/*
 * Update the disk zone resources information and device queue limits.
 * The disk queue is frozen when this is executed.
 */
static int disk_update_zone_resources(struct gendisk *disk,
				      struct blk_revalidate_zone_args *args)
{
	struct request_queue *q = disk->queue;
	unsigned int nr_seq_zones, nr_conv_zones = 0;
	unsigned int pool_size;
	struct queue_limits lim;

	disk->nr_zones = args->nr_zones;
	disk->zone_capacity = args->zone_capacity;
	swap(disk->conv_zones_bitmap, args->conv_zones_bitmap);
	if (disk->conv_zones_bitmap)
		nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap,
					      disk->nr_zones);
	if (nr_conv_zones >= disk->nr_zones) {
		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
			disk->disk_name, nr_conv_zones, disk->nr_zones);
		return -ENODEV;
	}

	if (!disk->zone_wplugs_pool)
		return 0;

	/*
	 * If the device has no limit on the maximum number of open and active
	 * zones, set its max open zone limit to the mempool size to indicate
	 * to the user that there is a potential performance impact due to
	 * dynamic zone write plug allocation when simultaneously writing to
	 * more zones than the size of the mempool.
	 */
	lim = queue_limits_start_update(q);

	nr_seq_zones = disk->nr_zones - nr_conv_zones;
	pool_size = max(lim.max_open_zones, lim.max_active_zones);
	if (!pool_size)
		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones);

	mempool_resize(disk->zone_wplugs_pool, pool_size);

	if (!lim.max_open_zones && !lim.max_active_zones) {
		if (pool_size < nr_seq_zones)
			lim.max_open_zones = pool_size;
		else
			lim.max_open_zones = 0;
	}

	return queue_limits_commit_update(q, &lim);
}

d7580149
DLM
1659static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
1660 struct blk_revalidate_zone_args *args)
1661{
1662 struct gendisk *disk = args->disk;
1663 struct request_queue *q = disk->queue;
1664
1665 if (zone->capacity != zone->len) {
1666 pr_warn("%s: Invalid conventional zone capacity\n",
1667 disk->disk_name);
1668 return -ENODEV;
1669 }
1670
1671 if (!disk_need_zone_resources(disk))
1672 return 0;
1673
1674 if (!args->conv_zones_bitmap) {
1675 args->conv_zones_bitmap =
1676 blk_alloc_zone_bitmap(q->node, args->nr_zones);
1677 if (!args->conv_zones_bitmap)
1678 return -ENOMEM;
1679 }
1680
1681 set_bit(idx, args->conv_zones_bitmap);
1682
1683 return 0;
1684}
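Note that the conventional zone bitmap is allocated lazily, when the first conventional zone is reported; a device made only of sequential write required zones never allocates it and disk->conv_zones_bitmap remains NULL.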
1685
1686static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx,
1687 struct blk_revalidate_zone_args *args)
1688{
1689 struct gendisk *disk = args->disk;
1690 struct blk_zone_wplug *zwplug;
1691 unsigned int wp_offset;
1692 unsigned long flags;
1693
1694 /*
1695 * Remember the capacity of the first sequential zone and check
1696 * if it is constant for all zones.
1697 */
1698 if (!args->zone_capacity)
1699 args->zone_capacity = zone->capacity;
1700 if (zone->capacity != args->zone_capacity) {
1701 pr_warn("%s: Invalid variable zone capacity\n",
1702 disk->disk_name);
1703 return -ENODEV;
1704 }
1705
1706 /*
 1707 * We need to track the write pointer of all zones that are neither
 1708 * empty nor full, so make sure we have a zone write plug for such
 1709 * zones if the device has a zone write plug hash table.
1710 */
1711 if (!disk->zone_wplugs_hash)
1712 return 0;
1713
1714 wp_offset = blk_zone_wp_offset(zone);
1715 if (!wp_offset || wp_offset >= zone->capacity)
1716 return 0;
1717
1718 zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags);
1719 if (!zwplug)
1720 return -ENOMEM;
1721 spin_unlock_irqrestore(&zwplug->lock, flags);
1722 disk_put_zone_wplug(zwplug);
1723
1724 return 0;
1725}
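In other words, an empty zone reports a write pointer offset of 0 and a full zone reports an offset at or beyond its capacity, so neither gets a plug preallocated here; only partially written zones (implicitly open, explicitly open, or closed) have their write pointer offset captured in a zone write plug across revalidation.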
1726
d9dd7308
DLM
1727/*
1728 * Helper function to check the validity of zones of a zoned block device.
1729 */
d4100351
CH
1730static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
1731 void *data)
d9dd7308 1732{
d4100351
CH
1733 struct blk_revalidate_zone_args *args = data;
1734 struct gendisk *disk = args->disk;
d9dd7308 1735 sector_t capacity = get_capacity(disk);
d7580149
DLM
1736 sector_t zone_sectors = disk->queue->limits.chunk_sectors;
1737 int ret;
03e51c4a
DLM
1738
1739 /* Check for bad zones and holes in the zone report */
1740 if (zone->start != args->sector) {
1741 pr_warn("%s: Zone gap at sectors %llu..%llu\n",
1742 disk->disk_name, args->sector, zone->start);
1743 return -ENODEV;
1744 }
1745
1746 if (zone->start >= capacity || !zone->len) {
1747 pr_warn("%s: Invalid zone start %llu, length %llu\n",
1748 disk->disk_name, zone->start, zone->len);
1749 return -ENODEV;
1750 }
d9dd7308
DLM
1751
1752 /*
 1753 * All zones must have the same size, with the possible exception of a
 1754 * smaller last zone.
1755 */
03e51c4a
DLM
1756 if (zone->start + zone->len < capacity) {
1757 if (zone->len != zone_sectors) {
6c6b3549
CH
1758 pr_warn("%s: Invalid zoned device with non constant zone size\n",
1759 disk->disk_name);
1760 return -ENODEV;
1761 }
03e51c4a
DLM
1762 } else if (zone->len > zone_sectors) {
1763 pr_warn("%s: Invalid zoned device with larger last zone size\n",
1764 disk->disk_name);
d4100351 1765 return -ENODEV;
d9dd7308
DLM
1766 }
1767
ecfe43b1
DLM
1768 if (!zone->capacity || zone->capacity > zone->len) {
1769 pr_warn("%s: Invalid zone capacity\n",
1770 disk->disk_name);
1771 return -ENODEV;
1772 }
1773
d9dd7308
DLM
1774 /* Check zone type */
1775 switch (zone->type) {
1776 case BLK_ZONE_TYPE_CONVENTIONAL:
d7580149 1777 ret = blk_revalidate_conv_zone(zone, idx, args);
e94f5819 1778 break;
d9dd7308 1779 case BLK_ZONE_TYPE_SEQWRITE_REQ:
d7580149 1780 ret = blk_revalidate_seq_zone(zone, idx, args);
d9dd7308 1781 break;
587371ed 1782 case BLK_ZONE_TYPE_SEQWRITE_PREF:
d9dd7308
DLM
1783 default:
1784 pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
1785 disk->disk_name, (int)zone->type, zone->start);
d7580149 1786 ret = -ENODEV;
d9dd7308
DLM
1787 }
1788
d7580149
DLM
1789 if (!ret)
1790 args->sector += zone->len;
1791
1792 return ret;
d4100351
CH
1793}
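For instance, on a drive reporting chunk_sectors = 524288 (256 MiB zones), every zone except possibly the last must have len == 524288; the last zone may be smaller but never larger, any gap between the end of one zone and the start of the next fails revalidation, and a zone whose capacity is zero or exceeds its length is likewise rejected with -ENODEV.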
1794
bf505456 1795/**
02ccd7c3 1796 * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs
bf505456
DLM
1797 * @disk: Target disk
1798 *
9b3c08b9
DLM
 1799 * Helper function for low-level device drivers to check, (re)allocate and
1800 * initialize resources used for managing zoned disks. This function should
1801 * normally be called by blk-mq based drivers when a zoned gendisk is probed
1802 * and when the zone configuration of the gendisk changes (e.g. after a format).
03e51c4a
DLM
1803 * Before calling this function, the device driver must already have set the
 1804 * device zone size (chunk_sectors limit) and the max zone append limit.
946dd71e
DLM
1805 * BIO based drivers can also use this function as long as the device queue
1806 * can be safely frozen.
bf505456 1807 */
9b3c08b9 1808int blk_revalidate_disk_zones(struct gendisk *disk)
bf505456
DLM
1809{
1810 struct request_queue *q = disk->queue;
03e51c4a
DLM
1811 sector_t zone_sectors = q->limits.chunk_sectors;
1812 sector_t capacity = get_capacity(disk);
1813 struct blk_revalidate_zone_args args = { };
6c6b3549 1814 unsigned int noio_flag;
dd291d77 1815 int ret = -ENOMEM;
bf505456 1816
c98c3d09
CH
1817 if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
1818 return -EIO;
bf505456 1819
03e51c4a
DLM
1820 if (!capacity)
1821 return -ENODEV;
1822
1823 /*
1824 * Checks that the device driver indicated a valid zone size and that
1825 * the max zone append limit is set.
1826 */
1827 if (!zone_sectors || !is_power_of_2(zone_sectors)) {
1828 pr_warn("%s: Invalid non power of two zone size (%llu)\n",
1829 disk->disk_name, zone_sectors);
1830 return -ENODEV;
1831 }
1832
ccdbf0aa 1833 if (!queue_max_zone_append_sectors(q)) {
03e51c4a
DLM
1834 pr_warn("%s: Invalid 0 maximum zone append limit\n",
1835 disk->disk_name);
1836 return -ENODEV;
1837 }
1a1206dc 1838
e94f5819 1839 /*
6c6b3549
CH
1840 * Ensure that all memory allocations in this context are done as if
1841 * GFP_NOIO was specified.
e94f5819 1842 */
03e51c4a
DLM
1843 args.disk = disk;
1844 args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
6c6b3549 1845 noio_flag = memalloc_noio_save();
dd291d77
DLM
1846 ret = disk_revalidate_zone_resources(disk, args.nr_zones);
1847 if (ret) {
1848 memalloc_noio_restore(noio_flag);
1849 return ret;
1850 }
6c6b3549
CH
1851 ret = disk->fops->report_zones(disk, 0, UINT_MAX,
1852 blk_revalidate_zone_cb, &args);
2afdeb23
DLM
1853 if (!ret) {
1854 pr_warn("%s: No zones reported\n", disk->disk_name);
1855 ret = -ENODEV;
1856 }
6c6b3549 1857 memalloc_noio_restore(noio_flag);
bf505456 1858
2afdeb23
DLM
1859 /*
 1860 * If zones were reported, make sure that the entire disk capacity
1861 * has been checked.
1862 */
03e51c4a 1863 if (ret > 0 && args.sector != capacity) {
2afdeb23
DLM
1864 pr_warn("%s: Missing zones from sector %llu\n",
1865 disk->disk_name, args.sector);
1866 ret = -ENODEV;
1867 }
1868
bf505456 1869 /*
02ccd7c3
DLM
1870 * Set the new disk zone parameters only once the queue is frozen and
1871 * all I/Os are completed.
bf505456
DLM
1872 */
1873 blk_mq_freeze_queue(q);
9b3c08b9 1874 if (ret > 0)
843283e9 1875 ret = disk_update_zone_resources(disk, &args);
9b3c08b9 1876 else
bf505456 1877 pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
843283e9
DLM
1878 if (ret)
1879 disk_free_zone_resources(disk);
d4100351 1880 blk_mq_unfreeze_queue(q);
bf505456 1881
f216fdd7 1882 kfree(args.conv_zones_bitmap);
ecfe43b1 1883
bf505456
DLM
1884 return ret;
1885}
1886EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
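As a rough illustration of the calling convention described in the kernel-doc above, a blk-mq driver typically populates the zone-related queue limits when allocating the gendisk and runs the revalidation once its report_zones handler is in place. The sketch below uses hypothetical foo_* names and fields and trims error handling; it is not taken from an existing driver.

#include <linux/blkdev.h>
#include <linux/blk-mq.h>

/* Hypothetical driver state; names are invented for illustration. */
struct foo_zoned_dev {
	struct blk_mq_tag_set	tag_set;
	struct gendisk		*disk;
	unsigned int		zone_sectors;	 /* zone size in 512B sectors, power of two */
	unsigned int		zone_append_max; /* max zone append, in 512B sectors */
	unsigned int		max_open;
	unsigned int		max_active;
};

/* Must provide a .report_zones method for revalidation to work. */
static const struct block_device_operations foo_zoned_fops;

static int foo_zoned_register_disk(struct foo_zoned_dev *dev)
{
	/* Zone size and zone append limit must be set before revalidating. */
	struct queue_limits lim = {
		.zoned			 = true,
		.chunk_sectors		 = dev->zone_sectors,
		.max_zone_append_sectors = dev->zone_append_max,
		.max_open_zones		 = dev->max_open,
		.max_active_zones	 = dev->max_active,
	};
	int ret;

	dev->disk = blk_mq_alloc_disk(&dev->tag_set, &lim, dev);
	if (IS_ERR(dev->disk))
		return PTR_ERR(dev->disk);
	dev->disk->fops = &foo_zoned_fops;

	/* Allocates and initializes the zone write plug resources. */
	ret = blk_revalidate_disk_zones(dev->disk);
	if (ret) {
		put_disk(dev->disk);
		return ret;
	}

	return add_disk(dev->disk);
}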
d9f1439a
DLM
1887
1888#ifdef CONFIG_BLK_DEBUG_FS
1889
a98b05b0 1890int queue_zone_wplugs_show(void *data, struct seq_file *m)
d9f1439a
DLM
1891{
1892 struct request_queue *q = data;
a98b05b0
DLM
1893 struct gendisk *disk = q->disk;
1894 struct blk_zone_wplug *zwplug;
1895 unsigned int zwp_wp_offset, zwp_flags;
1896 unsigned int zwp_zone_no, zwp_ref;
1897 unsigned int zwp_bio_list_size, i;
1898 unsigned long flags;
d9f1439a 1899
57787fa4
JT
1900 if (!disk->zone_wplugs_hash)
1901 return 0;
1902
a98b05b0
DLM
1903 rcu_read_lock();
1904 for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) {
1905 hlist_for_each_entry_rcu(zwplug,
1906 &disk->zone_wplugs_hash[i], node) {
1907 spin_lock_irqsave(&zwplug->lock, flags);
1908 zwp_zone_no = zwplug->zone_no;
1909 zwp_flags = zwplug->flags;
1910 zwp_ref = atomic_read(&zwplug->ref);
1911 zwp_wp_offset = zwplug->wp_offset;
1912 zwp_bio_list_size = bio_list_size(&zwplug->bio_list);
1913 spin_unlock_irqrestore(&zwplug->lock, flags);
d9f1439a 1914
a98b05b0
DLM
1915 seq_printf(m, "%u 0x%x %u %u %u\n",
1916 zwp_zone_no, zwp_flags, zwp_ref,
1917 zwp_wp_offset, zwp_bio_list_size);
1918 }
1919 }
1920 rcu_read_unlock();
d9f1439a
DLM
1921
1922 return 0;
1923}
1924
1925#endif
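When CONFIG_BLK_DEBUG_FS is enabled, this report is exposed through the disk's block debugfs directory (typically as /sys/kernel/debug/block/<disk>/zone_wplugs; the attribute itself is registered from blk-mq-debugfs.c). Each line follows the seq_printf() format above: zone number, plug flags in hexadecimal, reference count, write pointer offset in 512B sectors, and number of plugged BIOs. A line such as

524 0x1 2 1024 3

would thus describe the plug of zone 524 with flags 0x1 (presumably BLK_ZONE_WPLUG_PLUGGED, assuming that flag is bit 0), a reference count of 2, a write pointer 512 KiB into the zone, and three BIOs waiting to be issued.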