return queue_var_show(disk_nr_zones(disk), page);
}
+/* sysfs show: report whether QD=1 zoned write handling is enabled (0 or 1). */
+static ssize_t queue_zoned_qd1_writes_show(struct gendisk *disk, char *page)
+{
+ unsigned long enabled = blk_queue_zoned_qd1_writes(disk->queue) ? 1 : 0;
+
+ return queue_var_show(enabled, page);
+}
+
+/*
+ * sysfs store: enable or disable QD=1 zoned write handling. The queue is
+ * frozen and quiesced around the flag change so that the mode never flips
+ * while requests are in flight or being dispatched.
+ */
+static ssize_t queue_zoned_qd1_writes_store(struct gendisk *disk,
+ const char *page, size_t count)
+{
+ struct request_queue *q = disk->queue;
+ unsigned int memflags;
+ unsigned long val;
+ ssize_t ret;
+
+ ret = queue_var_store(&val, page, count);
+ if (ret < 0)
+ return ret;
+
+ memflags = blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+ if (!val)
+ blk_queue_flag_clear(QUEUE_FLAG_ZONED_QD1_WRITES, q);
+ else
+ blk_queue_flag_set(QUEUE_FLAG_ZONED_QD1_WRITES, q);
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q, memflags);
+
+ return count;
+}
+
static ssize_t queue_iostats_passthrough_show(struct gendisk *disk, char *page)
{
return queue_var_show(!!blk_queue_passthrough_stat(disk->queue), page);
QUEUE_LIM_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
QUEUE_LIM_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_RW_ENTRY(queue_zoned_qd1_writes, "zoned_qd1_writes");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
QUEUE_LIM_RO_ENTRY(queue_max_open_zones, "max_open_zones");
QUEUE_LIM_RO_ENTRY(queue_max_active_zones, "max_active_zones");
&queue_nomerges_entry.attr,
&queue_poll_entry.attr,
&queue_poll_delay_entry.attr,
+ &queue_zoned_qd1_writes_entry.attr,
NULL,
};
struct request_queue *q = disk->queue;
if ((attr == &queue_max_open_zones_entry.attr ||
- attr == &queue_max_active_zones_entry.attr) &&
+ attr == &queue_max_active_zones_entry.attr ||
+ attr == &queue_zoned_qd1_writes_entry.attr) &&
!blk_queue_is_zoned(q))
return 0;
#include <linux/spinlock.h>
#include <linux/refcount.h>
#include <linux/mempool.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <trace/events/block.h>
/*
* Per-zone write plug.
* @node: hlist_node structure for managing the plug using a hash table.
+ * @entry: list_head structure for listing the plug in the disk list of active
+ * zone write plugs.
* @bio_list: The list of BIOs that are currently plugged.
* @bio_work: Work struct to handle issuing of plugged BIOs
* @rcu_head: RCU head to free zone write plugs with an RCU grace period.
*/
struct blk_zone_wplug {
struct hlist_node node;
+ struct list_head entry;
struct bio_list bio_list;
struct work_struct bio_work;
struct rcu_head rcu_head;
}
}
-static void blk_zone_wplug_bio_work(struct work_struct *work);
+static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug);
+
+/*
+ * Per zone write plug work function (non-QD1 path): submit one plugged
+ * BIO for the plug that scheduled this work, then drop the reference the
+ * scheduler took on the plug.
+ */
+static void blk_zone_wplug_bio_work(struct work_struct *work)
+{
+ struct blk_zone_wplug *zwplug =
+ container_of(work, struct blk_zone_wplug, bio_work);
+
+ disk_zone_wplug_submit_bio(zwplug->disk, zwplug);
+
+ /* Drop the reference we took in disk_zone_wplug_schedule_work(). */
+ disk_put_zone_wplug(zwplug);
+}
/*
* Get a zone write plug for the zone containing @sector.
zwplug->wp_offset = bdev_offset_from_zone_start(disk->part0, sector);
bio_list_init(&zwplug->bio_list);
INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+ INIT_LIST_HEAD(&zwplug->entry);
zwplug->disk = disk;
/*
*/
static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug)
{
+ struct gendisk *disk = zwplug->disk;
struct bio *bio;
lockdep_assert_held(&zwplug->lock);
blk_zone_wplug_bio_io_error(zwplug, bio);
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+ /*
+ * If we are using the per disk zone write plugs worker thread, remove
+ * the zone write plug from the work list and drop the reference we
+ * took when the zone write plug was added to that list.
+ */
+ if (blk_queue_zoned_qd1_writes(disk->queue)) {
+ spin_lock(&disk->zone_wplugs_list_lock);
+ if (!list_empty(&zwplug->entry)) {
+ list_del_init(&zwplug->entry);
+ disk_put_zone_wplug(zwplug);
+ }
+ spin_unlock(&disk->zone_wplugs_list_lock);
+ }
}
/*
}
}
-static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk,
- struct blk_zone_wplug *zwplug)
+static void disk_zone_wplug_schedule_work(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
{
lockdep_assert_held(&zwplug->lock);
* and we also drop this reference if the work is already scheduled.
*/
WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED));
+ WARN_ON_ONCE(blk_queue_zoned_qd1_writes(disk->queue));
refcount_inc(&zwplug->ref);
if (!queue_work(disk->zone_wplugs_wq, &zwplug->bio_work))
disk_put_zone_wplug(zwplug);
bio_list_add(&zwplug->bio_list, bio);
trace_disk_zone_wplug_add_bio(zwplug->disk->queue, zwplug->zone_no,
bio->bi_iter.bi_sector, bio_sectors(bio));
+
+ /*
+ * If we are using the disk zone write plugs worker instead of the per
+ * zone write plug BIO work, add the zone write plug to the work list
+ * if it is not already there. Make sure to also get an extra reference
+ * on the zone write plug so that it does not go away until it is
+ * removed from the work list.
+ */
+ if (blk_queue_zoned_qd1_writes(disk->queue)) {
+ spin_lock(&disk->zone_wplugs_list_lock);
+ if (list_empty(&zwplug->entry)) {
+ list_add_tail(&zwplug->entry, &disk->zone_wplugs_list);
+ refcount_inc(&zwplug->ref);
+ }
+ spin_unlock(&disk->zone_wplugs_list_lock);
+ }
}
/*
goto queue_bio;
}
+ /*
+ * For rotational devices, we will use the gendisk zone write plugs
+ * work instead of the per zone write plug BIO work, so queue the BIO.
+ */
+ if (blk_queue_zoned_qd1_writes(disk->queue))
+ goto queue_bio;
+
/* If the zone is already plugged, add the BIO to the BIO plug list. */
if (zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)
goto queue_bio;
if (!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)) {
zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED;
- disk_zone_wplug_schedule_bio_work(disk, zwplug);
+ if (blk_queue_zoned_qd1_writes(disk->queue))
+ wake_up_process(disk->zone_wplugs_worker);
+ else
+ disk_zone_wplug_schedule_work(disk, zwplug);
}
spin_unlock_irqrestore(&zwplug->lock, flags);
spin_lock_irqsave(&zwplug->lock, flags);
- /* Schedule submission of the next plugged BIO if we have one. */
- if (!bio_list_empty(&zwplug->bio_list)) {
- disk_zone_wplug_schedule_bio_work(disk, zwplug);
- spin_unlock_irqrestore(&zwplug->lock, flags);
- return;
- }
+ /*
+ * For rotational devices, signal the BIO completion to the zone write
+ * plug work. Otherwise, schedule submission of the next plugged BIO
+ * if we have one.
+ */
+ if (bio_list_empty(&zwplug->bio_list))
+ zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
+
+ if (blk_queue_zoned_qd1_writes(disk->queue))
+ complete(&disk->zone_wplugs_worker_bio_done);
+ else if (!bio_list_empty(&zwplug->bio_list))
+ disk_zone_wplug_schedule_work(disk, zwplug);
- zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
if (!zwplug->wp_offset || disk_zone_wplug_is_full(disk, zwplug))
disk_mark_zone_wplug_dead(zwplug);
+
spin_unlock_irqrestore(&zwplug->lock, flags);
}
disk_put_zone_wplug(zwplug);
}
-static void blk_zone_wplug_bio_work(struct work_struct *work)
+static bool disk_zone_wplug_submit_bio(struct gendisk *disk,
+ struct blk_zone_wplug *zwplug)
{
- struct blk_zone_wplug *zwplug =
- container_of(work, struct blk_zone_wplug, bio_work);
struct block_device *bdev;
unsigned long flags;
struct bio *bio;
if (!bio) {
zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED;
spin_unlock_irqrestore(&zwplug->lock, flags);
- goto put_zwplug;
+ return false;
}
trace_blk_zone_wplug_bio(zwplug->disk->queue, zwplug->zone_no,
goto again;
}
- bdev = bio->bi_bdev;
-
/*
* blk-mq devices will reuse the extra reference on the request queue
* usage counter we took when the BIO was plugged, but the submission
* path for BIO-based devices will not do that. So drop this extra
* reference here.
*/
+ if (blk_queue_zoned_qd1_writes(disk->queue))
+ reinit_completion(&disk->zone_wplugs_worker_bio_done);
+ bdev = bio->bi_bdev;
if (bdev_test_flag(bdev, BD_HAS_SUBMIT_BIO)) {
bdev->bd_disk->fops->submit_bio(bio);
blk_queue_exit(bdev->bd_disk->queue);
blk_mq_submit_bio(bio);
}
-put_zwplug:
- /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */
- disk_put_zone_wplug(zwplug);
+ return true;
+}
+
+/*
+ * Pop the first zone write plug from the disk work list, or return NULL
+ * if the list is empty. The plug is returned with the reference that was
+ * taken when it was added to the list still held; the caller must drop
+ * that reference with disk_put_zone_wplug() when done.
+ */
+static struct blk_zone_wplug *disk_get_zone_wplugs_work(struct gendisk *disk)
+{
+ struct blk_zone_wplug *zwplug;
+
+ spin_lock_irq(&disk->zone_wplugs_list_lock);
+ zwplug = list_first_entry_or_null(&disk->zone_wplugs_list,
+ struct blk_zone_wplug, entry);
+ if (zwplug)
+ list_del_init(&zwplug->entry);
+ spin_unlock_irq(&disk->zone_wplugs_list_lock);
+
+ return zwplug;
+}
+
+/*
+ * Disk-wide worker thread used when QD=1 zoned writes are enabled,
+ * instead of the per zone write plug BIO work: it takes zone write plugs
+ * off the disk work list one at a time and drains each plug one BIO at a
+ * time, waiting for every submitted BIO to complete before issuing the
+ * next one.
+ */
+static int disk_zone_wplugs_worker(void *data)
+{
+ struct gendisk *disk = data;
+ struct blk_zone_wplug *zwplug;
+ unsigned int noio_flag;
+
+ /* NOTE(review): noio scope presumably guards against I/O recursion
+ * from allocations done while submitting BIOs — confirm. */
+ noio_flag = memalloc_noio_save();
+ set_user_nice(current, MIN_NICE);
+ set_freezable();
+
+ for (;;) {
+ /* Arm sleep state BEFORE checking for work to avoid missed wakeups. */
+ set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
+
+ zwplug = disk_get_zone_wplugs_work(disk);
+ if (zwplug) {
+ /*
+ * Process all BIOs of this zone write plug and then
+ * drop the reference we took when adding the zone write
+ * plug to the active list.
+ */
+ set_current_state(TASK_RUNNING);
+ while (disk_zone_wplug_submit_bio(disk, zwplug))
+ blk_wait_io(&disk->zone_wplugs_worker_bio_done);
+ disk_put_zone_wplug(zwplug);
+ continue;
+ }
+
+ /*
+ * Only sleep if nothing sets the state to running. Else check
+ * for zone write plugs work again as a newly submitted BIO
+ * might have added a zone write plug to the work list.
+ */
+ if (get_current_state() == TASK_RUNNING) {
+ try_to_freeze();
+ } else {
+ /* Still sleepy: honor a pending kthread_stop() first. */
+ if (kthread_should_stop()) {
+ set_current_state(TASK_RUNNING);
+ break;
+ }
+ schedule();
+ }
+ }
+
+ /* By the time we are stopped, the work list must have been drained. */
+ WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
+ memalloc_noio_restore(noio_flag);
+
+ return 0;
}
void disk_init_zone_resources(struct gendisk *disk)
{
 spin_lock_init(&disk->zone_wplugs_hash_lock);
+ spin_lock_init(&disk->zone_wplugs_list_lock); /* protects zone_wplugs_list */
+ INIT_LIST_HEAD(&disk->zone_wplugs_list); /* QD=1 worker's pending plug list */
+ init_completion(&disk->zone_wplugs_worker_bio_done); /* signaled per BIO completion */
}
/*
unsigned int pool_size)
{
unsigned int i;
+ int ret = -ENOMEM;
atomic_set(&disk->nr_zone_wplugs, 0);
disk->zone_wplugs_hash_bits =
if (!disk->zone_wplugs_wq)
goto destroy_pool;
+ disk->zone_wplugs_worker =
+ kthread_create(disk_zone_wplugs_worker, disk,
+ "%s_zwplugs_worker", disk->disk_name);
+ if (IS_ERR(disk->zone_wplugs_worker)) {
+ ret = PTR_ERR(disk->zone_wplugs_worker);
+ disk->zone_wplugs_worker = NULL;
+ goto destroy_wq;
+ }
+ wake_up_process(disk->zone_wplugs_worker);
+
return 0;
+destroy_wq:
+ destroy_workqueue(disk->zone_wplugs_wq);
+ disk->zone_wplugs_wq = NULL;
destroy_pool:
mempool_destroy(disk->zone_wplugs_pool);
disk->zone_wplugs_pool = NULL;
kfree(disk->zone_wplugs_hash);
disk->zone_wplugs_hash = NULL;
disk->zone_wplugs_hash_bits = 0;
- return -ENOMEM;
+ return ret;
}
static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
void disk_free_zone_resources(struct gendisk *disk)
{
+ if (disk->zone_wplugs_worker)
+ kthread_stop(disk->zone_wplugs_worker);
+ WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
+
if (disk->zone_wplugs_wq) {
destroy_workqueue(disk->zone_wplugs_wq);
disk->zone_wplugs_wq = NULL;