--- /dev/null
+From aa1c09cb65e2ed17cb8e652bc7ec84e0af1229eb Mon Sep 17 00:00:00 2001
+From: Damien Le Moal <damien.lemoal@wdc.com>
+Date: Thu, 29 Oct 2020 20:05:00 +0900
+Subject: null_blk: Fix locking in zoned mode
+
+From: Damien Le Moal <damien.lemoal@wdc.com>
+
+commit aa1c09cb65e2ed17cb8e652bc7ec84e0af1229eb upstream.
+
+When the zoned mode is enabled in null_blk, serializing read, write
+and zone management operations for each zone is necessary to protect
+device level information for managing zone resources (zone open and
+closed counters) as well as each zone condition and write pointer
+position. Commit 35bc10b2eafb ("null_blk: synchronization fix for
+zoned device") introduced a spinlock to implement this serialization.
+However, when memory backing is also enabled, GFP_NOIO memory
+allocations are executed under the spinlock, resulting in might_sleep()
+warnings. Furthermore, the zone_lock spinlock is locked/unlocked using
+spin_lock_irq/spin_unlock_irq, similarly to the memory backing code with
+the nullb->lock spinlock. This nested use of irq locks wrecks the irq
+enabled/disabled state.
+
+Fix all this by introducing a bitmap for per-zone lock, with locking
+implemented using wait_on_bit_lock_io() and clear_and_wake_up_bit().
+This locking mechanism allows keeping a zone locked while executing
+null_process_cmd(), serializing all operations to the zone while
+allowing to sleep during memory backing allocation with GFP_NOIO.
+Device level zone resource management information is protected using
+a spinlock which is not held while executing null_process_cmd().
+
+Fixes: 35bc10b2eafb ("null_blk: synchronization fix for zoned device")
+Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+
+---
+ drivers/block/null_blk.h | 2
+ drivers/block/null_blk_zoned.c | 133 +++++++++++++++++++++++++++++------------
+ 2 files changed, 98 insertions(+), 37 deletions(-)
+
+--- a/drivers/block/null_blk.h
++++ b/drivers/block/null_blk.h
+@@ -44,7 +44,7 @@ struct nullb_device {
+ unsigned int nr_zones;
+ struct blk_zone *zones;
+ sector_t zone_size_sects;
+- spinlock_t zone_lock;
++ unsigned long *zone_locks;
+
+ unsigned long size; /* device size in MB */
+ unsigned long completion_nsec; /* time in ns to complete a request */
+--- a/drivers/block/null_blk_zoned.c
++++ b/drivers/block/null_blk_zoned.c
+@@ -1,5 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0
+ #include <linux/vmalloc.h>
++#include <linux/bitmap.h>
+ #include "null_blk.h"
+
+ #define CREATE_TRACE_POINTS
+@@ -45,7 +46,12 @@ int null_init_zoned_dev(struct nullb_dev
+ if (!dev->zones)
+ return -ENOMEM;
+
+- spin_lock_init(&dev->zone_lock);
++ dev->zone_locks = bitmap_zalloc(dev->nr_zones, GFP_KERNEL);
++ if (!dev->zone_locks) {
++ kvfree(dev->zones);
++ return -ENOMEM;
++ }
++
+ if (dev->zone_nr_conv >= dev->nr_zones) {
+ dev->zone_nr_conv = dev->nr_zones - 1;
+ pr_info("changed the number of conventional zones to %u",
+@@ -106,15 +112,26 @@ int null_register_zoned_dev(struct nullb
+
+ void null_free_zoned_dev(struct nullb_device *dev)
+ {
++ bitmap_free(dev->zone_locks);
+ kvfree(dev->zones);
+ }
+
++static inline void null_lock_zone(struct nullb_device *dev, unsigned int zno)
++{
++ wait_on_bit_lock_io(dev->zone_locks, zno, TASK_UNINTERRUPTIBLE);
++}
++
++static inline void null_unlock_zone(struct nullb_device *dev, unsigned int zno)
++{
++ clear_and_wake_up_bit(zno, dev->zone_locks);
++}
++
+ int null_report_zones(struct gendisk *disk, sector_t sector,
+ unsigned int nr_zones, report_zones_cb cb, void *data)
+ {
+ struct nullb *nullb = disk->private_data;
+ struct nullb_device *dev = nullb->dev;
+- unsigned int first_zone, i;
++ unsigned int first_zone, i, zno;
+ struct blk_zone zone;
+ int error;
+
+@@ -125,17 +142,17 @@ int null_report_zones(struct gendisk *di
+ nr_zones = min(nr_zones, dev->nr_zones - first_zone);
+ trace_nullb_report_zones(nullb, nr_zones);
+
+- for (i = 0; i < nr_zones; i++) {
++ zno = first_zone;
++ for (i = 0; i < nr_zones; i++, zno++) {
+ /*
+ * Stacked DM target drivers will remap the zone information by
+ * modifying the zone information passed to the report callback.
+ * So use a local copy to avoid corruption of the device zone
+ * array.
+ */
+- spin_lock_irq(&dev->zone_lock);
+- memcpy(&zone, &dev->zones[first_zone + i],
+- sizeof(struct blk_zone));
+- spin_unlock_irq(&dev->zone_lock);
++ null_lock_zone(dev, zno);
++ memcpy(&zone, &dev->zones[zno], sizeof(struct blk_zone));
++ null_unlock_zone(dev, zno);
+
+ error = cb(&zone, i, data);
+ if (error)
+@@ -145,6 +162,10 @@ int null_report_zones(struct gendisk *di
+ return nr_zones;
+ }
+
++/*
++ * This is called in the case of memory backing from null_process_cmd()
++ * with the target zone already locked.
++ */
+ size_t null_zone_valid_read_len(struct nullb *nullb,
+ sector_t sector, unsigned int len)
+ {
+@@ -176,10 +197,13 @@ static blk_status_t null_zone_write(stru
+ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+ return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
+
++ null_lock_zone(dev, zno);
++
+ switch (zone->cond) {
+ case BLK_ZONE_COND_FULL:
+ /* Cannot write to a full zone */
+- return BLK_STS_IOERR;
++ ret = BLK_STS_IOERR;
++ break;
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_IMP_OPEN:
+ case BLK_ZONE_COND_EXP_OPEN:
+@@ -197,68 +221,96 @@ static blk_status_t null_zone_write(stru
+ else
+ cmd->rq->__sector = sector;
+ } else if (sector != zone->wp) {
+- return BLK_STS_IOERR;
++ ret = BLK_STS_IOERR;
++ break;
+ }
+
+- if (zone->wp + nr_sectors > zone->start + zone->capacity)
+- return BLK_STS_IOERR;
++ if (zone->wp + nr_sectors > zone->start + zone->capacity) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
+
+ if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
+ zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+ ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
+ if (ret != BLK_STS_OK)
+- return ret;
++ break;
+
+ zone->wp += nr_sectors;
+ if (zone->wp == zone->start + zone->capacity)
+ zone->cond = BLK_ZONE_COND_FULL;
+- return BLK_STS_OK;
++ ret = BLK_STS_OK;
++ break;
+ default:
+ /* Invalid zone condition */
+- return BLK_STS_IOERR;
++ ret = BLK_STS_IOERR;
+ }
++
++ null_unlock_zone(dev, zno);
++
++ return ret;
+ }
+
+ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
+ sector_t sector)
+ {
+ struct nullb_device *dev = cmd->nq->dev;
+- unsigned int zone_no = null_zone_no(dev, sector);
+- struct blk_zone *zone = &dev->zones[zone_no];
++ unsigned int zone_no;
++ struct blk_zone *zone;
++ blk_status_t ret = BLK_STS_OK;
+ size_t i;
+
+- switch (op) {
+- case REQ_OP_ZONE_RESET_ALL:
++ if (op == REQ_OP_ZONE_RESET_ALL) {
+ for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
++ null_lock_zone(dev, i);
+ zone = &dev->zones[i];
+ if (zone->cond != BLK_ZONE_COND_EMPTY) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ trace_nullb_zone_op(cmd, i, zone->cond);
+ }
++ null_unlock_zone(dev, i);
+ }
+ return BLK_STS_OK;
++ }
++
++ zone_no = null_zone_no(dev, sector);
++ zone = &dev->zones[zone_no];
++
++ null_lock_zone(dev, zone_no);
++
++ switch (op) {
+ case REQ_OP_ZONE_RESET:
+- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+- return BLK_STS_IOERR;
++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
+
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ break;
+ case REQ_OP_ZONE_OPEN:
+- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+- return BLK_STS_IOERR;
+- if (zone->cond == BLK_ZONE_COND_FULL)
+- return BLK_STS_IOERR;
++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
++ if (zone->cond == BLK_ZONE_COND_FULL) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
+
+ zone->cond = BLK_ZONE_COND_EXP_OPEN;
+ break;
+ case REQ_OP_ZONE_CLOSE:
+- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+- return BLK_STS_IOERR;
+- if (zone->cond == BLK_ZONE_COND_FULL)
+- return BLK_STS_IOERR;
++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
++ if (zone->cond == BLK_ZONE_COND_FULL) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
+
+ if (zone->wp == zone->start)
+ zone->cond = BLK_ZONE_COND_EMPTY;
+@@ -266,27 +318,35 @@ static blk_status_t null_zone_mgmt(struc
+ zone->cond = BLK_ZONE_COND_CLOSED;
+ break;
+ case REQ_OP_ZONE_FINISH:
+- if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+- return BLK_STS_IOERR;
++ if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
++ ret = BLK_STS_IOERR;
++ break;
++ }
+
+ zone->cond = BLK_ZONE_COND_FULL;
+ zone->wp = zone->start + zone->len;
++ ret = BLK_STS_OK;
+ break;
+ default:
+- return BLK_STS_NOTSUPP;
++ ret = BLK_STS_NOTSUPP;
++ break;
+ }
+
+- trace_nullb_zone_op(cmd, zone_no, zone->cond);
+- return BLK_STS_OK;
++ if (ret == BLK_STS_OK)
++ trace_nullb_zone_op(cmd, zone_no, zone->cond);
++
++ null_unlock_zone(dev, zone_no);
++
++ return ret;
+ }
+
+ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
+ sector_t sector, sector_t nr_sectors)
+ {
+- blk_status_t sts;
+ struct nullb_device *dev = cmd->nq->dev;
++ unsigned int zno = null_zone_no(dev, sector);
++ blk_status_t sts;
+
+- spin_lock_irq(&dev->zone_lock);
+ switch (op) {
+ case REQ_OP_WRITE:
+ sts = null_zone_write(cmd, sector, nr_sectors, false);
+@@ -302,9 +362,10 @@ blk_status_t null_process_zoned_cmd(stru
+ sts = null_zone_mgmt(cmd, op, sector);
+ break;
+ default:
++ null_lock_zone(dev, zno);
+ sts = null_process_cmd(cmd, op, sector, nr_sectors);
++ null_unlock_zone(dev, zno);
+ }
+- spin_unlock_irq(&dev->zone_lock);
+
+ return sts;
+ }