break;
}
}
- return false;
+
+ /*
+ * If "head->delayed_removal_secs" is configured (i.e., non-zero), do
+ * not immediately fail I/O. Instead, requeue the I/O for the configured
+ * duration, anticipating that a transient link failure may recover
+ * within this time window. This parameter is exported to userspace
+ * via sysfs and defaults to zero. It is internally mapped to
+ * NVME_NSHEAD_QUEUE_IF_NO_PATH: the flag is set while
+ * delayed_removal_secs is non-zero and cleared when it is zero.
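+ * For example, "echo 30 > /sys/block/nvmeXnY/delayed_removal_secs"
+ * (nvmeXnY being the head node's disk name) keeps I/O queued for up
+ * to 30 seconds after the last path is lost.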
+ */
+ return nvme_mpath_queue_if_no_path(head);
}
static void nvme_ns_head_submit_bio(struct bio *bio)
}
}
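+/*
+ * Tear down the multipath head node: fail any requeued I/O, remove the
+ * per-head char device and gendisk, and drop the reference taken in
+ * nvme_mpath_alloc_disk().
+ */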
+static void nvme_remove_head(struct nvme_ns_head *head)
+{
+ if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ /*
+ * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
+ * to allow multipath to fail all I/O.
+ */
+ kblockd_schedule_work(&head->requeue_work);
+
+ nvme_cdev_del(&head->cdev, &head->cdev_device);
+ synchronize_srcu(&head->srcu);
+ del_gendisk(head->disk);
+ nvme_put_ns_head(head);
+ }
+}
+
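+/*
+ * Runs once delayed_removal_secs has expired. If no new path showed up
+ * in the meantime (head->list is still empty), unlink the head from the
+ * subsystem and remove it.
+ */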
+static void nvme_remove_head_work(struct work_struct *work)
+{
+ struct nvme_ns_head *head = container_of(to_delayed_work(work),
+ struct nvme_ns_head, remove_work);
+ bool shutdown = false;
+
+ mutex_lock(&head->subsys->lock);
+ if (list_empty(&head->list)) {
+ list_del_init(&head->entry);
+ shutdown = true;
+ }
+ mutex_unlock(&head->subsys->lock);
+ if (shutdown)
+ nvme_remove_head(head);
+
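+ /* Drop the module reference taken in nvme_mpath_shutdown_disk(). */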
+ module_put(THIS_MODULE);
+}
+
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
struct queue_limits lim;
spin_lock_init(&head->requeue_lock);
INIT_WORK(&head->requeue_work, nvme_requeue_work);
INIT_WORK(&head->partition_scan_work, nvme_partition_scan_work);
+ INIT_DELAYED_WORK(&head->remove_work, nvme_remove_head_work);
+ head->delayed_removal_secs = 0;
/*
* Add a multipath node if the subsystem supports multiple controllers.
set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
sprintf(head->disk->disk_name, "nvme%dn%d",
ctrl->subsys->instance, head->instance);
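+ /*
+ * Take an extra reference on the head; it is dropped by
+ * nvme_remove_head() once the (possibly delayed) removal runs.
+ */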
+ nvme_tryget_ns_head(head);
return 0;
}
}
DEVICE_ATTR_RO(numa_nodes);
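+/*
+ * Reads and writes of delayed_removal_secs are serialized against the
+ * shutdown path by head->subsys->lock.
+ */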
+static ssize_t delayed_removal_secs_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ int ret;
+
+ mutex_lock(&head->subsys->lock);
+ ret = sysfs_emit(buf, "%u\n", head->delayed_removal_secs);
+ mutex_unlock(&head->subsys->lock);
+ return ret;
+}
+
+static ssize_t delayed_removal_secs_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ struct nvme_ns_head *head = disk->private_data;
+ unsigned int sec;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &sec);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&head->subsys->lock);
+ head->delayed_removal_secs = sec;
+ if (sec)
+ set_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ else
+ clear_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+ mutex_unlock(&head->subsys->lock);
+ /*
+ * Ensure that the update to NVME_NSHEAD_QUEUE_IF_NO_PATH is
+ * seen by its readers.
+ */
+ synchronize_srcu(&head->srcu);
+
+ return count;
+}
+
+DEVICE_ATTR_RW(delayed_removal_secs);
+
static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
{
- if (!head->disk)
- return;
- if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
- nvme_cdev_del(&head->cdev, &head->cdev_device);
+ bool shutdown = false;
+
+ mutex_lock(&head->subsys->lock);
+ /*
+ * We are called when all paths have been removed, and at that point
+ * head->list is expected to be empty. However, nvme_remove_ns() and
+ * nvme_init_ns_head() can run concurrently, so if
+ * head->delayed_removal_secs is configured it is possible that, by
+ * the time we reach this point, head->list is no longer empty.
+ * Therefore recheck head->list here; if it is no longer empty, skip
+ * enqueuing the delayed head removal work.
+ */
+ if (!list_empty(&head->list))
+ goto out;
+
+ if (head->delayed_removal_secs) {
/*
- * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
- * to allow multipath to fail all I/O.
+ * Ensure that the module cannot be unloaded while the head
+ * removal work is pending.
*/
- synchronize_srcu(&head->srcu);
- kblockd_schedule_work(&head->requeue_work);
- del_gendisk(head->disk);
+ if (!try_module_get(THIS_MODULE))
+ goto out;
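+ /* Defer the actual head removal by the configured grace period. */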
+ queue_delayed_work(nvme_wq, &head->remove_work,
+ head->delayed_removal_secs * HZ);
+ } else {
+ list_del_init(&head->entry);
+ shutdown = true;
}
+out:
+ mutex_unlock(&head->subsys->lock);
+ if (shutdown)
+ nvme_remove_head(head);
}
void nvme_mpath_remove_disk(struct nvme_ns_head *head)
struct work_struct partition_scan_work;
struct mutex lock;
unsigned long flags;
-#define NVME_NSHEAD_DISK_LIVE 0
+ struct delayed_work remove_work;
+ unsigned int delayed_removal_secs;
+#define NVME_NSHEAD_DISK_LIVE 0
+#define NVME_NSHEAD_QUEUE_IF_NO_PATH 1
struct nvme_ns __rcu *current_path[];
#endif
};
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute dev_attr_queue_depth;
extern struct device_attribute dev_attr_numa_nodes;
+extern struct device_attribute dev_attr_delayed_removal_secs;
extern struct device_attribute subsys_attr_iopolicy;
static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
{
return disk->fops == &nvme_ns_head_ops;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ return test_bit(NVME_NSHEAD_QUEUE_IF_NO_PATH, &head->flags);
+}
#else
#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
return false;
}
+static inline bool nvme_mpath_queue_if_no_path(struct nvme_ns_head *head)
+{
+ return false;
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_ns_get_unique_id(struct nvme_ns *ns, u8 id[16],