nvme-multipath: Add visibility for round-robin io-policy
author     Nilay Shroff <nilay@linux.ibm.com>
           Sun, 12 Jan 2025 12:41:44 +0000 (18:11 +0530)
committer  Keith Busch <kbusch@kernel.org>
           Thu, 20 Mar 2025 23:53:54 +0000 (16:53 -0700)
This patch adds NVMe native multipath visibility for the round-robin
io-policy. It creates a "multipath" sysfs directory under the head gendisk
device node directory and then, from the "multipath" directory, adds a link
to each namespace path device the head node refers to.

For instance, if we have a shared namespace accessible from two different
controllers/paths, then we create a soft link to each path device from the
head disk node, as shown below:

$ ls -l /sys/block/nvme1n1/multipath/
nvme1c1n1 -> ../../../../../pci052e:78/052e:78:00.0/nvme/nvme1/nvme1c1n1
nvme1c3n1 -> ../../../../../pci058e:78/058e:78:00.0/nvme/nvme3/nvme1c3n1

In the above example, nvme1n1 is the head gendisk node created for a shared
namespace, and the namespace is accessible through the nvme1c1n1 and
nvme1c3n1 paths.

For the round-robin I/O policy, we can easily infer from the above output
that I/O targeted at nvme1n1 would toggle across the paths nvme1c1n1 and
nvme1c3n1.
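
For reference, the io-policy in effect can be inspected and changed through
the subsystem's existing "iopolicy" sysfs attribute (the subsystem name
nvme-subsys1 below is illustrative):

$ cat /sys/class/nvme-subsystem/nvme-subsys1/iopolicy
numa
$ echo round-robin > /sys/class/nvme-subsystem/nvme-subsys1/iopolicy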

Reviewed-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
drivers/nvme/host/core.c
drivers/nvme/host/multipath.c
drivers/nvme/host/nvme.h
drivers/nvme/host/sysfs.c

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 818d4e49aab51c388af9a48bf9d466fea9cef51b..870314c521077ccec9a801d600ad681b8a6905ee 100644
@@ -4020,6 +4020,9 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 
        if (!nvme_ns_head_multipath(ns->head))
                nvme_cdev_del(&ns->cdev, &ns->cdev_device);
+
+       nvme_mpath_remove_sysfs_link(ns);
+
        del_gendisk(ns->disk);
 
        mutex_lock(&ns->ctrl->namespaces_lock);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 2a7635565083046c575efe1793362ae10581defd..7c89c8da46d6ac0a6cd7a06bb26c36c3cde20dee 100644
@@ -686,6 +686,8 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
                kblockd_schedule_work(&head->partition_scan_work);
        }
 
+       nvme_mpath_add_sysfs_link(ns->head);
+
        mutex_lock(&head->lock);
        if (nvme_path_is_optimized(ns)) {
                int node, srcu_idx;
@@ -768,6 +770,25 @@ static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
        if (nvme_state_is_live(ns->ana_state) &&
            nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
                nvme_mpath_set_live(ns);
+       else {
+       /*
+        * Add a sysfs link from the multipath head gendisk node to the
+        * path device gendisk node.
+        * If the path's ana state is live (i.e. either optimized or
+        * non-optimized) when we allocate the ns, then the sysfs link is
+        * created from nvme_mpath_set_live(). In that case we don't fall
+        * through to this code path. However, for any ana state other
+        * than live, we call nvme_mpath_set_live() only after the ana
+        * state transitions to live, yet we still want to create the
+        * sysfs link from the head node to a path device irrespective
+        * of the path's ana state.
+        * So if we reach here, the path's ana state is not live, but we
+        * still create the sysfs link to this path from the head node,
+        * provided the head node of the path has already come alive.
+        */
+               if (test_bit(NVME_NSHEAD_DISK_LIVE, &ns->head->flags))
+                       nvme_mpath_add_sysfs_link(ns->head);
+       }
 }
 
 static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -967,6 +988,84 @@ static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
        return -ENXIO; /* just break out of the loop */
 }
 
+void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
+{
+       struct device *target;
+       int rc, srcu_idx;
+       struct nvme_ns *ns;
+       struct kobject *kobj;
+
+       /*
+        * Ensure the head disk node is already added, otherwise we may get
+        * an invalid kobj for the head disk node.
+        */
+       if (!test_bit(GD_ADDED, &head->disk->state))
+               return;
+
+       kobj = &disk_to_dev(head->disk)->kobj;
+
+       /*
+        * Loop through each ns chained through head->list and create the
+        * sysfs link from the head node to the ns path node.
+        */
+       srcu_idx = srcu_read_lock(&head->srcu);
+
+       list_for_each_entry_rcu(ns, &head->list, siblings) {
+               /*
+                * Avoid creating the link if it already exists for the given
+                * path. When the path's ana state transitions from optimized
+                * to non-optimized or vice-versa, nvme_mpath_set_live() is
+                * invoked, which in turn calls this function. If the sysfs
+                * link already exists for the given path and we attempt to
+                * re-create it, the sysfs code warns about it loudly. So we
+                * evaluate the NVME_NS_SYSFS_ATTR_LINK flag here to ensure
+                * that we're not creating a duplicate link.
+                * test_and_set_bit() is used because it protects against
+                * multiple nvme paths being added simultaneously.
+                */
+               if (test_and_set_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
+                       continue;
+
+               /*
+                * Ensure the ns path disk node is already added, otherwise we
+                * may get an invalid kobj name for the target.
+                */
+               if (!test_bit(GD_ADDED, &ns->disk->state))
+                       continue;
+
+               target = disk_to_dev(ns->disk);
+               /*
+                * Create sysfs link from head gendisk kobject @kobj to the
+                * ns path gendisk kobject @target->kobj.
+                */
+               rc = sysfs_add_link_to_group(kobj, nvme_ns_mpath_attr_group.name,
+                               &target->kobj, dev_name(target));
+               if (unlikely(rc)) {
+                       dev_err(disk_to_dev(ns->head->disk),
+                                       "failed to create link to %s\n",
+                                       dev_name(target));
+                       clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
+               }
+       }
+
+       srcu_read_unlock(&head->srcu, srcu_idx);
+}
+
+void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
+{
+       struct device *target;
+       struct kobject *kobj;
+
+       if (!test_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags))
+               return;
+
+       target = disk_to_dev(ns->disk);
+       kobj = &disk_to_dev(ns->head->disk)->kobj;
+       sysfs_remove_link_from_group(kobj, nvme_ns_mpath_attr_group.name,
+                       dev_name(target));
+       clear_bit(NVME_NS_SYSFS_ATTR_LINK, &ns->flags);
+}
+
 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 {
        if (nvme_ctrl_use_ana(ns->ctrl)) {
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index daa4c91e4848ce37e54c82955501ff9ecc8ddfa6..1eeb782947db99dd1d14b59aee985ec6586607f7 100644
@@ -534,10 +534,11 @@ struct nvme_ns {
        struct nvme_ns_head *head;
 
        unsigned long flags;
-#define NVME_NS_REMOVING       0
-#define NVME_NS_ANA_PENDING    2
-#define NVME_NS_FORCE_RO       3
-#define NVME_NS_READY          4
+#define NVME_NS_REMOVING               0
+#define NVME_NS_ANA_PENDING            2
+#define NVME_NS_FORCE_RO               3
+#define NVME_NS_READY                  4
+#define NVME_NS_SYSFS_ATTR_LINK        5
 
        struct cdev             cdev;
        struct device           cdev_device;
@@ -933,6 +934,7 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo);
 int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 
 extern const struct attribute_group *nvme_ns_attr_groups[];
+extern const struct attribute_group nvme_ns_mpath_attr_group;
 extern const struct pr_ops nvme_pr_ops;
 extern const struct block_device_operations nvme_ns_head_ops;
 extern const struct attribute_group nvme_dev_attrs_group;
@@ -955,6 +957,8 @@ void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
 void nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
+void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head);
+void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns);
 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
 int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
@@ -1009,6 +1013,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid)
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 }
+static inline void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
+{
+}
+static inline void nvme_mpath_remove_sysfs_link(struct nvme_ns *ns)
+{
+}
 static inline bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
        return false;
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index 52683943be152f2b753a7e5ca909e45fac251aaf..477c123a4b86a8220ec5c6205137b457017ae336 100644
@@ -299,8 +299,22 @@ static const struct attribute_group nvme_ns_attr_group = {
        .is_visible     = nvme_ns_attrs_are_visible,
 };
 
+#ifdef CONFIG_NVME_MULTIPATH
+static struct attribute *nvme_ns_mpath_attrs[] = {
+       NULL,
+};
+
+const struct attribute_group nvme_ns_mpath_attr_group = {
+       .name           = "multipath",
+       .attrs          = nvme_ns_mpath_attrs,
+};
+#endif
+
 const struct attribute_group *nvme_ns_attr_groups[] = {
        &nvme_ns_attr_group,
+#ifdef CONFIG_NVME_MULTIPATH
+       &nvme_ns_mpath_attr_group,
+#endif
        NULL,
 };
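
To exercise the removal path, detach one of the controllers and confirm that
its link disappears from the "multipath" directory. A sketch, reusing the PCI
addresses from the example above (unbinding the second controller removes its
path and hence the nvme1c3n1 link):

$ ls /sys/block/nvme1n1/multipath/
nvme1c1n1  nvme1c3n1
$ echo 058e:78:00.0 > /sys/bus/pci/drivers/nvme/unbind
$ ls /sys/block/nvme1n1/multipath/
nvme1c1n1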