]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
nvme: export controller reconnect event count via sysfs
author Nilay Shroff <nilay@linux.ibm.com>
Sat, 16 May 2026 18:36:55 +0000 (00:06 +0530)
committer Keith Busch <kbusch@kernel.org>
Thu, 4 Jun 2026 08:57:40 +0000 (01:57 -0700)
When an NVMe-oF link goes down, the driver attempts to recover the
connection by repeatedly reconnecting to the remote controller at
configured intervals. A maximum number of reconnect attempts is also
configured, after which recovery stops and the controller is removed
if the connection cannot be re-established.

The driver maintains a counter, nr_reconnects, which is incremented on
each reconnect attempt. However, if the reconnect succeeds, this
counter is reset to zero. Moreover, this counter is currently only
reported via kernel log messages and is not exposed to userspace.
Since dmesg is a circular buffer, this information may be lost over
time.

So introduce a new accumulator which accumulates the nr_reconnects
attempts, and expose this accumulator per fabrics controller via a new
sysfs attribute, reconnect_count, under the diag attribute group to
provide persistent visibility into the number of reconnect attempts made
by the host. This information can help users diagnose unstable links or
connectivity issues. Furthermore, this sysfs attribute is also writable,
so the user may reset it to zero if needed.

The reconnect_count can also be consumed by monitoring tools such as
nvme-top to improve controller-level observability.

Tested-by: Venkat Rao Bagalkote <venkat88@linux.ibm.com>
Signed-off-by: Nilay Shroff <nilay@linux.ibm.com>
Signed-off-by: Keith Busch <kbusch@kernel.org>
drivers/nvme/host/fc.c
drivers/nvme/host/nvme.h
drivers/nvme/host/rdma.c
drivers/nvme/host/sysfs.c
drivers/nvme/host/tcp.c

index e4f4528fe2a2d645040f7db277cdb5063acb07b1..f04eb13dd5e9a5efb281c915b833d5f2d7ab599e 100644 (file)
@@ -3148,6 +3148,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
                goto out_term_aen_ops;
        }
 
+       /* accumulate reconnect attempts before resetting it to zero */
+       atomic_long_add(ctrl->ctrl.nr_reconnects, &ctrl->ctrl.acc_reconnects);
        ctrl->ctrl.nr_reconnects = 0;
        nvme_start_ctrl(&ctrl->ctrl);
 
@@ -3470,6 +3472,7 @@ nvme_fc_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
        ctrl->ctrl.opts = opts;
        ctrl->ctrl.nr_reconnects = 0;
+       atomic_long_set(&ctrl->ctrl.acc_reconnects, 0);
        INIT_LIST_HEAD(&ctrl->ctrl_list);
        ctrl->lport = lport;
        ctrl->rport = rport;
index 81f297e995e4a5b0fcd62bcdb9e53cfe80b6fbed..b367c67dcb37f225d55d9192b94fe528d358952f 100644 (file)
@@ -458,6 +458,8 @@ struct nvme_ctrl {
        u16 icdoff;
        u16 maxcmd;
        int nr_reconnects;
+       /* accumulate reconnect attempts, as nr_reconnects can reset to zero */
+       atomic_long_t acc_reconnects;
        unsigned long flags;
        struct nvmf_ctrl_options *opts;
 
index bf73135c1439a3dafa6d65ff8715155eca336800..61a91cfb40626da51627a904d097c594d77b0bd5 100644 (file)
@@ -1110,6 +1110,8 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
        dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
                        ctrl->ctrl.nr_reconnects);
 
+       /* accumulate reconnect attempts before resetting it to zero */
+       atomic_long_add(ctrl->ctrl.nr_reconnects, &ctrl->ctrl.acc_reconnects);
        ctrl->ctrl.nr_reconnects = 0;
 
        return;
index ff603a9d7b8caa42830757359c71e440738a6671..933a5adfb7af5d5974da480ecd6313ce9647b310 100644 (file)
@@ -1175,17 +1175,52 @@ static ssize_t reset_count_store(struct device *dev,
        return count;
 }
 
+static ssize_t reconnect_count_show(struct device *dev,
+                  struct device_attribute *attr, char *buf)
+{
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+       return sysfs_emit(buf, "%lu\n",
+                         atomic_long_read(&ctrl->acc_reconnects) +
+                         ctrl->nr_reconnects);
+}
+
+static ssize_t reconnect_count_store(struct device *dev,
+               struct device_attribute *attr, const char *buf, size_t count)
+{
+       int err;
+       unsigned long reconnect_cnt;
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+       err = kstrtoul(buf, 0, &reconnect_cnt);
+       if (err)
+               return -EINVAL;
+
+       atomic_long_set(&ctrl->acc_reconnects, reconnect_cnt);
+
+       return count;
+}
+
+static DEVICE_ATTR_RW(reconnect_count);
+
 static DEVICE_ATTR_RW(reset_count);
 
 static struct attribute *nvme_dev_diag_attrs[] = {
        &dev_attr_adm_errors.attr,
        &dev_attr_reset_count.attr,
+       &dev_attr_reconnect_count.attr,
        NULL,
 };
 
 static umode_t nvme_dev_diag_attrs_are_visible(struct kobject *kobj,
                struct attribute *a, int n)
 {
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+
+       if (a == &dev_attr_reconnect_count.attr && !ctrl->opts)
+               return 0;
+
        return a->mode;
 }
 
index 9d17c88a620054077def63f2f6e9610a41957f6b..9b76b77ffdbbf9cd45f0200fd45c098d563c03a5 100644 (file)
@@ -2489,6 +2489,8 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
        dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n",
                 ctrl->nr_reconnects, ctrl->opts->max_reconnects);
 
+       /* accumulate reconnect attempts before resetting it to zero */
+       atomic_long_add(ctrl->nr_reconnects, &ctrl->acc_reconnects);
        ctrl->nr_reconnects = 0;
 
        return;