]> git.ipfire.org Git - thirdparty/collectd.git/commitdiff
Add NVMe attributes to SMART plugin
authorKotlowski, Bartlomiej <bartlomiej.kotlowski@intel.com>
Fri, 3 Jul 2020 10:15:59 +0000 (03:15 -0700)
committerPawel Zak <pawel.zak@intel.com>
Fri, 17 Jul 2020 12:08:47 +0000 (14:08 +0200)
Makefile.am
src/collectd.conf.pod
src/intel-nvme.h [new file with mode: 0644]
src/nvme.h [new file with mode: 0644]
src/smart.c
src/smart_test.c [new file with mode: 0644]
src/types.db

index cb8205fccb001184da9ce71b179ba1e144450810..8ed3f6139da3475a17a0968c2c2087015dc0baab 100644 (file)
@@ -1882,6 +1882,12 @@ smart_la_SOURCES = src/smart.c
 smart_la_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_LIBATASMART_CPPFLAGS) $(BUILD_WITH_LIBUDEV_CPPFLAGS)
 smart_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBATASMART_LDFLAGS) $(BUILD_WITH_LIBUDEV_LDFLAGS)
 smart_la_LIBADD = libignorelist.la $(BUILD_WITH_LIBATASMART_LIBS) $(BUILD_WITH_LIBUDEV_LIBS)
+
+test_plugin_smart_SOURCES = src/smart_test.c
+test_plugin_smart_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_LIBATASMART_CPPFLAGS) $(BUILD_WITH_LIBUDEV_CPPFLAGS) -fprofile-arcs -ftest-coverage -O0 -g
+test_plugin_smart_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBATASMART_LDFLAGS) $(BUILD_WITH_LIBUDEV_LDFLAGS) -fprofile-arcs
+test_plugin_smart_LDADD = libplugin_mock.la $(BUILD_WITH_LIBATASMART_LIBS) $(BUILD_WITH_LIBUDEV_LIBS)
+check_PROGRAMS += test_plugin_smart
 endif
 endif
 
index 3b2516ef6774d08e14496b8b957fd1aac7c494b0..2c8a79ef9c7e2c43e917afa671340afd18074cab 100644 (file)
@@ -8667,7 +8667,9 @@ The C<smart> plugin collects SMART information from physical
 disks. Values collectd include temperature, power cycle count, poweron
 time and bad sectors. Also, all SMART attributes are collected along
 with the normalized current value, the worst value, the threshold and
-a human readable value.
+a human readable value. The plugin can also collect SMART attributes
+for NVMe disks (as defined by the NVMe 1.4 specification) and the
+Additional SMART Attributes from Intel® NVMe disks.
 
 Using the following two options you can ignore some disks or configure the
 collection only of specific disks.
@@ -8683,6 +8685,7 @@ is interpreted as a regular expression. Examples:
 
   Disk "sdd"
   Disk "/hda[34]/"
+  Disk "nvme0n1"
 
 See F</"IGNORELISTS"> for details.
 
diff --git a/src/intel-nvme.h b/src/intel-nvme.h
new file mode 100644 (file)
index 0000000..27ed6cf
--- /dev/null
@@ -0,0 +1,74 @@
+/**
+ * collectd - src/intel-nvme.h
+ * MIT License
+ *
+ * Copyright (C) 2020  Intel Corporation. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *   Bartlomiej Kotlowski <bartlomiej.kotlowski@intel.com>
+ * **/
+
+#include <linux/types.h>
+
+#ifdef __CHECKER__
+#define __force __attribute__((force))
+#else
+#define __force
+#endif
+
+#define NVME_SMART_INTEL_CDW10 0x008000ca
+#define INTEL_VENDOR_ID 0x8086
+
+struct __attribute__((packed)) nvme_additional_smart_log_item { /* one packed, 12-byte attribute record */
+  __u8 key; /* presumably the vendor attribute identifier -- TODO confirm */
+  __u8 _kp[2]; /* never read by smart.c; presumably padding */
+  __u8 norm; /* submitted by smart.c with the type instance "norm" */
+  __u8 _np; /* never read by smart.c; presumably padding */
+  union __attribute__((packed)) { /* alternative views of the value bytes */
+    __u8 raw[6]; /* decoded by int48_to_double() in smart.c */
+    struct __attribute__((packed)) wear_level {
+      __le16 min;
+      __le16 max;
+      __le16 avg;
+    } wear_level; /* smart.c reads this view for wear_leveling_cnt only */
+    struct __attribute__((packed)) thermal_throttle {
+      __u8 pct;
+      __u32 count;
+    } thermal_throttle; /* smart.c reads this view for thermal_throttle_status only */
+  };
+  __u8 _rp; /* never read by smart.c; presumably padding */
+};
+
+struct nvme_additional_smart_log { /* buffer filled by Get Log Page with NVME_SMART_INTEL_CDW10: 13 consecutive records */
+  struct nvme_additional_smart_log_item program_fail_cnt;
+  struct nvme_additional_smart_log_item erase_fail_cnt;
+  struct nvme_additional_smart_log_item wear_leveling_cnt; /* read through the wear_level view */
+  struct nvme_additional_smart_log_item e2e_err_cnt;
+  struct nvme_additional_smart_log_item crc_err_cnt;
+  struct nvme_additional_smart_log_item timed_workload_media_wear;
+  struct nvme_additional_smart_log_item timed_workload_host_reads;
+  struct nvme_additional_smart_log_item timed_workload_timer;
+  struct nvme_additional_smart_log_item thermal_throttle_status; /* read through the thermal_throttle view */
+  struct nvme_additional_smart_log_item retry_buffer_overflow_cnt;
+  struct nvme_additional_smart_log_item pll_lock_loss_cnt;
+  struct nvme_additional_smart_log_item nand_bytes_written;
+  struct nvme_additional_smart_log_item host_bytes_written;
+};
diff --git a/src/nvme.h b/src/nvme.h
new file mode 100644 (file)
index 0000000..a25355a
--- /dev/null
@@ -0,0 +1,64 @@
+/**
+ * collectd - src/nvme.h
+ * MIT License
+ *
+ * Copyright (C) 2020  Intel Corporation. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *   Bartlomiej Kotlowski <bartlomiej.kotlowski@intel.com>
+ * **/
+
+#include <linux/types.h>
+
+#define NVME_NSID_ALL 0xffffffff
+#define NVME_ADMIN_GET_LOG_PAGE 0x02
+#define NVME_ADMIN_IDENTIFY 0x06
+
+union nvme_smart_log { /* 512-byte SMART log page: typed view (data) or plain bytes (raw) */
+  struct {
+    __u8 critical_warning;
+    __u8 temperature[2]; /* smart.c combines [1] << 8 with [0] and subtracts 273 */
+    __u8 avail_spare;
+    __u8 spare_thresh;
+    __u8 percent_used;
+    __u8 endu_grp_crit_warn_sumry;
+    __u8 rsvd1[25];
+    __u8 data_units_read[16]; /* the 16-byte fields are decoded by compute_field() in smart.c */
+    __u8 data_units_written[16];
+    __u8 host_commands_read[16];
+    __u8 host_commands_written[16];
+    __u8 ctrl_busy_time[16];
+    __u8 power_cycles[16];
+    __u8 power_on_hours[16];
+    __u8 unsafe_shutdowns[16];
+    __u8 media_errors[16];
+    __u8 num_err_log_entries[16];
+    __le32 warning_temp_time;
+    __le32 critical_comp_time;
+    __le16 temp_sensor[8]; /* smart.c subtracts 273 from each entry before submitting */
+    __le32 thm_temp1_trans_count;
+    __le32 thm_temp2_trans_count;
+    __le32 thm_temp1_total_time;
+    __le32 thm_temp2_total_time;
+    __u8 rsvd2[280];
+  } data; /* member sizes add up to 512 bytes, the size of raw */
+  __u8 raw[512]; /* byte view, indexed with offsetof() values by smart_nvme_submit_16b() */
+};
index 627c16d5537378dbdc5a9a393dca74f62c974549..c85bafa3c275a4c84225972708ff9f3ccdceddc1 100644 (file)
@@ -22,6 +22,8 @@
  *
  * Authors:
  *   Vincent Bernat <vbe at exoscale.ch>
+ *   Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+ *   Bartlomiej Kotlowski <bartlomiej.kotlowski@intel.com>
  **/
 
 #include "collectd.h"
 
 #include <atasmart.h>
 #include <libudev.h>
+#include <sys/ioctl.h>
+
+#include "intel-nvme.h"
+#include "nvme.h"
 
 #ifdef HAVE_SYS_CAPABILITY_H
 #include <sys/capability.h>
 #endif
 
+#define O_RDWR 02 /* NOTE(review): hard-codes the open(2) flag instead of using <fcntl.h> -- confirm this is intended */
+#define NVME_SMART_CDW10 0x00800002 /* Get Log Page CDW10: log id 0x02 in bits 7:0, 0x80 dwords in bits 27:16 */
+#define SHIFT_BYTE_LEFT 256 /* multiplying by 256 moves a value up by one byte */
+struct nvme_admin_cmd { /* argument block handed to the NVME_IOCTL_ADMIN_CMD ioctl; 72 bytes */
+  __u8 opcode; /* NVME_ADMIN_IDENTIFY or NVME_ADMIN_GET_LOG_PAGE in this plugin */
+  __u8 rsvd1[3]; /* never set by this plugin */
+  __u32 nsid; /* always NVME_NSID_ALL in this plugin */
+  __u8 rsvd2[16]; /* never set by this plugin */
+  __u64 addr; /* address of the buffer that receives the response data */
+  __u8 rsvd3[4]; /* never set by this plugin */
+  __u32 data_len; /* size of the buffer behind addr, in bytes */
+  __u32 cdw10; /* command specific dword 10 */
+  __u32 cdw11; /* command specific dword 11 */
+  __u8 rsvd4[24]; /* never set by this plugin */
+};
+
+#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) /* ioctl request number built from the struct size */
+
 static const char *config_keys[] = {"Disk", "IgnoreSelected", "IgnoreSleepMode",
                                     "UseSerial"};
 
 static int config_keys_num = STATIC_ARRAY_SIZE(config_keys);
 
-static ignorelist_t *ignorelist;
+static ignorelist_t *ignorelist, *ignorelist_by_serial;
 static int ignore_sleep_mode;
 static int use_serial;
+static int invert_ignorelist;
 
 static int smart_config(const char *key, const char *value) {
   if (ignorelist == NULL)
@@ -52,17 +77,17 @@ static int smart_config(const char *key, const char *value) {
   if (ignorelist == NULL)
     return 1;
 
-  if (strcasecmp("Disk", key) == 0) {
+  if (strncasecmp("Disk", key, 5) == 0) {
     ignorelist_add(ignorelist, value);
-  } else if (strcasecmp("IgnoreSelected", key) == 0) {
-    int invert = 1;
+  } else if (strncasecmp("IgnoreSelected", key, 15) == 0) {
+    invert_ignorelist = 1;
     if (IS_TRUE(value))
-      invert = 0;
-    ignorelist_set_invert(ignorelist, invert);
-  } else if (strcasecmp("IgnoreSleepMode", key) == 0) {
+      invert_ignorelist = 0;
+    ignorelist_set_invert(ignorelist, invert_ignorelist);
+  } else if (strncasecmp("IgnoreSleepMode", key, 16) == 0) {
     if (IS_TRUE(value))
       ignore_sleep_mode = 1;
-  } else if (strcasecmp("UseSerial", key) == 0) {
+  } else if (strncasecmp("UseSerial", key, 10) == 0) {
     if (IS_TRUE(value))
       use_serial = 1;
   } else {
@@ -72,6 +97,64 @@ static int smart_config(const char *key, const char *value) {
   return 0;
 } /* int smart_config */
 
+/**
+ * Builds ignorelist_by_serial from the name based ignorelist.
+ *
+ * All block devices of type "disk" are enumerated through udev. For every
+ * device whose node name (the part after the last '/') yields 0 from
+ * ignorelist_match() and that has an ID_SERIAL_SHORT property, that serial is
+ * added to ignorelist_by_serial.
+ *
+ * When invert_ignorelist is 0 the name based list is switched to invert = 1
+ * before matching and stays that way on return (unchanged behaviour; the
+ * original issued the same call a second time at the end).
+ *
+ * Every udev object acquired here is released again on all paths, and devices
+ * without a udev object or device node are skipped instead of being passed
+ * to strrchr() as NULL.
+ *
+ * @param il unused; the file-scope ignorelist is consulted instead.
+ * @return 0 on success, 1 if the list could not be created, udev could not
+ *         be initialised or the enumeration returned no devices.
+ */
+static int create_ignorelist_by_serial(ignorelist_t *il) {
+  struct udev *handle_udev = NULL;
+  struct udev_enumerate *enumerate = NULL;
+  struct udev_list_entry *devices, *dev_list_entry;
+  int status = 1;
+
+  if (ignorelist_by_serial == NULL)
+    ignorelist_by_serial = ignorelist_create(invert_ignorelist);
+  if (ignorelist_by_serial == NULL)
+    return 1;
+
+  if (invert_ignorelist == 0) {
+    ignorelist_set_invert(ignorelist, 1);
+  }
+
+  // Use udev to get a list of disks
+  handle_udev = udev_new();
+  if (!handle_udev) {
+    ERROR("smart plugin: unable to initialize udev.");
+    goto cleanup;
+  }
+  enumerate = udev_enumerate_new(handle_udev);
+  if (enumerate == NULL) {
+    ERROR("fail udev_enumerate_new");
+    goto cleanup;
+  }
+  udev_enumerate_add_match_subsystem(enumerate, "block");
+  udev_enumerate_add_match_property(enumerate, "DEVTYPE", "disk");
+  udev_enumerate_scan_devices(enumerate);
+  devices = udev_enumerate_get_list_entry(enumerate);
+  if (devices == NULL) {
+    ERROR("udev returned an empty list deviecs");
+    goto cleanup;
+  }
+  udev_list_entry_foreach(dev_list_entry, devices) {
+    const char *path = udev_list_entry_get_name(dev_list_entry);
+    struct udev_device *dev = udev_device_new_from_syspath(handle_udev, path);
+    if (dev == NULL)
+      continue;
+
+    const char *devpath = udev_device_get_devnode(dev);
+    const char *serial = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
+    const char *name = (devpath != NULL) ? strrchr(devpath, '/') : NULL;
+
+    /* name points at the last '/', so the device name starts one past it */
+    if (name != NULL && serial != NULL &&
+        ignorelist_match(ignorelist, name + 1) == 0) {
+      ignorelist_add(ignorelist_by_serial, serial);
+    }
+
+    /* serial and devpath belong to dev, so release it only after use */
+    udev_device_unref(dev);
+  }
+  status = 0;
+
+cleanup:
+  if (enumerate != NULL)
+    udev_enumerate_unref(enumerate);
+  if (handle_udev != NULL)
+    udev_unref(handle_udev);
+  return status;
+}
+
 static void smart_submit(const char *dev, const char *type,
                          const char *type_inst, double value) {
   value_list_t vl = VALUE_LIST_INIT;
@@ -82,7 +165,6 @@ static void smart_submit(const char *dev, const char *type,
   sstrncpy(vl.plugin_instance, dev, sizeof(vl.plugin_instance));
   sstrncpy(vl.type, type, sizeof(vl.type));
   sstrncpy(vl.type_instance, type_inst, sizeof(vl.type_instance));
-
   plugin_dispatch_values(&vl);
 }
 
@@ -123,7 +205,278 @@ static void handle_attribute(SkDisk *d, const SkSmartAttributeParsedData *a,
   }
 }
 
-static void smart_read_disk(SkDisk *d, char const *name) {
+/**
+ * Converts a 16-byte unsigned integer, stored least significant byte first,
+ * into a double.
+ *
+ * The bytes are folded from index 15 down to index 0; each step moves the
+ * accumulated value up by one byte and adds the next lower byte.
+ */
+static inline double compute_field(__u8 *data) {
+  double value = 0;
+
+  for (int idx = 15; idx >= 0; idx--) {
+    value = value * SHIFT_BYTE_LEFT + data[idx];
+  }
+
+  return value;
+}
+
+/**
+ * Converts a 6-byte (48-bit) unsigned integer, stored least significant byte
+ * first, into a double.
+ *
+ * The bytes are folded from index 5 down to index 0; each step moves the
+ * accumulated value up by one byte and adds the next lower byte.
+ */
+static inline double int48_to_double(__u8 *data) {
+  double value = 0;
+
+  for (int idx = 5; idx >= 0; idx--) {
+    value = value * SHIFT_BYTE_LEFT + data[idx];
+  }
+
+  return value;
+}
+
+/**
+ * Several NVMe SMART log fields are 16 bytes long; they need to be
+ * converted into a single double value before they can be dispatched.
+ * NVME_METRIC_16B(metric) expands to one table entry for such a field: the
+ * "nvme_<metric>" label, the byte offset of data.<metric> inside
+ * union nvme_smart_log and an empty type instance.
+ */
+#define NVME_METRIC_16B(metric)                                                \
+  { "nvme_" #metric, offsetof(union nvme_smart_log, data.metric), "" }
+
+/* Converts a little-endian 16-bit value to host byte order via le16toh(). */
+static inline uint16_t le16_to_cpu(__le16 x) {
+  const __u16 wire = (__force __u16)x;
+
+  return le16toh(wire);
+}
+
+/* Describes one 16-byte SMART log field: the type name it is submitted
+ * under, its byte offset inside union nvme_smart_log and its type instance. */
+struct nvme_metric_16b {
+  char *label;
+  unsigned int offset;
+  char *type_inst;
+};
+
+/* Every 16-byte field dispatched by smart_nvme_submit_16b(). */
+struct nvme_metric_16b nvme_metrics[] = {
+    NVME_METRIC_16B(data_units_read),
+    NVME_METRIC_16B(data_units_written),
+    NVME_METRIC_16B(host_commands_read),
+    NVME_METRIC_16B(host_commands_written),
+    NVME_METRIC_16B(ctrl_busy_time),
+    NVME_METRIC_16B(power_cycles),
+    NVME_METRIC_16B(power_on_hours),
+    NVME_METRIC_16B(unsafe_shutdowns),
+    NVME_METRIC_16B(media_errors),
+    NVME_METRIC_16B(num_err_log_entries),
+};
+
+/**
+ * Dispatches every 16-byte field listed in nvme_metrics.
+ *
+ * @param name plugin instance the values are submitted under.
+ * @param raw  byte view of the SMART log page; each entry's offset indexes
+ *             into it.
+ */
+static void smart_nvme_submit_16b(char const *name, __u8 *raw) {
+  for (size_t idx = 0; idx < STATIC_ARRAY_SIZE(nvme_metrics); idx++) {
+    const struct nvme_metric_16b *metric = &nvme_metrics[idx];
+    double value = compute_field(&raw[metric->offset]);
+
+    DEBUG("%s : %f", metric->label, value);
+    smart_submit(name, metric->label, metric->type_inst, value);
+  }
+}
+
+/**
+ * Reads the PCI vendor id of an NVMe device with an Identify admin command
+ * (cdw10 = 1).
+ *
+ * The Identify Controller data structure is 4096 bytes long and starts with
+ * the little-endian vendor id, so the command is given a full-size,
+ * zero-initialised buffer instead of a bare 2-byte variable. This keeps the
+ * device from being handed a buffer shorter than the data it returns, and it
+ * means a transfer that fills nothing yields 0 rather than stack garbage.
+ *
+ * @param dev  path of the device node to open.
+ * @param name unused.
+ * @return the vendor id in host byte order, or a negative value if open() or
+ *         ioctl() failed.
+ */
+static int get_vendor_id(const char *dev, char const *name) {
+  union {
+    __le16 vid; /* bytes 1:0 of the Identify Controller data */
+    __u8 raw[4096];
+  } id_ctrl = {0};
+  int fd, err;
+
+  fd = open(dev, O_RDWR);
+  if (fd < 0) {
+    ERROR("open failed with %s\n", strerror(errno));
+    return fd;
+  }
+
+  err = ioctl(fd, NVME_IOCTL_ADMIN_CMD,
+              &(struct nvme_admin_cmd){.opcode = NVME_ADMIN_IDENTIFY,
+                                       .nsid = NVME_NSID_ALL,
+                                       .addr = (unsigned long)&id_ctrl,
+                                       .data_len = sizeof(id_ctrl),
+                                       .cdw10 = 1,
+                                       .cdw11 = 0});
+
+  if (err < 0) {
+    /* log before close() so that errno still belongs to the ioctl */
+    ERROR("ioctl for NVME_IOCTL_ADMIN_CMD failed with %s\n", strerror(errno));
+    close(fd);
+    return err;
+  }
+
+  close(fd);
+  return (int)le16_to_cpu(id_ctrl.vid);
+}
+
+/**
+ * Reads the standard NVMe SMART log page of a device and dispatches its
+ * fields.
+ *
+ * The multi-byte fields are declared little-endian (__le16 / __le32) in
+ * union nvme_smart_log, so they are converted to host byte order before being
+ * turned into doubles. On little-endian hosts the submitted values are
+ * unchanged; on big-endian hosts they are no longer byte-swapped.
+ *
+ * @param dev  path of the device node to open.
+ * @param name plugin instance the values are submitted under.
+ * @return 0 on success, or the negative result of open() / ioctl().
+ */
+static int smart_read_nvme_disk(const char *dev, char const *name) {
+  static const char *const sensor_inst[] = {"sensor_1", "sensor_2", "sensor_3",
+                                            "sensor_4", "sensor_5", "sensor_6",
+                                            "sensor_7", "sensor_8"};
+  union nvme_smart_log smart_log = {};
+  int fd, status;
+
+  fd = open(dev, O_RDWR);
+  if (fd < 0) {
+    ERROR("open failed with %s\n", strerror(errno));
+    return fd;
+  }
+
+  /**
+   * Prepare Get Log Page command
+   * Fill following fields (see NVMe 1.4 spec, section 5.14.1)
+   * - Number of DWORDS (bits 27:16) - the struct that will be passed for
+   *   filling has 512 bytes which gives 128 (0x80) DWORDS
+   * - Log Page Identifier (bits 7:0) - for SMART the id is 0x02
+   */
+
+  status = ioctl(fd, NVME_IOCTL_ADMIN_CMD,
+                 &(struct nvme_admin_cmd){.opcode = NVME_ADMIN_GET_LOG_PAGE,
+                                          .nsid = NVME_NSID_ALL,
+                                          .addr = (unsigned long)&smart_log,
+                                          .data_len = sizeof(smart_log),
+                                          .cdw10 = NVME_SMART_CDW10});
+  if (status < 0) {
+    /* log before close() so that errno still belongs to the ioctl */
+    ERROR("ioctl for NVME_IOCTL_ADMIN_CMD failed with %s\n", strerror(errno));
+    close(fd);
+    return status;
+  }
+  close(fd);
+
+  smart_submit(name, "nvme_critical_warning", "",
+               (double)smart_log.data.critical_warning);
+  /* two bytes, least significant first; 273 is subtracted as before */
+  smart_submit(name, "nvme_temperature", "",
+               ((double)(smart_log.data.temperature[1] << 8) +
+                smart_log.data.temperature[0] - 273));
+  smart_submit(name, "nvme_avail_spare", "",
+               (double)smart_log.data.avail_spare);
+  smart_submit(name, "nvme_avail_spare_thresh", "",
+               (double)smart_log.data.spare_thresh);
+  smart_submit(name, "nvme_percent_used", "",
+               (double)smart_log.data.percent_used);
+  smart_submit(name, "nvme_endu_grp_crit_warn_sumry", "",
+               (double)smart_log.data.endu_grp_crit_warn_sumry);
+  smart_submit(
+      name, "nvme_warning_temp_time", "",
+      (double)le32toh((__force __u32)smart_log.data.warning_temp_time));
+  smart_submit(
+      name, "nvme_critical_comp_time", "",
+      (double)le32toh((__force __u32)smart_log.data.critical_comp_time));
+
+  for (size_t i = 0; i < STATIC_ARRAY_SIZE(sensor_inst); i++) {
+    smart_submit(name, "nvme_temp_sensor", sensor_inst[i],
+                 (double)le16_to_cpu(smart_log.data.temp_sensor[i]) - 273);
+  }
+
+  smart_submit(
+      name, "nvme_thermal_mgmt_temp1_transition_count", "",
+      (double)le32toh((__force __u32)smart_log.data.thm_temp1_trans_count));
+  smart_submit(
+      name, "nvme_thermal_mgmt_temp1_total_time", "",
+      (double)le32toh((__force __u32)smart_log.data.thm_temp1_total_time));
+  smart_submit(
+      name, "nvme_thermal_mgmt_temp2_transition_count", "",
+      (double)le32toh((__force __u32)smart_log.data.thm_temp2_trans_count));
+  smart_submit(
+      name, "nvme_thermal_mgmt_temp2_total_time", "",
+      (double)le32toh((__force __u32)smart_log.data.thm_temp2_total_time));
+  smart_nvme_submit_16b(name, smart_log.raw);
+
+  return 0;
+}
+
+/**
+ * Reads the Intel "Additional SMART Attributes" log page (log identifier
+ * CAh) of an NVMe device and dispatches its fields.
+ *
+ * Fixes two copy-and-paste errors: the raw erase-fail count was taken from
+ * program_fail_cnt, and both nand_bytes_written values were taken from
+ * host_bytes_written. The log buffer is now zero-initialised so that a
+ * transfer that fills nothing cannot leak stack contents into the metrics.
+ *
+ * @param dev  path of the device node to open.
+ * @param name plugin instance the values are submitted under.
+ * @return 0 on success, or the negative result of open() / ioctl().
+ */
+static int smart_read_nvme_intel_disk(const char *dev, char const *name) {
+
+  DEBUG("name = %s", name);
+  DEBUG("dev = %s", dev);
+
+  struct nvme_additional_smart_log intel_smart_log = {};
+  int fd, status;
+  fd = open(dev, O_RDWR);
+  if (fd < 0) {
+    ERROR("open failed with %s\n", strerror(errno));
+    return fd;
+  }
+
+  /**
+   * Prepare Get Log Page command
+   * - Additional SMART Attributes (Log Identifier CAh)
+   */
+
+  status =
+      ioctl(fd, NVME_IOCTL_ADMIN_CMD,
+            &(struct nvme_admin_cmd){.opcode = NVME_ADMIN_GET_LOG_PAGE,
+                                     .nsid = NVME_NSID_ALL,
+                                     .addr = (unsigned long)&intel_smart_log,
+                                     .data_len = sizeof(intel_smart_log),
+                                     .cdw10 = NVME_SMART_INTEL_CDW10});
+  if (status < 0) {
+    /* log before close() so that errno still belongs to the ioctl */
+    ERROR("ioctl for NVME_IOCTL_ADMIN_CMD failed with %s\n", strerror(errno));
+    close(fd);
+    return status;
+  }
+  close(fd);
+
+  smart_submit(name, "nvme_program_fail_count", "norm",
+               (double)intel_smart_log.program_fail_cnt.norm);
+  smart_submit(name, "nvme_program_fail_count", "raw",
+               int48_to_double(intel_smart_log.program_fail_cnt.raw));
+  smart_submit(name, "nvme_erase_fail_count", "norm",
+               (double)intel_smart_log.erase_fail_cnt.norm);
+  smart_submit(name, "nvme_erase_fail_count", "raw",
+               int48_to_double(intel_smart_log.erase_fail_cnt.raw));
+  smart_submit(name, "nvme_wear_leveling", "norm",
+               (double)intel_smart_log.wear_leveling_cnt.norm);
+  smart_submit(
+      name, "nvme_wear_leveling", "min",
+      (double)le16_to_cpu(intel_smart_log.wear_leveling_cnt.wear_level.min));
+  smart_submit(
+      name, "nvme_wear_leveling", "max",
+      (double)le16_to_cpu(intel_smart_log.wear_leveling_cnt.wear_level.max));
+  smart_submit(
+      name, "nvme_wear_leveling", "avg",
+      (double)le16_to_cpu(intel_smart_log.wear_leveling_cnt.wear_level.avg));
+  smart_submit(name, "nvme_end_to_end_error_detection_count", "norm",
+               (double)intel_smart_log.e2e_err_cnt.norm);
+  smart_submit(name, "nvme_end_to_end_error_detection_count", "raw",
+               int48_to_double(intel_smart_log.e2e_err_cnt.raw));
+  smart_submit(name, "nvme_crc_error_count", "norm",
+               (double)intel_smart_log.crc_err_cnt.norm);
+  smart_submit(name, "nvme_crc_error_count", "raw",
+               int48_to_double(intel_smart_log.crc_err_cnt.raw));
+  smart_submit(name, "nvme_timed_workload_media_wear", "norm",
+               (double)intel_smart_log.timed_workload_media_wear.norm);
+  smart_submit(name, "nvme_timed_workload_media_wear", "raw",
+               int48_to_double(intel_smart_log.timed_workload_media_wear.raw));
+  smart_submit(name, "nvme_timed_workload_host_reads", "norm",
+               (double)intel_smart_log.timed_workload_host_reads.norm);
+  smart_submit(name, "nvme_timed_workload_host_reads", "raw",
+               int48_to_double(intel_smart_log.timed_workload_host_reads.raw));
+  smart_submit(name, "nvme_timed_workload_timer", "norm",
+               (double)intel_smart_log.timed_workload_timer.norm);
+  smart_submit(name, "nvme_timed_workload_timer", "raw",
+               int48_to_double(intel_smart_log.timed_workload_timer.raw));
+  smart_submit(name, "nvme_thermal_throttle_status", "norm",
+               (double)intel_smart_log.thermal_throttle_status.norm);
+  smart_submit(
+      name, "nvme_thermal_throttle_status", "pct",
+      (double)intel_smart_log.thermal_throttle_status.thermal_throttle.pct);
+  smart_submit(
+      name, "nvme_thermal_throttle_status", "count",
+      (double)intel_smart_log.thermal_throttle_status.thermal_throttle.count);
+  smart_submit(name, "nvme_retry_buffer_overflow_count", "norm",
+               (double)intel_smart_log.retry_buffer_overflow_cnt.norm);
+  smart_submit(name, "nvme_retry_buffer_overflow_count", "raw",
+               int48_to_double(intel_smart_log.retry_buffer_overflow_cnt.raw));
+  smart_submit(name, "nvme_pll_lock_loss_count", "norm",
+               (double)intel_smart_log.pll_lock_loss_cnt.norm);
+  smart_submit(name, "nvme_pll_lock_loss_count", "raw",
+               int48_to_double(intel_smart_log.pll_lock_loss_cnt.raw));
+  smart_submit(name, "nvme_nand_bytes_written", "norm",
+               (double)intel_smart_log.nand_bytes_written.norm);
+  smart_submit(name, "nvme_nand_bytes_written", "raw",
+               int48_to_double(intel_smart_log.nand_bytes_written.raw));
+  smart_submit(name, "nvme_host_bytes_written", "norm",
+               (double)intel_smart_log.host_bytes_written.norm);
+  smart_submit(name, "nvme_host_bytes_written", "raw",
+               int48_to_double(intel_smart_log.host_bytes_written.raw));
+
+  return 0;
+}
+
+static void smart_read_sata_disk(SkDisk *d, char const *name) {
   SkBool available = FALSE;
   if (sk_disk_identify_is_available(d, &available) < 0 || !available) {
     DEBUG("smart plugin: disk %s cannot be identified.", name);
@@ -183,6 +536,7 @@ static void smart_read_disk(SkDisk *d, char const *name) {
 static void smart_handle_disk(const char *dev, const char *serial) {
   SkDisk *d = NULL;
   const char *name;
+  int err;
 
   if (use_serial && serial) {
     name = serial;
@@ -192,19 +546,49 @@ static void smart_handle_disk(const char *dev, const char *serial) {
       return;
     name++;
   }
-  if (ignorelist_match(ignorelist, name) != 0) {
-    DEBUG("smart plugin: ignoring %s.", dev);
-    return;
+
+  if (use_serial) {
+    if (ignorelist_match(ignorelist_by_serial, name) != 0) {
+      DEBUG("smart plugin: ignoring %s. Name = %s", dev, name);
+      return;
+    }
+  } else {
+    if (ignorelist_match(ignorelist, name) != 0) {
+      DEBUG("smart plugin: ignoring %s. Name = %s", dev, name);
+      return;
+    }
   }
 
   DEBUG("smart plugin: checking SMART status of %s.", dev);
-  if (sk_disk_open(dev, &d) < 0) {
-    ERROR("smart plugin: unable to open %s.", dev);
-    return;
-  }
 
-  smart_read_disk(d, name);
-  sk_disk_free(d);
+  if (strstr(dev, "nvme")) {
+    err = smart_read_nvme_disk(dev, name);
+    if (err) {
+      ERROR("smart plugin: smart_read_nvme_disk failed, %d", err);
+    } else {
+      switch (get_vendor_id(dev, name)) {
+      case INTEL_VENDOR_ID:
+        err = smart_read_nvme_intel_disk(dev, name);
+        if (err) {
+          ERROR("smart plugin: smart_read_nvme_intel_disk failed, %d", err);
+        }
+        break;
+
+      default:
+        DEBUG("No support vendor specific attributes");
+        break;
+      }
+    }
+
+  } else {
+
+    if (sk_disk_open(dev, &d) < 0) {
+      ERROR("smart plugin: unable to open %s.", dev);
+      return;
+    }
+    smart_read_sata_disk(d, name);
+    sk_disk_free(d);
+  }
 }
 
 static int smart_read(void) {
@@ -220,16 +604,24 @@ static int smart_read(void) {
     return -1;
   }
   enumerate = udev_enumerate_new(handle_udev);
+  if (enumerate == NULL) {
+    ERROR("fail udev_enumerate_new");
+    return -1;
+  }
   udev_enumerate_add_match_subsystem(enumerate, "block");
   udev_enumerate_add_match_property(enumerate, "DEVTYPE", "disk");
   udev_enumerate_scan_devices(enumerate);
   devices = udev_enumerate_get_list_entry(enumerate);
+  if (devices == NULL) {
+    ERROR("udev returned an empty list deviecs");
+    return -1;
+  }
   udev_list_entry_foreach(dev_list_entry, devices) {
     const char *path, *devpath, *serial;
     path = udev_list_entry_get_name(dev_list_entry);
     dev = udev_device_new_from_syspath(handle_udev, path);
     devpath = udev_device_get_devnode(dev);
-    serial = udev_device_get_property_value(dev, "ID_SERIAL");
+    serial = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
 
     /* Query status with libatasmart */
     smart_handle_disk(devpath, serial);
@@ -243,6 +635,15 @@ static int smart_read(void) {
 } /* int smart_read */
 
 static int smart_init(void) {
+  int err;
+  if (use_serial) {
+    err = create_ignorelist_by_serial(ignorelist);
+    if (err != 0) {
+      ERROR("Enable to create ignorelist_by_serial");
+      return 1;
+    }
+  }
+
 #if defined(HAVE_SYS_CAPABILITY_H) && defined(CAP_SYS_RAWIO)
   if (check_capability(CAP_SYS_RAWIO) != 0) {
     if (getuid() == 0)
diff --git a/src/smart_test.c b/src/smart_test.c
new file mode 100644 (file)
index 0000000..4963b51
--- /dev/null
@@ -0,0 +1,108 @@
+/**
+ * collectd - src/smart_test.c
+ * MIT License
+ *
+ * Copyright (C) 2020  Intel Corporation. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *   Bartlomiej Kotlowski <bartlomiej.kotlowski@intel.com>
+ **/
+
+#include "smart.c"
+#include "testing.h"
+
+int VENDOR_ID = 0x8086; /* vendor id that the mocked Identify command reports */
+char *CORRECT_DEV_PATH = "/dev/nvme0n1"; /* the only path the mocked open() accepts */
+
+/**
+ * Test double for ioctl(): answers the NVMe admin commands issued by smart.c.
+ *
+ * - NVME_ADMIN_IDENTIFY: stores VENDOR_ID at the address given in the command.
+ * - NVME_ADMIN_GET_LOG_PAGE: fills one field of the Intel log (cdw10 ==
+ *   NVME_SMART_INTEL_CDW10) or of the standard SMART log (cdw10 ==
+ *   NVME_SMART_CDW10).
+ *
+ * Returns 0 for the commands above and -1 for anything else. The va_list is
+ * now closed with va_end(), and the stray ';' after the function body is
+ * gone.
+ */
+int ioctl(int __fd, unsigned long int __request, ...) {
+  va_list valist;
+  va_start(valist, __request);
+  struct nvme_admin_cmd *admin_cmd = va_arg(valist, struct nvme_admin_cmd *);
+  va_end(valist);
+
+  if (admin_cmd->opcode == NVME_ADMIN_IDENTIFY) {
+    // the ioctl asks for the vendor id
+    __le16 *vid = (__le16 *)admin_cmd->addr;
+    *vid = VENDOR_ID;
+    return 0;
+  } else if (admin_cmd->opcode == NVME_ADMIN_GET_LOG_PAGE) {
+    // the ioctl asks for SMART attributes
+    if (admin_cmd->cdw10 == NVME_SMART_INTEL_CDW10) {
+      // set Intel specific attributes
+      struct nvme_additional_smart_log *intel_smart_log =
+          (struct nvme_additional_smart_log *)admin_cmd->addr;
+      intel_smart_log->program_fail_cnt.norm = 100;
+      return 0;
+    } else if (admin_cmd->cdw10 == NVME_SMART_CDW10) {
+      // set generic SMART attributes
+      union nvme_smart_log *smart_log = (union nvme_smart_log *)admin_cmd->addr;
+      smart_log->data.critical_warning = 0;
+      return 0;
+    }
+    return -1; // no mock func
+  }
+  return -1; // no mock func
+}
+
+/**
+ * Test double for open(): succeeds (returns 0) only for CORRECT_DEV_PATH and
+ * fails with -1 for every other path.
+ *
+ * Paths are compared by content with strcmp() rather than by pointer
+ * identity, so the result no longer depends on whether the compiler merges
+ * identical string literals.
+ */
+int open(const char *__path, int __oflag, ...) {
+  if (__path != NULL && strcmp(__path, CORRECT_DEV_PATH) == 0) {
+    return 0;
+  }
+  return -1;
+}
+
+DEF_TEST(x) { /* exercises the NVMe helpers of smart.c against the open()/ioctl() mocks */
+  int ret;
+
+  ret = get_vendor_id(CORRECT_DEV_PATH, "stub");
+  EXPECT_EQ_INT(VENDOR_ID, ret);
+
+  VENDOR_ID = 0x144D; /* a second vendor id must be reported unchanged */
+  ret = get_vendor_id(CORRECT_DEV_PATH, "stub");
+  EXPECT_EQ_INT(VENDOR_ID, ret);
+  VENDOR_ID = 0x8086; /* restore the default for the remaining checks */
+
+  // path rejected by the open() mock: the error must be propagated
+  ret = get_vendor_id("dev/nvme0nXX", "stub");
+  EXPECT_EQ_INT(-1, ret);
+
+  ret = smart_read_nvme_intel_disk(CORRECT_DEV_PATH, "stub");
+  EXPECT_EQ_INT(0, ret);
+
+  // path rejected by the open() mock: the error must be propagated
+  ret = smart_read_nvme_intel_disk("dev/nvme0nXX", "stub");
+  EXPECT_EQ_INT(-1, ret);
+
+  CORRECT_DEV_PATH = "dev/sda0"; /* from here on the open() mock accepts this path instead */
+
+  ret = smart_read_nvme_disk(CORRECT_DEV_PATH, "stub");
+  EXPECT_EQ_INT(0, ret);
+
+  // path rejected by the open() mock: the error must be propagated
+  ret = smart_read_nvme_disk("/dev/sdaXX", "stub");
+  EXPECT_EQ_INT(-1, ret);
+
+  return 0;
+}
+
+int main(void) { /* test entry point: runs the single test case defined above */
+  RUN_TEST(x);
+  END_TEST;
+}
index c41c22e63a1ce8193095ff983de9e6604580ee40..8ec1f6eeb01ad75f8a56c32fb0d5b7f16538e308 100644 (file)
@@ -255,6 +255,42 @@ smart_badsectors        value:GAUGE:0:U
 smart_powercycles       value:GAUGE:0:U
 smart_poweron           value:GAUGE:0:U
 smart_temperature       value:GAUGE:-300:300
+nvme_critical_warning                       value:GAUGE:0:31
+nvme_temperature                            value:GAUGE:-300:300
+nvme_avail_spare                            value:GAUGE:0:100
+nvme_avail_spare_thresh                     value:GAUGE:0:100
+nvme_percent_used                           value:GAUGE:0:255
+nvme_endu_grp_crit_warn_sumry               value:GAUGE:0:15
+nvme_data_units_read                        value:GAUGE:0:U
+nvme_data_units_written                     value:GAUGE:0:U
+nvme_host_commands_read                     value:GAUGE:0:U
+nvme_host_commands_written                  value:GAUGE:0:U
+nvme_ctrl_busy_time                         value:GAUGE:0:U
+nvme_power_cycles                           value:GAUGE:0:U
+nvme_power_on_hours                         value:GAUGE:0:U
+nvme_unsafe_shutdowns                       value:GAUGE:0:U
+nvme_media_errors                           value:GAUGE:0:U
+nvme_num_err_log_entries                    value:GAUGE:0:U
+nvme_warning_temp_time                      value:GAUGE:0:U
+nvme_critical_comp_time                     value:GAUGE:0:U
+nvme_temp_sensor                            value:GAUGE:-300:300
+nvme_thermal_mgmt_temp1_transition_count    value:GAUGE:0:U
+nvme_thermal_mgmt_temp1_total_time          value:GAUGE:0:U
+nvme_thermal_mgmt_temp2_transition_count    value:GAUGE:0:U
+nvme_thermal_mgmt_temp2_total_time          value:GAUGE:0:U
+nvme_program_fail_count                     value:GAUGE:0:U
+nvme_erase_fail_count                       value:GAUGE:0:U
+nvme_wear_leveling                          value:GAUGE:0:U
+nvme_end_to_end_error_detection_count       value:GAUGE:0:U
+nvme_crc_error_count                        value:GAUGE:0:U
+nvme_timed_workload_timer                   value:GAUGE:0:U
+nvme_timed_workload_media_wear              value:GAUGE:0:U
+nvme_timed_workload_host_reads              value:GAUGE:0:100
+nvme_thermal_throttle_status                value:GAUGE:0:U
+nvme_retry_buffer_overflow_count            value:GAUGE:0:U
+nvme_pll_lock_loss_count                    value:GAUGE:0:U
+nvme_nand_bytes_written                     value:GAUGE:0:U
+nvme_host_bytes_written                     value:GAUGE:0:U
 snr                     value:GAUGE:0:U
 spam_check              value:GAUGE:0:U
 spam_score              value:GAUGE:U:U