From 741311d7e3f66e429d4f6955b30d4bf2aa8303a8 Mon Sep 17 00:00:00 2001 From: "Kotlowski, Bartlomiej" Date: Fri, 3 Jul 2020 03:15:59 -0700 Subject: [PATCH] Add NVMe attributies to SMART plugin --- Makefile.am | 6 + src/collectd.conf.pod | 5 +- src/intel-nvme.h | 74 +++++++ src/nvme.h | 64 ++++++ src/smart.c | 441 ++++++++++++++++++++++++++++++++++++++++-- src/smart_test.c | 108 +++++++++++ src/types.db | 36 ++++ 7 files changed, 713 insertions(+), 21 deletions(-) create mode 100644 src/intel-nvme.h create mode 100644 src/nvme.h create mode 100644 src/smart_test.c diff --git a/Makefile.am b/Makefile.am index cb8205fcc..8ed3f6139 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1882,6 +1882,12 @@ smart_la_SOURCES = src/smart.c smart_la_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_LIBATASMART_CPPFLAGS) $(BUILD_WITH_LIBUDEV_CPPFLAGS) smart_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBATASMART_LDFLAGS) $(BUILD_WITH_LIBUDEV_LDFLAGS) smart_la_LIBADD = libignorelist.la $(BUILD_WITH_LIBATASMART_LIBS) $(BUILD_WITH_LIBUDEV_LIBS) + +test_plugin_smart_SOURCES = src/smart_test.c +test_plugin_smart_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_LIBATASMART_CPPFLAGS) $(BUILD_WITH_LIBUDEV_CPPFLAGS) -fprofile-arcs -ftest-coverage -O0 -g +test_plugin_smart_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_LIBATASMART_LDFLAGS) $(BUILD_WITH_LIBUDEV_LDFLAGS) -fprofile-arcs +test_plugin_smart_LDADD = libplugin_mock.la $(BUILD_WITH_LIBATASMART_LIBS) $(BUILD_WITH_LIBUDEV_LIBS) +check_PROGRAMS += test_plugin_smart endif endif diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 3b2516ef6..2c8a79ef9 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -8667,7 +8667,9 @@ The C plugin collects SMART information from physical disks. Values collectd include temperature, power cycle count, poweron time and bad sectors. Also, all SMART attributes are collected along with the normalized current value, the worst value, the threshold and -a human readable value. +a human readable value. The plugin can also collect SMART attributes +for NVMe disks (present in accordance with NVMe 1.4 spec) and Additional +SMART Attributes form Intel® NVMe disks. Using the following two options you can ignore some disks or configure the collection only of specific disks. @@ -8683,6 +8685,7 @@ is interpreted as a regular expression. Examples: Disk "sdd" Disk "/hda[34]/" + Disk "nvme0n1" See F for details. diff --git a/src/intel-nvme.h b/src/intel-nvme.h new file mode 100644 index 000000000..27ed6cfd2 --- /dev/null +++ b/src/intel-nvme.h @@ -0,0 +1,74 @@ +/** + * collectd - src/intel-nvme.h + * MIT License + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Bartlomiej Kotlowski + * **/ + +#include + +#ifdef __CHECKER__ +#define __force __attribute__((force)) +#else +#define __force +#endif + +#define NVME_SMART_INTEL_CDW10 0x008000ca +#define INTEL_VENDOR_ID 0x8086 + +struct __attribute__((packed)) nvme_additional_smart_log_item { + __u8 key; + __u8 _kp[2]; + __u8 norm; + __u8 _np; + union __attribute__((packed)) { + __u8 raw[6]; + struct __attribute__((packed)) wear_level { + __le16 min; + __le16 max; + __le16 avg; + } wear_level; + struct __attribute__((packed)) thermal_throttle { + __u8 pct; + __u32 count; + } thermal_throttle; + }; + __u8 _rp; +}; + +struct nvme_additional_smart_log { + struct nvme_additional_smart_log_item program_fail_cnt; + struct nvme_additional_smart_log_item erase_fail_cnt; + struct nvme_additional_smart_log_item wear_leveling_cnt; + struct nvme_additional_smart_log_item e2e_err_cnt; + struct nvme_additional_smart_log_item crc_err_cnt; + struct nvme_additional_smart_log_item timed_workload_media_wear; + struct nvme_additional_smart_log_item timed_workload_host_reads; + struct nvme_additional_smart_log_item timed_workload_timer; + struct nvme_additional_smart_log_item thermal_throttle_status; + struct nvme_additional_smart_log_item retry_buffer_overflow_cnt; + struct nvme_additional_smart_log_item pll_lock_loss_cnt; + struct nvme_additional_smart_log_item nand_bytes_written; + struct nvme_additional_smart_log_item host_bytes_written; +}; diff --git a/src/nvme.h b/src/nvme.h new file mode 100644 index 000000000..a25355a4f --- /dev/null +++ b/src/nvme.h @@ -0,0 +1,64 @@ +/** + * collectd - src/nvme.h + * MIT License + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Bartlomiej Kotlowski + * **/ + +#include + +#define NVME_NSID_ALL 0xffffffff +#define NVME_ADMIN_GET_LOG_PAGE 0x02 +#define NVME_ADMIN_IDENTIFY 0x06 + +union nvme_smart_log { + struct { + __u8 critical_warning; + __u8 temperature[2]; + __u8 avail_spare; + __u8 spare_thresh; + __u8 percent_used; + __u8 endu_grp_crit_warn_sumry; + __u8 rsvd1[25]; + __u8 data_units_read[16]; + __u8 data_units_written[16]; + __u8 host_commands_read[16]; + __u8 host_commands_written[16]; + __u8 ctrl_busy_time[16]; + __u8 power_cycles[16]; + __u8 power_on_hours[16]; + __u8 unsafe_shutdowns[16]; + __u8 media_errors[16]; + __u8 num_err_log_entries[16]; + __le32 warning_temp_time; + __le32 critical_comp_time; + __le16 temp_sensor[8]; + __le32 thm_temp1_trans_count; + __le32 thm_temp2_trans_count; + __le32 thm_temp1_total_time; + __le32 thm_temp2_total_time; + __u8 rsvd2[280]; + } data; + __u8 raw[512]; +}; diff --git a/src/smart.c b/src/smart.c index 627c16d55..c85bafa3c 100644 --- a/src/smart.c +++ b/src/smart.c @@ -22,6 +22,8 @@ * * Authors: * Vincent Bernat + * Maciej Fijalkowski + * Bartlomiej Kotlowski **/ #include "collectd.h" @@ -32,19 +34,42 @@ #include #include +#include + +#include "intel-nvme.h" +#include "nvme.h" #ifdef HAVE_SYS_CAPABILITY_H #include #endif +#define O_RDWR 02 +#define NVME_SMART_CDW10 0x00800002 +#define SHIFT_BYTE_LEFT 256 +struct nvme_admin_cmd { + __u8 opcode; + __u8 rsvd1[3]; + __u32 nsid; + __u8 rsvd2[16]; + __u64 addr; + __u8 rsvd3[4]; + __u32 data_len; + __u32 cdw10; + __u32 cdw11; + __u8 rsvd4[24]; +}; + +#define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) + static const char *config_keys[] = {"Disk", "IgnoreSelected", "IgnoreSleepMode", "UseSerial"}; static int config_keys_num = STATIC_ARRAY_SIZE(config_keys); -static ignorelist_t *ignorelist; +static ignorelist_t *ignorelist, *ignorelist_by_serial; static int ignore_sleep_mode; static int use_serial; +static int invert_ignorelist; static int smart_config(const char *key, const char *value) { if (ignorelist == NULL) @@ -52,17 +77,17 @@ static int smart_config(const char *key, const char *value) { if (ignorelist == NULL) return 1; - if (strcasecmp("Disk", key) == 0) { + if (strncasecmp("Disk", key, 5) == 0) { ignorelist_add(ignorelist, value); - } else if (strcasecmp("IgnoreSelected", key) == 0) { - int invert = 1; + } else if (strncasecmp("IgnoreSelected", key, 15) == 0) { + invert_ignorelist = 1; if (IS_TRUE(value)) - invert = 0; - ignorelist_set_invert(ignorelist, invert); - } else if (strcasecmp("IgnoreSleepMode", key) == 0) { + invert_ignorelist = 0; + ignorelist_set_invert(ignorelist, invert_ignorelist); + } else if (strncasecmp("IgnoreSleepMode", key, 16) == 0) { if (IS_TRUE(value)) ignore_sleep_mode = 1; - } else if (strcasecmp("UseSerial", key) == 0) { + } else if (strncasecmp("UseSerial", key, 10) == 0) { if (IS_TRUE(value)) use_serial = 1; } else { @@ -72,6 +97,64 @@ static int smart_config(const char *key, const char *value) { return 0; } /* int smart_config */ +static int create_ignorelist_by_serial(ignorelist_t *il) { + + struct udev *handle_udev; + struct udev_enumerate *enumerate; + struct udev_list_entry *devices, *dev_list_entry; + struct udev_device *dev; + + if (ignorelist_by_serial == NULL) + ignorelist_by_serial = ignorelist_create(invert_ignorelist); + if (ignorelist_by_serial == NULL) + return 1; + + if (invert_ignorelist == 0) { + ignorelist_set_invert(ignorelist, 1); + } + + // Use udev to get a list of disks + handle_udev = udev_new(); + if (!handle_udev) { + ERROR("smart plugin: unable to initialize udev."); + return 1; + } + enumerate = udev_enumerate_new(handle_udev); + if (enumerate == NULL) { + ERROR("fail udev_enumerate_new"); + return 1; + } + udev_enumerate_add_match_subsystem(enumerate, "block"); + udev_enumerate_add_match_property(enumerate, "DEVTYPE", "disk"); + udev_enumerate_scan_devices(enumerate); + devices = udev_enumerate_get_list_entry(enumerate); + if (devices == NULL) { + ERROR("udev returned an empty list deviecs"); + return 1; + } + udev_list_entry_foreach(dev_list_entry, devices) { + const char *path, *devpath, *serial, *name; + path = udev_list_entry_get_name(dev_list_entry); + dev = udev_device_new_from_syspath(handle_udev, path); + devpath = udev_device_get_devnode(dev); + serial = udev_device_get_property_value(dev, "ID_SERIAL_SHORT"); + name = strrchr(devpath, '/'); + if (name != NULL) { + if (name[0] == '/') + name++; + + if (ignorelist_match(ignorelist, name) == 0 && serial != NULL) { + ignorelist_add(ignorelist_by_serial, serial); + } + } + } + + if (invert_ignorelist == 0) { + ignorelist_set_invert(ignorelist, 1); + } + return 0; +} + static void smart_submit(const char *dev, const char *type, const char *type_inst, double value) { value_list_t vl = VALUE_LIST_INIT; @@ -82,7 +165,6 @@ static void smart_submit(const char *dev, const char *type, sstrncpy(vl.plugin_instance, dev, sizeof(vl.plugin_instance)); sstrncpy(vl.type, type, sizeof(vl.type)); sstrncpy(vl.type_instance, type_inst, sizeof(vl.type_instance)); - plugin_dispatch_values(&vl); } @@ -123,7 +205,278 @@ static void handle_attribute(SkDisk *d, const SkSmartAttributeParsedData *a, } } -static void smart_read_disk(SkDisk *d, char const *name) { +static inline double compute_field(__u8 *data) { + double sum = 0; + double add = 0; + + for (int i = 0; i < 16; i++) { + add = data[15 - i]; + for (int j = i + 1; j < 16; j++) { + add *= SHIFT_BYTE_LEFT; + } + sum += add; + } + return sum; +} + +static inline double int48_to_double(__u8 *data) { + double sum = 0; + double add = 0; + + for (int i = 0; i < 6; i++) { + add = data[5 - i]; + for (int j = i + 1; j < 6; j++) { + add *= SHIFT_BYTE_LEFT; + } + sum += add; + } + return sum; +} + +/** + * There is a bunch of metrics that are 16 bytes long and the need to be + * converted onto the single double value, so they can be dispatched + */ +#define NVME_METRIC_16B(metric) \ + { "nvme_" #metric, offsetof(union nvme_smart_log, data.metric), "" } + +static inline uint16_t le16_to_cpu(__le16 x) { + return le16toh((__force __u16)x); +} + +struct nvme_metric_16b { + char *label; + unsigned int offset; + char *type_inst; +} nvme_metrics[] = { + NVME_METRIC_16B(data_units_read), NVME_METRIC_16B(data_units_written), + NVME_METRIC_16B(host_commands_read), NVME_METRIC_16B(host_commands_written), + NVME_METRIC_16B(ctrl_busy_time), NVME_METRIC_16B(power_cycles), + NVME_METRIC_16B(power_on_hours), NVME_METRIC_16B(unsafe_shutdowns), + NVME_METRIC_16B(media_errors), NVME_METRIC_16B(num_err_log_entries), +}; + +static void smart_nvme_submit_16b(char const *name, __u8 *raw) { + int i = 0; + + for (; i < STATIC_ARRAY_SIZE(nvme_metrics); i++) { + DEBUG("%s : %f", nvme_metrics[i].label, + compute_field(&raw[nvme_metrics[i].offset])); + smart_submit(name, nvme_metrics[i].label, nvme_metrics[i].type_inst, + compute_field(&raw[nvme_metrics[i].offset])); + } +} + +static int get_vendor_id(const char *dev, char const *name) { + + int fd, err; + __le16 vid; + + fd = open(dev, O_RDWR); + if (fd < 0) { + ERROR("open failed with %s\n", strerror(errno)); + return fd; + } + + err = ioctl(fd, NVME_IOCTL_ADMIN_CMD, + &(struct nvme_admin_cmd){.opcode = NVME_ADMIN_IDENTIFY, + .nsid = NVME_NSID_ALL, + .addr = (unsigned long)&vid, + .data_len = sizeof(vid), + .cdw10 = 1, + .cdw11 = 0}); + + if (err < 0) { + ERROR("ioctl for NVME_IOCTL_ADMIN_CMD failed with %s\n", strerror(errno)); + close(fd); + return err; + } + + close(fd); + return (int)le16_to_cpu(vid); +} + +static int smart_read_nvme_disk(const char *dev, char const *name) { + union nvme_smart_log smart_log = {}; + int fd, status; + + fd = open(dev, O_RDWR); + if (fd < 0) { + ERROR("open failed with %s\n", strerror(errno)); + return fd; + } + + /** + * Prepare Get Log Page command + * Fill following fields (see NVMe 1.4 spec, section 5.14.1) + * - Number of DWORDS (bits 27:16) - the struct that will be passed for + * filling has 512 bytes which gives 128 (0x80) DWORDS + * - Log Page Indentifier (bits 7:0) - for SMART the id is 0x02 + */ + + status = ioctl(fd, NVME_IOCTL_ADMIN_CMD, + &(struct nvme_admin_cmd){.opcode = NVME_ADMIN_GET_LOG_PAGE, + .nsid = NVME_NSID_ALL, + .addr = (unsigned long)&smart_log, + .data_len = sizeof(smart_log), + .cdw10 = NVME_SMART_CDW10}); + if (status < 0) { + ERROR("ioctl for NVME_IOCTL_ADMIN_CMD failed with %s\n", strerror(errno)); + close(fd); + return status; + } else { + smart_submit(name, "nvme_critical_warning", "", + (double)smart_log.data.critical_warning); + smart_submit(name, "nvme_temperature", "", + ((double)(smart_log.data.temperature[1] << 8) + + smart_log.data.temperature[0] - 273)); + smart_submit(name, "nvme_avail_spare", "", + (double)smart_log.data.avail_spare); + smart_submit(name, "nvme_avail_spare_thresh", "", + (double)smart_log.data.spare_thresh); + smart_submit(name, "nvme_percent_used", "", + (double)smart_log.data.percent_used); + smart_submit(name, "nvme_endu_grp_crit_warn_sumry", "", + (double)smart_log.data.endu_grp_crit_warn_sumry); + smart_submit(name, "nvme_warning_temp_time", "", + (double)smart_log.data.warning_temp_time); + smart_submit(name, "nvme_critical_comp_time", "", + (double)smart_log.data.critical_comp_time); + smart_submit(name, "nvme_temp_sensor", "sensor_1", + (double)smart_log.data.temp_sensor[0] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_2", + (double)smart_log.data.temp_sensor[1] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_3", + (double)smart_log.data.temp_sensor[2] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_4", + (double)smart_log.data.temp_sensor[3] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_5", + (double)smart_log.data.temp_sensor[4] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_6", + (double)smart_log.data.temp_sensor[5] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_7", + (double)smart_log.data.temp_sensor[6] - 273); + smart_submit(name, "nvme_temp_sensor", "sensor_8", + (double)smart_log.data.temp_sensor[7] - 273); + smart_submit(name, "nvme_thermal_mgmt_temp1_transition_count", "", + (double)smart_log.data.thm_temp1_trans_count); + smart_submit(name, "nvme_thermal_mgmt_temp1_total_time", "", + (double)smart_log.data.thm_temp1_total_time); + smart_submit(name, "nvme_thermal_mgmt_temp2_transition_count", "", + (double)smart_log.data.thm_temp2_trans_count); + smart_submit(name, "nvme_thermal_mgmt_temp2_total_time", "", + (double)smart_log.data.thm_temp2_total_time); + smart_nvme_submit_16b(name, smart_log.raw); + } + + close(fd); + return 0; +} + +static int smart_read_nvme_intel_disk(const char *dev, char const *name) { + + DEBUG("name = %s", name); + DEBUG("dev = %s", dev); + + struct nvme_additional_smart_log intel_smart_log; + int fd, status; + fd = open(dev, O_RDWR); + if (fd < 0) { + ERROR("open failed with %s\n", strerror(errno)); + return fd; + } + + /** + * Prepare Get Log Page command + * - Additional SMART Attributes (Log Identfiter CAh) + */ + + status = + ioctl(fd, NVME_IOCTL_ADMIN_CMD, + &(struct nvme_admin_cmd){.opcode = NVME_ADMIN_GET_LOG_PAGE, + .nsid = NVME_NSID_ALL, + .addr = (unsigned long)&intel_smart_log, + .data_len = sizeof(intel_smart_log), + .cdw10 = NVME_SMART_INTEL_CDW10}); + if (status < 0) { + ERROR("ioctl for NVME_IOCTL_ADMIN_CMD failed with %s\n", strerror(errno)); + close(fd); + return status; + } else { + + smart_submit(name, "nvme_program_fail_count", "norm", + (double)intel_smart_log.program_fail_cnt.norm); + smart_submit(name, "nvme_program_fail_count", "raw", + int48_to_double(intel_smart_log.program_fail_cnt.raw)); + smart_submit(name, "nvme_erase_fail_count", "norm", + (double)intel_smart_log.erase_fail_cnt.norm); + smart_submit(name, "nvme_erase_fail_count", "raw", + int48_to_double(intel_smart_log.program_fail_cnt.raw)); + smart_submit(name, "nvme_wear_leveling", "norm", + (double)intel_smart_log.wear_leveling_cnt.norm); + smart_submit( + name, "nvme_wear_leveling", "min", + (double)le16_to_cpu(intel_smart_log.wear_leveling_cnt.wear_level.min)); + smart_submit( + name, "nvme_wear_leveling", "max", + (double)le16_to_cpu(intel_smart_log.wear_leveling_cnt.wear_level.max)); + smart_submit( + name, "nvme_wear_leveling", "avg", + (double)le16_to_cpu(intel_smart_log.wear_leveling_cnt.wear_level.avg)); + smart_submit(name, "nvme_end_to_end_error_detection_count", "norm", + (double)intel_smart_log.e2e_err_cnt.norm); + smart_submit(name, "nvme_end_to_end_error_detection_count", "raw", + int48_to_double(intel_smart_log.e2e_err_cnt.raw)); + smart_submit(name, "nvme_crc_error_count", "norm", + (double)intel_smart_log.crc_err_cnt.norm); + smart_submit(name, "nvme_crc_error_count", "raw", + int48_to_double(intel_smart_log.crc_err_cnt.raw)); + smart_submit(name, "nvme_timed_workload_media_wear", "norm", + (double)intel_smart_log.timed_workload_media_wear.norm); + smart_submit( + name, "nvme_timed_workload_media_wear", "raw", + int48_to_double(intel_smart_log.timed_workload_media_wear.raw)); + smart_submit(name, "nvme_timed_workload_host_reads", "norm", + (double)intel_smart_log.timed_workload_host_reads.norm); + smart_submit( + name, "nvme_timed_workload_host_reads", "raw", + int48_to_double(intel_smart_log.timed_workload_host_reads.raw)); + smart_submit(name, "nvme_timed_workload_timer", "norm", + (double)intel_smart_log.timed_workload_timer.norm); + smart_submit(name, "nvme_timed_workload_timer", "raw", + int48_to_double(intel_smart_log.timed_workload_timer.raw)); + smart_submit(name, "nvme_thermal_throttle_status", "norm", + (double)intel_smart_log.thermal_throttle_status.norm); + smart_submit( + name, "nvme_thermal_throttle_status", "pct", + (double)intel_smart_log.thermal_throttle_status.thermal_throttle.pct); + smart_submit( + name, "nvme_thermal_throttle_status", "count", + (double)intel_smart_log.thermal_throttle_status.thermal_throttle.count); + smart_submit(name, "nvme_retry_buffer_overflow_count", "norm", + (double)intel_smart_log.retry_buffer_overflow_cnt.norm); + smart_submit( + name, "nvme_retry_buffer_overflow_count", "raw", + int48_to_double(intel_smart_log.retry_buffer_overflow_cnt.raw)); + smart_submit(name, "nvme_pll_lock_loss_count", "norm", + (double)intel_smart_log.pll_lock_loss_cnt.norm); + smart_submit(name, "nvme_pll_lock_loss_count", "raw", + int48_to_double(intel_smart_log.pll_lock_loss_cnt.raw)); + smart_submit(name, "nvme_nand_bytes_written", "norm", + (double)intel_smart_log.host_bytes_written.norm); + smart_submit(name, "nvme_nand_bytes_written", "raw", + int48_to_double(intel_smart_log.host_bytes_written.raw)); + smart_submit(name, "nvme_host_bytes_written", "norm", + (double)intel_smart_log.host_bytes_written.norm); + smart_submit(name, "nvme_host_bytes_written", "raw", + int48_to_double(intel_smart_log.host_bytes_written.raw)); + } + + close(fd); + return 0; +} + +static void smart_read_sata_disk(SkDisk *d, char const *name) { SkBool available = FALSE; if (sk_disk_identify_is_available(d, &available) < 0 || !available) { DEBUG("smart plugin: disk %s cannot be identified.", name); @@ -183,6 +536,7 @@ static void smart_read_disk(SkDisk *d, char const *name) { static void smart_handle_disk(const char *dev, const char *serial) { SkDisk *d = NULL; const char *name; + int err; if (use_serial && serial) { name = serial; @@ -192,19 +546,49 @@ static void smart_handle_disk(const char *dev, const char *serial) { return; name++; } - if (ignorelist_match(ignorelist, name) != 0) { - DEBUG("smart plugin: ignoring %s.", dev); - return; + + if (use_serial) { + if (ignorelist_match(ignorelist_by_serial, name) != 0) { + DEBUG("smart plugin: ignoring %s. Name = %s", dev, name); + return; + } + } else { + if (ignorelist_match(ignorelist, name) != 0) { + DEBUG("smart plugin: ignoring %s. Name = %s", dev, name); + return; + } } DEBUG("smart plugin: checking SMART status of %s.", dev); - if (sk_disk_open(dev, &d) < 0) { - ERROR("smart plugin: unable to open %s.", dev); - return; - } - smart_read_disk(d, name); - sk_disk_free(d); + if (strstr(dev, "nvme")) { + err = smart_read_nvme_disk(dev, name); + if (err) { + ERROR("smart plugin: smart_read_nvme_disk failed, %d", err); + } else { + switch (get_vendor_id(dev, name)) { + case INTEL_VENDOR_ID: + err = smart_read_nvme_intel_disk(dev, name); + if (err) { + ERROR("smart plugin: smart_read_nvme_intel_disk failed, %d", err); + } + break; + + default: + DEBUG("No support vendor specific attributes"); + break; + } + } + + } else { + + if (sk_disk_open(dev, &d) < 0) { + ERROR("smart plugin: unable to open %s.", dev); + return; + } + smart_read_sata_disk(d, name); + sk_disk_free(d); + } } static int smart_read(void) { @@ -220,16 +604,24 @@ static int smart_read(void) { return -1; } enumerate = udev_enumerate_new(handle_udev); + if (enumerate == NULL) { + ERROR("fail udev_enumerate_new"); + return -1; + } udev_enumerate_add_match_subsystem(enumerate, "block"); udev_enumerate_add_match_property(enumerate, "DEVTYPE", "disk"); udev_enumerate_scan_devices(enumerate); devices = udev_enumerate_get_list_entry(enumerate); + if (devices == NULL) { + ERROR("udev returned an empty list deviecs"); + return -1; + } udev_list_entry_foreach(dev_list_entry, devices) { const char *path, *devpath, *serial; path = udev_list_entry_get_name(dev_list_entry); dev = udev_device_new_from_syspath(handle_udev, path); devpath = udev_device_get_devnode(dev); - serial = udev_device_get_property_value(dev, "ID_SERIAL"); + serial = udev_device_get_property_value(dev, "ID_SERIAL_SHORT"); /* Query status with libatasmart */ smart_handle_disk(devpath, serial); @@ -243,6 +635,15 @@ static int smart_read(void) { } /* int smart_read */ static int smart_init(void) { + int err; + if (use_serial) { + err = create_ignorelist_by_serial(ignorelist); + if (err != 0) { + ERROR("Enable to create ignorelist_by_serial"); + return 1; + } + } + #if defined(HAVE_SYS_CAPABILITY_H) && defined(CAP_SYS_RAWIO) if (check_capability(CAP_SYS_RAWIO) != 0) { if (getuid() == 0) diff --git a/src/smart_test.c b/src/smart_test.c new file mode 100644 index 000000000..4963b51eb --- /dev/null +++ b/src/smart_test.c @@ -0,0 +1,108 @@ +/** + * collectd - src/smart_test.c + * MIT License + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Bartlomiej Kotlowski + **/ + +#include "smart.c" +#include "testing.h" + +int VENDOR_ID = 0x8086; +char *CORRECT_DEV_PATH = "/dev/nvme0n1"; + +int ioctl(int __fd, unsigned long int __request, ...) { + va_list valist; + va_start(valist, __request); + struct nvme_admin_cmd *admin_cmd = va_arg(valist, struct nvme_admin_cmd *); + + if (admin_cmd->opcode == NVME_ADMIN_IDENTIFY) { + // icotl ask about vid + __le16 *vid = (__le16 *)admin_cmd->addr; + *vid = VENDOR_ID; + return 0; + } else if (admin_cmd->opcode == NVME_ADMIN_GET_LOG_PAGE) { + // icotl ask about smart attributies + if (admin_cmd->cdw10 == NVME_SMART_INTEL_CDW10) { + // set intel specyfic attrubiuties + struct nvme_additional_smart_log *intel_smart_log = + (struct nvme_additional_smart_log *)admin_cmd->addr; + intel_smart_log->program_fail_cnt.norm = 100; + return 0; + } else if (admin_cmd->cdw10 == NVME_SMART_CDW10) { + // set global smart attrubiuties + union nvme_smart_log *smart_log = (union nvme_smart_log *)admin_cmd->addr; + smart_log->data.critical_warning = 0; + return 0; + } + return -1; // no mock func + } + return -1; // no mock func +}; + +int open(const char *__path, int __oflag, ...) { + if (__path == CORRECT_DEV_PATH) { + return 0; + } + return -1; +} + +DEF_TEST(x) { + int ret; + + ret = get_vendor_id(CORRECT_DEV_PATH, "stub"); + EXPECT_EQ_INT(VENDOR_ID, ret); + + VENDOR_ID = 0x144D; + ret = get_vendor_id(CORRECT_DEV_PATH, "stub"); + EXPECT_EQ_INT(VENDOR_ID, ret); + VENDOR_ID = 0x8086; + + // incorrect with DEV_PATH + ret = get_vendor_id("dev/nvme0nXX", "stub"); + EXPECT_EQ_INT(-1, ret); + + ret = smart_read_nvme_intel_disk(CORRECT_DEV_PATH, "stub"); + EXPECT_EQ_INT(0, ret); + + // incorrect with DEV_PATH + ret = smart_read_nvme_intel_disk("dev/nvme0nXX", "stub"); + EXPECT_EQ_INT(-1, ret); + + CORRECT_DEV_PATH = "dev/sda0"; + + ret = smart_read_nvme_disk(CORRECT_DEV_PATH, "stub"); + EXPECT_EQ_INT(0, ret); + + // incorrect with DEV_PATH + ret = smart_read_nvme_disk("/dev/sdaXX", "stub"); + EXPECT_EQ_INT(-1, ret); + + return 0; +} + +int main(void) { + RUN_TEST(x); + END_TEST; +} diff --git a/src/types.db b/src/types.db index c41c22e63..8ec1f6eeb 100644 --- a/src/types.db +++ b/src/types.db @@ -255,6 +255,42 @@ smart_badsectors value:GAUGE:0:U smart_powercycles value:GAUGE:0:U smart_poweron value:GAUGE:0:U smart_temperature value:GAUGE:-300:300 +nvme_critical_warning value:GAUGE:0:31 +nvme_temperature value:GAUGE:-300:300 +nvme_avail_spare value:GAUGE:0:100 +nvme_avail_spare_thresh value:GAUGE:0:100 +nvme_percent_used value:GAUGE:0:255 +nvme_endu_grp_crit_warn_sumry value:GAUGE:0:15 +nvme_data_units_read value:GAUGE:0:U +nvme_data_units_written value:GAUGE:0:U +nvme_host_commands_read value:GAUGE:0:U +nvme_host_commands_written value:GAUGE:0:U +nvme_ctrl_busy_time value:GAUGE:0:U +nvme_power_cycles value:GAUGE:0:U +nvme_power_on_hours value:GAUGE:0:U +nvme_unsafe_shutdowns value:GAUGE:0:U +nvme_media_errors value:GAUGE:0:U +nvme_num_err_log_entries value:GAUGE:0:U +nvme_warning_temp_time value:GAUGE:0:U +nvme_critical_comp_time value:GAUGE:0:U +nvme_temp_sensor value:GAUGE:-300:300 +nvme_thermal_mgmt_temp1_transition_count value:GAUGE:0:U +nvme_thermal_mgmt_temp1_total_time value:GAUGE:0:U +nvme_thermal_mgmt_temp2_transition_count value:GAUGE:0:U +nvme_thermal_mgmt_temp2_total_time value:GAUGE:0:U +nvme_program_fail_count value:GAUGE:0:U +nvme_erase_fail_count value:GAUGE:0:U +nvme_wear_leveling value:GAUGE:0:U +nvme_end_to_end_error_detection_count value:GAUGE:0:U +nvme_crc_error_count value:GAUGE:0:U +nvme_timed_workload_timer value:GAUGE:0:U +nvme_timed_workload_media_wear value:GAUGE:0:U +nvme_timed_workload_host_reads value:GAUGE:0:100 +nvme_thermal_throttle_status value:GAUGE:0:U +nvme_retry_buffer_overflow_count value:GAUGE:0:U +nvme_pll_lock_loss_count value:GAUGE:0:U +nvme_nand_bytes_written value:GAUGE:0:U +nvme_host_bytes_written value:GAUGE:0:U snr value:GAUGE:0:U spam_check value:GAUGE:0:U spam_score value:GAUGE:U:U -- 2.47.2