From: Pawel Zak Date: Thu, 5 Nov 2020 16:28:50 +0000 (+0100) Subject: New input plugin for RAS (Reliability, Availability and Serviceability) X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=df7b529e17e88feb6d7df7884d9577b22ea72f5c;p=thirdparty%2Fcollectd.git New input plugin for RAS (Reliability, Availability and Serviceability) --- diff --git a/Makefile.am b/Makefile.am index effa6815e..4f0320751 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1827,6 +1827,21 @@ protocols_la_LDFLAGS = $(PLUGIN_LDFLAGS) protocols_la_LIBADD = libignorelist.la endif +if BUILD_PLUGIN_RAS +pkglib_LTLIBRARIES += ras.la +ras_la_SOURCES = src/ras.c +ras_la_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_SQLITE3_CPPFLAGS) +ras_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_SQLITE3_LDFLAGS) +ras_la_LIBADD = $(BUILD_WITH_SQLITE3_LIBS) + +test_plugin_ras_SOURCES = src/ras_test.c +test_plugin_ras_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_SQLITE3_CPPFLAGS) +test_plugin_ras_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_SQLITE3_LDFLAGS) +test_plugin_ras_LDADD = libplugin_mock.la $(BUILD_WITH_SQLITE3_LIBS) +check_PROGRAMS += test_plugin_ras + +endif + if BUILD_PLUGIN_REDFISH pkglib_LTLIBRARIES += redfish.la redfish_la_SOURCES = src/redfish.c diff --git a/README b/README index 8b5255636..8e1c1a6e5 100644 --- a/README +++ b/README @@ -389,6 +389,9 @@ Features collectd without the need to start a heavy interpreter every interval. See collectd-python(5) for details. + - ras + The ras plugin gathers and counts errors provided by RASDaemon + - redis The redis plugin gathers information from a Redis server, including: uptime, used memory, total connections etc. @@ -1019,6 +1022,10 @@ Prerequisites Used by the `slurm` plugin. + * libsqlite3 (optional) + Used by the `ras` plugin. + + * libstatgrab (optional) Used by various plugins to collect statistics on systems other than Linux and/or Solaris. 
diff --git a/configure.ac b/configure.ac index 69ad27b93..e81076757 100644 --- a/configure.ac +++ b/configure.ac @@ -2254,6 +2254,52 @@ AC_SUBST([BUILD_WITH_LIBREDFISH_LDFLAGS]) # }}} +# --with-sqlite3 {{{ +AC_ARG_WITH([sqlite3], + [AS_HELP_STRING([--with-sqlite3@<:@=PREFIX@:>@], [Path to sqlite3.])], + [ + if test "x$withval" = "xyes"; then + with_sqlite3="yes" + else if test "x$withval" = "xno"; then + with_sqlite3="no" + else + with_sqlite3="yes" + SQLITE3_CPPFLAGS="$SQLITE3_CPPFLAGS -I$withval/include" + SQLITE3_LDFLAGS="$SQLITE3_LDFLAGS -L$withval/lib" + fi; fi + ], + [with_sqlite3="yes"] +) + +SAVE_CPPFLAGS="$CPPFLAGS" +SAVE_LDFLAGS="$LDFLAGS" +CPPFLAGS="$CPPFLAGS $SQLITE3_CPPFLAGS" +LDFLAGS="$LDFLAGS $SQLITE3_LDFLAGS" + +if test "x$with_sqlite3" = "xyes"; then + if test "x$SQLITE3_CPPFLAGS" != "x"; then + AC_MSG_NOTICE([sqlite3 CPPFLAGS: $SQLITE3_CPPFLAGS]) + fi + AC_CHECK_HEADERS([sqlite3.h], + [with_sqlite3="yes"], + [with_sqlite3="no (sqlite3.h not found)"] + ) +fi + +CPPFLAGS="$SAVE_CPPFLAGS" +LDFLAGS="$SAVE_LDFLAGS" + +if test "x$with_sqlite3" = "xyes"; then + BUILD_WITH_SQLITE3_CPPFLAGS="$SQLITE3_CPPFLAGS" + BUILD_WITH_SQLITE3_LDFLAGS="$SQLITE3_LDFLAGS" + BUILD_WITH_SQLITE3_LIBS="-lsqlite3" +fi + +AC_SUBST([BUILD_WITH_SQLITE3_CPPFLAGS]) +AC_SUBST([BUILD_WITH_SQLITE3_LDFLAGS]) +AC_SUBST([BUILD_WITH_SQLITE3_LIBS]) +# }}} + # --with-libcurl {{{ with_curl_config="curl-config" with_curl_cflags="" @@ -6654,6 +6700,7 @@ plugin_processes="no" plugin_procevent="no" plugin_protocols="no" plugin_python="no" +plugin_ras="no" plugin_serial="no" plugin_smart="no" plugin_swap="no" @@ -6980,6 +7027,10 @@ if test "x$with_libpython" != "xno"; then plugin_python="yes" fi +if test "x$with_sqlite3" = "xyes"; then + plugin_ras="yes" +fi + if test "x$with_libatasmart" = "xyes" && test "x$with_libudev" = "xyes"; then plugin_smart="yes" fi @@ -7160,6 +7211,7 @@ AC_PLUGIN([processes], [$plugin_processes], [Process statistic AC_PLUGIN([procevent], [$plugin_procevent], [Process 
event (start, stop) statistics]) AC_PLUGIN([protocols], [$plugin_protocols], [Protocol (IP, TCP, ...) statistics]) AC_PLUGIN([python], [$plugin_python], [Embed a Python interpreter]) +AC_PLUGIN([ras], [$plugin_ras], [RAS plugin]) AC_PLUGIN([redfish], [$with_libredfish], [Redfish plugin]) AC_PLUGIN([redis], [$with_libhiredis], [Redis plugin]) AC_PLUGIN([routeros], [$with_librouteros], [RouterOS plugin]) @@ -7606,6 +7658,7 @@ AC_MSG_RESULT([ processes . . . . . . $enable_processes]) AC_MSG_RESULT([ procevent . . . . . . $enable_procevent]) AC_MSG_RESULT([ protocols . . . . . . $enable_protocols]) AC_MSG_RESULT([ python . . . . . . . $enable_python]) +AC_MSG_RESULT([ ras . . . . . . . . . $enable_ras]) AC_MSG_RESULT([ redfish . . . . . . . $enable_redfish]) AC_MSG_RESULT([ redis . . . . . . . . $enable_redis]) AC_MSG_RESULT([ routeros . . . . . . $enable_routeros]) diff --git a/src/collectd.conf.in b/src/collectd.conf.in index 846f2ab64..892fd55c8 100644 --- a/src/collectd.conf.in +++ b/src/collectd.conf.in @@ -1491,6 +1491,10 @@ # # +# +# DB_Path "/var/lib/rasdaemon/ras-mc_event.db" +# + # # # Host "redis.example.com" diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 7dbbb68f1..d3c412dcb 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -8165,6 +8165,60 @@ matching values will be ignored. This plugin embeds a Python-interpreter into collectd and provides an interface to collectd's plugin system. See L for its documentation. +=head2 Plugin C + +The C plugin gathers and counts errors provided by [RASDaemon] +(https://github.com/mchehab/rasdaemon). This plugin requires access to SQLite3 +database from `RASDaemon`. + +Metrics: + type: ras_errors + plugin_instance: CPU_(number CPU) for metrics per CPU Core metric. For metrics per Server metrics this value is empty. 
+ type_instance: + per CPU Core: + - memory_read_corrected + - memory_read_uncorrectable + - memory_write_corrected + - memory_write_uncorrectable + - cache_l0_l1 + - tlb_instruction + - processor_base + - processor_bus + - internal_timer + - smm_handler_code_access_violation + - internal_parity + - frc + - external_mce + - microcode_rom_parity + - unclassified_mce + per Server: + - cache_l2 + - upi + +Please note that `processor_base` is an aggregate counter that sums the following MCE events: +- internal_timer +- smm_handler_code_access_violation +- internal_parity +- frc +- external_mce +- microcode_rom_parity +- unclassified_mce + + +In addition, `RASDaemon` runs with the `--enable-sqlite3` flag by default. In case of +problems with the SQLite3 database, please verify that this is still the default option. + +=over 4 + +=item B<DB_Path> I<path> + +Path to the RASDaemon database (sqlite3). Please make sure that the user has read +permissions to this database. Example and default setting: + + DB_Path "/var/lib/rasdaemon/ras-mc_event.db" + +=back + =head2 Plugin C The C plugin collects sensor data using REST protocol called diff --git a/src/ras.c b/src/ras.c new file mode 100644 index 000000000..bb81ac802 --- /dev/null +++ b/src/ras.c @@ -0,0 +1,408 @@ +/** + * collectd - src/ras.c + * MIT License + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. 
+ * + * Authors: + * Bartlomiej Kotlowski + **/ + +#include "collectd.h" +#include "plugin.h" +#include "sqlite3.h" +#include "utils/common/common.h" + +#include +#include +#include + +#define RAS_PLUGIN "ras" +#define DEFAULT_DB_PATH "/var/lib/rasdaemon/ras-mc_event.db" +#define SQL_QUERY_BUFFER_SIZE 128 +#define PLUGIN_INST_SIZE 30 +#define RAS_TYPE "ras_errors" +#define STRTOL_ERROR_RET_VAL 0 + +struct ras_metrics_per_CPU { + unsigned long long ras_unclassified_mce_errors; + unsigned long long ras_microcode_rom_parity_errors; + unsigned long long ras_external_mce_errors; + unsigned long long ras_frc_errors; + unsigned long long ras_internal_parity_error; + unsigned long long ras_smm_handler_code_access_violation_errors; + unsigned long long ras_internal_timer_errors; + unsigned long long ras_processor_bus_errors; + unsigned long long ras_processor_base_errors; + unsigned long long ras_memory_read_corrected_errors; + unsigned long long ras_memory_write_corrected_errors; + unsigned long long ras_memory_read_uncorrectable_errors; + unsigned long long ras_memory_write_uncorrectable_errors; + unsigned long long ras_cache_l0_l1_errors; + unsigned long long ras_tlb_instruction_errors; +}; + +struct ras_metrics_per_server { + unsigned long long ras_cache_l2_errors; + unsigned long long ras_upi_errors; + struct ras_metrics_per_CPU *per_CPU; +} static ras_metrics_server; + +static const char *config_keys[] = {"DB_Path"}; + +static int config_keys_num = STATIC_ARRAY_SIZE(config_keys); +static int nprocs; +static long int max_id = 0; + +static char path_database[1024] = DEFAULT_DB_PATH; + +static sqlite3 *db; + +// checking if the file is not e.g. 
a symlink +static bool check_path_correct(const char *path) { + + struct stat sb; + + if (lstat(path, &sb) == -1) { + WARNING("Failed stat check for file: %s", path); + return false; + } + + if (S_ISREG(sb.st_mode) == 0) { + WARNING("Not a regular file: %s", path); + return false; + } + + return true; +} + +// checking unsigned long long int overflow +static void safe_incremented_counter(unsigned long long *value) { + if (*value == ULLONG_MAX) { + WARNING("The counter can't be incremented"); + return; + } + *value += 1; + return; +} + +// checking and validating a string when trying to convert it to an long int +static bool convert_to_number(char *text, long int *number) { + if (text == NULL) { + WARNING("Error when trying to read a numeric value. NULL value"); + return false; + } + *number = strtol(text, NULL, 10); + if (*number == STRTOL_ERROR_RET_VAL) { + if (sizeof(text) == sizeof(char *) && text[0] == '0') { + return true; + } else { + WARNING("Number is not an integer. Data read: %s", text); + return false; + } + } + + if (*number < 0) { + WARNING("Number can't be negative. Data read: %s", text); + return false; + } + + if (errno == ERANGE) { + WARNING("Number can't be greater than LONG_MAX. Data read: %s", text); + return false; + } + return true; +} + +static int ras_config(const char *key, const char *value) { + if (strcasecmp("DB_Path", key) == 0) { + sstrncpy(path_database, value, sizeof(path_database)); + } else { + DEBUG("DB_Path not provided in config. 
Using default: %s", DEFAULT_DB_PATH); + } + return 0; +} /* int ras_config */ + +static void ras_submit(const char *dev, const char *type, const char *type_inst, + unsigned long long value) { + value_list_t vl = VALUE_LIST_INIT; + + vl.values = &(value_t) {.counter = value}; + vl.values_len = 1; + sstrncpy(vl.plugin, RAS_PLUGIN, sizeof(vl.plugin)); + sstrncpy(vl.plugin_instance, dev, sizeof(vl.plugin_instance)); + sstrncpy(vl.type, type, sizeof(vl.type)); + sstrncpy(vl.type_instance, type_inst, sizeof(vl.type_instance)); + plugin_dispatch_values(&vl); +} +/* void ras_submit */ + +// Assigning the error to the appropriate counter. e.g. error with error_msg +// contains "Microcode ROM parity error" and cpu 0, should increment counter +// ras_microcode_rom_parity_errors for the 0 cpu +static void classify_entries(int cpu, char *error_msg, char *mcistatus_msg) { + + if (strstr(error_msg, "Unclassified") || + strstr(error_msg, "Internal unclassified")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_unclassified_mce_errors); + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "Microcode ROM parity error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_microcode_rom_parity_errors); + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "External error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_external_mce_errors); + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "FRC error")) { + safe_incremented_counter(&ras_metrics_server.per_CPU[cpu].ras_frc_errors); + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "Internal parity error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_internal_parity_error); + 
safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "SMM Handler Code Access Violation")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu] + .ras_smm_handler_code_access_violation_errors); + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "Internal Timer error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_internal_timer_errors); + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors); + } + if (strstr(error_msg, "BUS") && strstr(error_msg, "Error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_processor_bus_errors); + } + if (strstr(error_msg, "Memory read error")) { + if (strstr(mcistatus_msg, "Uncorrected_error")) { + safe_incremented_counter(&ras_metrics_server.per_CPU[cpu] + .ras_memory_read_uncorrectable_errors); + } else { + if (strstr(mcistatus_msg, "Corrected_error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_memory_read_corrected_errors); + } + } + } + if (strstr(error_msg, "Memory write error")) { + if (strstr(mcistatus_msg, "Uncorrected_error")) { + safe_incremented_counter(&ras_metrics_server.per_CPU[cpu] + .ras_memory_write_uncorrectable_errors); + } else { + if (strstr(mcistatus_msg, "Corrected_error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_memory_write_corrected_errors); + } + } + } + + if (((strstr(error_msg, "CACHE Level-0")) || + (strstr(error_msg, "CACHE Level-1"))) && + strstr(error_msg, "Error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_cache_l0_l1_errors); + } + if (strstr(error_msg, "Instruction TLB") && strstr(error_msg, "Error")) { + safe_incremented_counter( + &ras_metrics_server.per_CPU[cpu].ras_tlb_instruction_errors); + } + if (strstr(error_msg, "CACHE Level-2") && strstr(error_msg, "Error")) { + 
safe_incremented_counter(&ras_metrics_server.ras_cache_l2_errors); + } + if (strstr(error_msg, "UPI:")) { + safe_incremented_counter(&ras_metrics_server.ras_upi_errors); + } + return; +} + +// function is invoked for each result row coming out of the evaluated SQL +// statements +static int callback(void *NotUsed, int argc, char **argv, char **azColName) { + + long int cpu; + long int id; + // argv[0] = id , argv[1] = cpu, argv[2] = error_msg, + // argv[3] = mcistatus_msg + if (convert_to_number(argv[0], &id) && convert_to_number(argv[1], &cpu) && + argv[2] != NULL && argv[3] != NULL) { + if (cpu <= nprocs) { + classify_entries(cpu, argv[2], argv[3]); + } else { + WARNING("CPU number can't be greater than the total number of CPU. CPU: " + "%ld", + cpu); + WARNING("Can't read data id %s, cpu %s, error_msg %s, mcistatus_msg %s", + argv[0], argv[1], argv[2], argv[3]); + } + } else { + WARNING("Can't read data id %s, cpu %s, error_msg %s, mcistatus_msg %s", + argv[0], argv[1], argv[2], argv[3]); + } + + if (max_id < id) { + max_id = id; + } + + return 0; +} + +static void ras_submit_all_metrics() { + ras_submit("", RAS_TYPE, "cache_l2", ras_metrics_server.ras_cache_l2_errors); + ras_submit("", RAS_TYPE, "upi", ras_metrics_server.ras_upi_errors); + + char plugin_inst[PLUGIN_INST_SIZE]; + int cx; + for (int i = 0; i < nprocs; i++) { + cx = snprintf(plugin_inst, PLUGIN_INST_SIZE * sizeof(char), "CPU_%d", i); + if (cx < 0 || cx >= PLUGIN_INST_SIZE * sizeof(char)) { + ERROR("Error encountered during plugin's instance name creation"); + return; + } + + ras_submit(plugin_inst, RAS_TYPE, "unclassified_mce", + ras_metrics_server.per_CPU[i].ras_unclassified_mce_errors); + ras_submit(plugin_inst, RAS_TYPE, "microcode_rom_parity", + ras_metrics_server.per_CPU[i].ras_microcode_rom_parity_errors); + + ras_submit(plugin_inst, RAS_TYPE, "external_mce", + ras_metrics_server.per_CPU[i].ras_external_mce_errors); + + ras_submit(plugin_inst, RAS_TYPE, "frc", + 
ras_metrics_server.per_CPU[i].ras_frc_errors); + + ras_submit(plugin_inst, RAS_TYPE, "internal_parity", + ras_metrics_server.per_CPU[i].ras_internal_parity_error); + + ras_submit(plugin_inst, RAS_TYPE, "smm_handler_code_access_violation", + ras_metrics_server.per_CPU[i] + .ras_smm_handler_code_access_violation_errors); + + ras_submit(plugin_inst, RAS_TYPE, "internal_timer", + ras_metrics_server.per_CPU[i].ras_internal_timer_errors); + + ras_submit(plugin_inst, RAS_TYPE, "processor_bus", + ras_metrics_server.per_CPU[i].ras_processor_bus_errors); + + ras_submit(plugin_inst, RAS_TYPE, "processor_base", + ras_metrics_server.per_CPU[i].ras_processor_base_errors); + + ras_submit(plugin_inst, RAS_TYPE, "memory_read_corrected", + ras_metrics_server.per_CPU[i].ras_memory_read_corrected_errors); + + ras_submit(plugin_inst, RAS_TYPE, "memory_write_corrected", + ras_metrics_server.per_CPU[i].ras_memory_write_corrected_errors); + + ras_submit( + plugin_inst, RAS_TYPE, "memory_read_uncorrectable", + ras_metrics_server.per_CPU[i].ras_memory_read_uncorrectable_errors); + + ras_submit( + plugin_inst, RAS_TYPE, "memory_write_uncorrectable", + ras_metrics_server.per_CPU[i].ras_memory_write_uncorrectable_errors); + + ras_submit(plugin_inst, RAS_TYPE, "cache_l0_l1", + ras_metrics_server.per_CPU[i].ras_cache_l0_l1_errors); + + ras_submit(plugin_inst, RAS_TYPE, "tlb_instruction", + ras_metrics_server.per_CPU[i].ras_tlb_instruction_errors); + } +} + +static int ras_read(void) { + char *err_msg = 0; + char sql_query[SQL_QUERY_BUFFER_SIZE]; + int rc; + int cx; + + cx = snprintf(sql_query, SQL_QUERY_BUFFER_SIZE * sizeof(char), + "select id, cpu, error_msg, mcistatus_msg from " + "mce_record where id>%ld", + max_id); + + if (cx < 0 || cx >= SQL_QUERY_BUFFER_SIZE * sizeof(char)) { + ERROR("Error encountered during SQL query creation"); + return -1; + } + + rc = sqlite3_exec(db, sql_query, callback, 0, &err_msg); + if (rc != 0) { + DEBUG("SQL error: %s\n", err_msg); + sqlite3_free(err_msg); + 
return -1; + } + ras_submit_all_metrics(); + + return 0; +} /* int ras_read */ + +static int ras_init(void) { + int rc; + if (!check_path_correct(path_database)) { + ERROR("Incorrect path to Database: %s", path_database); + return -1; + } + rc = sqlite3_open_v2(path_database, &db, SQLITE_OPEN_READONLY, NULL); + + if (rc) { + ERROR("Can't open database: %s", sqlite3_errmsg(db)); + return -1; + } else { + INFO("Database opened successfully"); + } + + nprocs = get_nprocs_conf(); + ras_metrics_server.per_CPU = (struct ras_metrics_per_CPU *) calloc( + nprocs, sizeof(struct ras_metrics_per_CPU)); + if (ras_metrics_server.per_CPU == NULL) { + ERROR("Fail allocated memory"); + return -1; + } + return 0; +} /* int ras_init */ + +static int ras_shutdown(void) { + sqlite3_close(db); + free(ras_metrics_server.per_CPU); + return 0; +} + +void module_register(void) { + plugin_register_config(RAS_PLUGIN, ras_config, config_keys, config_keys_num); + plugin_register_init(RAS_PLUGIN, ras_init); + plugin_register_read(RAS_PLUGIN, ras_read); + plugin_register_shutdown(RAS_PLUGIN, ras_shutdown); +} /* void module_register */ diff --git a/src/ras_test.c b/src/ras_test.c new file mode 100644 index 000000000..c3e219a42 --- /dev/null +++ b/src/ras_test.c @@ -0,0 +1,260 @@ +/** + * collectd - src/ras_test.c + * MIT License + * + * Copyright (C) 2020 Intel Corporation. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Authors: + * Bartlomiej Kotlowski + **/ + +#include "ras.c" +#include "testing.h" + +#define NPROCS 2 + +void clear_ras_metrics_server() { + nprocs = NPROCS; + if (ras_metrics_server.per_CPU != NULL) { + free(ras_metrics_server.per_CPU); + } + ras_metrics_server.ras_cache_l2_errors = 0; + ras_metrics_server.ras_upi_errors = 0; + + ras_metrics_server.per_CPU = (struct ras_metrics_per_CPU *) calloc( + nprocs, sizeof(struct ras_metrics_per_CPU)); +} + +DEF_TEST(classify_entries) { + int CPU = 0; + clear_ras_metrics_server(); + classify_entries(CPU, "Unclassified", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_unclassified_mce_errors, + 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Internal unclassified", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_unclassified_mce_errors, + 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Microcode ROM parity error", "foo"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_microcode_rom_parity_errors, 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "External error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_external_mce_errors, 1); + 
EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "FRC error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_frc_errors, 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Internal parity error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_internal_parity_error, + 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "SMM Handler Code Access Violation", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU] + .ras_smm_handler_code_access_violation_errors, + 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Internal Timer error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_internal_timer_errors, + 1); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "BUS Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_bus_errors, 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_bus_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "BUS", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_bus_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "Memory read error", "Uncorrected_error"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_read_uncorrectable_errors, 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Memory read error", "Corrected_error"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_read_corrected_errors, 1); + + clear_ras_metrics_server(); + 
classify_entries(CPU, "Memory read error", "foo"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_read_uncorrectable_errors, 0); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_read_corrected_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "Memory write error", "Uncorrected_error"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_write_uncorrectable_errors, 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Memory write error", "Corrected_error"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_write_corrected_errors, 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Memory write error", "foo"); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_write_uncorrectable_errors, 0); + EXPECT_EQ_UINT64( + ras_metrics_server.per_CPU[CPU].ras_memory_write_corrected_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "CACHE Level-1 Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 1); + EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "CACHE Level-1 Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 1); + EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "CACHE Level-2 Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 0); + EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "CACHE Level-3 Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 0); + EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "CACHE Level-0", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 0); + 
EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "Instruction TLB Error", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_tlb_instruction_errors, + 1); + + clear_ras_metrics_server(); + classify_entries(CPU, "Instruction TLB", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_tlb_instruction_errors, + 0); + + clear_ras_metrics_server(); + classify_entries(CPU, "UPI:", "foo"); + EXPECT_EQ_UINT64(ras_metrics_server.ras_upi_errors, 1); + + free(ras_metrics_server.per_CPU); + return 0; +} + +DEF_TEST(safe_incremented_counter) { + unsigned long long value; + + value = 0; + safe_incremented_counter(&value); + EXPECT_EQ_UINT64(1, value); + + value = LONG_MAX; + safe_incremented_counter(&value); + EXPECT_EQ_UINT64(LONG_MAX + 1, value); + + value = ULLONG_MAX; + safe_incremented_counter(&value); + EXPECT_EQ_UINT64(ULLONG_MAX, value); + + return 0; +} + +DEF_TEST(convert_to_number) { + bool ret; + long int number; + + ret = convert_to_number("0", &number); + EXPECT_EQ_INT(0, number); + EXPECT_EQ_INT(1, ret ? 1 : 0); + + ret = convert_to_number("123", &number); + EXPECT_EQ_INT(123, number); + EXPECT_EQ_INT(1, ret ? 1 : 0); + + // convert max int + ret = convert_to_number("2147483647", &number); + EXPECT_EQ_INT(1, ret ? 1 : 0); + EXPECT_EQ_INT(2147483647, number); + + switch (sizeof(long int)) { + case 4: + // convert over max long int + ret = convert_to_number("2147483648", &number); + EXPECT_EQ_INT(0, ret ? 1 : 0); + break; + + case 8: + // convert max long int + ret = convert_to_number("9223372036854775807", &number); + EXPECT_EQ_INT(1, ret ? 1 : 0); + EXPECT_EQ_INT(1, (number == LONG_MAX) ? 1 : 0); + // convert over max long int + ret = convert_to_number("9223372036854775809", &number); + EXPECT_EQ_INT(0, ret ? 1 : 0); + + break; + } + // convert max int + ret = convert_to_number("foo", &number); + EXPECT_EQ_INT(0, ret ? 
1 : 0); + + ret = convert_to_number("-1", &number); + EXPECT_EQ_INT(0, ret ? 1 : 0); + + return 0; +} + +int main(void) { + RUN_TEST(classify_entries); + RUN_TEST(safe_incremented_counter); + RUN_TEST(convert_to_number); + END_TEST; +} diff --git a/src/types.db b/src/types.db index 2e4a41c7c..a976a4316 100644 --- a/src/types.db +++ b/src/types.db @@ -241,6 +241,7 @@ ps_vm value:GAUGE:0:9223372036854775807 pstates_enabled value:GAUGE:0:1 pubsub value:GAUGE:0:U queue_length value:GAUGE:0:U +ras_errors value:COUNTER:0:U records value:GAUGE:0:U redis_command_cputime value:DERIVE:0:U requests value:GAUGE:0:U