protocols_la_LIBADD = libignorelist.la
endif
+if BUILD_PLUGIN_RAS
+pkglib_LTLIBRARIES += ras.la
+ras_la_SOURCES = src/ras.c
+ras_la_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_SQLITE3_CPPFLAGS)
+ras_la_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_SQLITE3_LDFLAGS)
+ras_la_LIBADD = $(BUILD_WITH_SQLITE3_LIBS)
+
+test_plugin_ras_SOURCES = src/ras_test.c
+test_plugin_ras_CPPFLAGS = $(AM_CPPFLAGS) $(BUILD_WITH_SQLITE3_CPPFLAGS)
+test_plugin_ras_LDFLAGS = $(PLUGIN_LDFLAGS) $(BUILD_WITH_SQLITE3_LDFLAGS)
+test_plugin_ras_LDADD = libplugin_mock.la $(BUILD_WITH_SQLITE3_LIBS)
+check_PROGRAMS += test_plugin_ras
+
+endif
+
if BUILD_PLUGIN_REDFISH
pkglib_LTLIBRARIES += redfish.la
redfish_la_SOURCES = src/redfish.c
collectd without the need to start a heavy interpreter every interval.
See collectd-python(5) for details.
+ - ras
+ The ras plugin gathers and counts errors provided by RASDaemon.
+
- redis
The redis plugin gathers information from a Redis server, including:
uptime, used memory, total connections etc.
Used by the `slurm` plugin.
<https://slurm.schedmd.com/>
+ * libsqlite3 (optional)
+ Used by the `ras` plugin.
+ <https://sqlite.org/>
+
* libstatgrab (optional)
Used by various plugins to collect statistics on systems other than Linux
and/or Solaris.
# }}}
+# --with-sqlite3 {{{
+# Locate SQLite3, which is needed by the ras plugin. Both the header and the
+# library have to be usable before the plugin is enabled.
+AC_ARG_WITH([sqlite3],
+  [AS_HELP_STRING([--with-sqlite3@<:@=PREFIX@:>@], [Path to sqlite3.])],
+  [
+    if test "x$withval" = "xyes"; then
+      with_sqlite3="yes"
+    else if test "x$withval" = "xno"; then
+      with_sqlite3="no"
+    else
+      with_sqlite3="yes"
+      SQLITE3_CPPFLAGS="$SQLITE3_CPPFLAGS -I$withval/include"
+      SQLITE3_LDFLAGS="$SQLITE3_LDFLAGS -L$withval/lib"
+    fi; fi
+  ],
+  [with_sqlite3="yes"]
+)
+
+SAVE_CPPFLAGS="$CPPFLAGS"
+SAVE_LDFLAGS="$LDFLAGS"
+CPPFLAGS="$CPPFLAGS $SQLITE3_CPPFLAGS"
+LDFLAGS="$LDFLAGS $SQLITE3_LDFLAGS"
+
+if test "x$with_sqlite3" = "xyes"; then
+  if test "x$SQLITE3_CPPFLAGS" != "x"; then
+    AC_MSG_NOTICE([sqlite3 CPPFLAGS: $SQLITE3_CPPFLAGS])
+  fi
+  AC_CHECK_HEADERS([sqlite3.h],
+    [with_sqlite3="yes"],
+    [with_sqlite3="no (sqlite3.h not found)"]
+  )
+fi
+
+# A header without a linkable library would only fail later at build time,
+# so probe for the library while the user supplied LDFLAGS are still active.
+if test "x$with_sqlite3" = "xyes"; then
+  if test "x$SQLITE3_LDFLAGS" != "x"; then
+    AC_MSG_NOTICE([sqlite3 LDFLAGS: $SQLITE3_LDFLAGS])
+  fi
+  AC_CHECK_LIB([sqlite3], [sqlite3_open_v2],
+    [with_sqlite3="yes"],
+    [with_sqlite3="no (libsqlite3 not found)"]
+  )
+fi
+
+CPPFLAGS="$SAVE_CPPFLAGS"
+LDFLAGS="$SAVE_LDFLAGS"
+
+if test "x$with_sqlite3" = "xyes"; then
+  BUILD_WITH_SQLITE3_CPPFLAGS="$SQLITE3_CPPFLAGS"
+  BUILD_WITH_SQLITE3_LDFLAGS="$SQLITE3_LDFLAGS"
+  BUILD_WITH_SQLITE3_LIBS="-lsqlite3"
+fi
+
+AC_SUBST([BUILD_WITH_SQLITE3_CPPFLAGS])
+AC_SUBST([BUILD_WITH_SQLITE3_LDFLAGS])
+AC_SUBST([BUILD_WITH_SQLITE3_LIBS])
+# }}}
+
# --with-libcurl {{{
with_curl_config="curl-config"
with_curl_cflags=""
plugin_procevent="no"
plugin_protocols="no"
plugin_python="no"
+plugin_ras="no"
plugin_serial="no"
plugin_smart="no"
plugin_swap="no"
plugin_python="yes"
fi
+if test "x$with_sqlite3" = "xyes"; then
+ plugin_ras="yes"
+fi
+
if test "x$with_libatasmart" = "xyes" && test "x$with_libudev" = "xyes"; then
plugin_smart="yes"
fi
AC_PLUGIN([procevent], [$plugin_procevent], [Process event (start, stop) statistics])
AC_PLUGIN([protocols], [$plugin_protocols], [Protocol (IP, TCP, ...) statistics])
AC_PLUGIN([python], [$plugin_python], [Embed a Python interpreter])
+AC_PLUGIN([ras], [$plugin_ras], [RAS plugin])
AC_PLUGIN([redfish], [$with_libredfish], [Redfish plugin])
AC_PLUGIN([redis], [$with_libhiredis], [Redis plugin])
AC_PLUGIN([routeros], [$with_librouteros], [RouterOS plugin])
AC_MSG_RESULT([ procevent . . . . . . $enable_procevent])
AC_MSG_RESULT([ protocols . . . . . . $enable_protocols])
AC_MSG_RESULT([ python . . . . . . . $enable_python])
+AC_MSG_RESULT([ ras . . . . . . . . . $enable_ras])
AC_MSG_RESULT([ redfish . . . . . . . $enable_redfish])
AC_MSG_RESULT([ redis . . . . . . . . $enable_redis])
AC_MSG_RESULT([ routeros . . . . . . $enable_routeros])
# </Module>
#</Plugin>
+#<Plugin ras>
+# DB_Path "/var/lib/rasdaemon/ras-mc_event.db"
+#</Plugin>
+
#<Plugin redis>
# <Node example>
# Host "redis.example.com"
This plugin embeds a Python-interpreter into collectd and provides an interface
to collectd's plugin system. See L<collectd-python(5)> for its documentation.
+=head2 Plugin C<ras>
+
+The C<ras> plugin gathers and counts errors provided by RASDaemon
+(L<https://github.com/mchehab/rasdaemon>). This plugin requires read access to
+the SQLite3 database maintained by RASDaemon.
+
+Metrics:
+ type: ras_errors
+ plugin_instance: CPU_(CPU number) for per CPU Core metrics. For per Server metrics this value is empty.
+ type_instance:
+ per CPU Core:
+ - memory_read_corrected_errors
+ - memory_read_uncorrectable_errors
+ - memory_write_corrected_errors
+ - memory_write_uncorrectable_errors
+ - cache_l0_l1_errors
+ - tlb_instruction_errors
+ - processor_base_errors
+ - processor_bus_errors
+ - internal_timer_errors
+ - smm_handler_code_access_violation_errors
+ - internal_parity_errors
+ - frc_errors
+ - external_mce_errors
+ - microcode_rom_parity_errors
+ - unclassified_mce_errors
+ per Server:
+ - cache_l2_errors
+ - upi_errors
+
+Please note that C<processor_base_errors> is an aggregate counter covering the following MCE events:
+- internal_timer_errors
+- smm_handler_code_access_violation_errors
+- internal_parity_errors
+- frc_errors
+- external_mce_errors
+- microcode_rom_parity_errors
+- unclassified_mce_errors
+
+
+In addition, RASDaemon runs, by default, with the C<--enable-sqlite3> flag. In
+case of problems with the SQLite3 database, please verify that this is still the
+default option.
+
+=over 4
+
+=item B<DB_Path> I<Path>
+
+Path to the RASDaemon database (sqlite3). Please make sure that the user
+running collectd has read permission for this database. Example and default
+setting:
+
+ DB_Path "/var/lib/rasdaemon/ras-mc_event.db"
+
+=back
+
=head2 Plugin C<redfish>
The C<redfish> plugin collects sensor data using REST protocol called
--- /dev/null
+/**
+ * collectd - src/ras.c
+ * MIT License
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Bartlomiej Kotlowski <bartlomiej.kotlowski@intel.com>
+ **/
+
+#include "collectd.h"
+#include "plugin.h"
+#include "sqlite3.h"
+#include "utils/common/common.h"
+
+#include <sys/stat.h>
+#include <sys/sysinfo.h>
+#include <sys/types.h>
+
+#define RAS_PLUGIN "ras"
+#define DEFAULT_DB_PATH "/var/lib/rasdaemon/ras-mc_event.db"
+#define SQL_QUERY_BUFFER_SIZE 128
+#define PLUGIN_INST_SIZE 30
+#define RAS_TYPE "ras_errors"
+#define STRTOL_ERROR_RET_VAL 0
+
+/* Error counters kept for one CPU. ras_metrics_server.per_CPU points to an
+ * array of these that is indexed by CPU number. The fields are incremented by
+ * classify_entries() and reported by ras_submit_all_metrics(). */
+struct ras_metrics_per_CPU {
+ unsigned long long ras_unclassified_mce_errors;
+ unsigned long long ras_microcode_rom_parity_errors;
+ unsigned long long ras_external_mce_errors;
+ unsigned long long ras_frc_errors;
+ unsigned long long ras_internal_parity_error;
+ unsigned long long ras_smm_handler_code_access_violation_errors;
+ unsigned long long ras_internal_timer_errors;
+ unsigned long long ras_processor_bus_errors;
+ /* Aggregate: classify_entries() bumps this together with the unclassified,
+  * microcode ROM parity, external, FRC, internal parity, SMM handler and
+  * internal timer counters above. */
+ unsigned long long ras_processor_base_errors;
+ unsigned long long ras_memory_read_corrected_errors;
+ unsigned long long ras_memory_write_corrected_errors;
+ unsigned long long ras_memory_read_uncorrectable_errors;
+ unsigned long long ras_memory_write_uncorrectable_errors;
+ unsigned long long ras_cache_l0_l1_errors;
+ unsigned long long ras_tlb_instruction_errors;
+};
+
+/* Counters that are kept once for the whole server, plus the array holding
+ * the per-CPU counters (allocated in ras_init(), one element per CPU). */
+struct ras_metrics_per_server {
+  unsigned long long ras_cache_l2_errors;
+  unsigned long long ras_upi_errors;
+  struct ras_metrics_per_CPU *per_CPU;
+};
+
+/* The single instance used by the plugin. The storage class is written first;
+ * placing `static` after the struct body is an obsolescent form. */
+static struct ras_metrics_per_server ras_metrics_server;
+
+/* Configuration keys accepted by this plugin (passed to
+ * plugin_register_config() in module_register()). */
+static const char *config_keys[] = {"DB_Path"};
+
+static int config_keys_num = STATIC_ARRAY_SIZE(config_keys);
+/* Number of CPUs; set from get_nprocs_conf() in ras_init() and used as the
+ * length of ras_metrics_server.per_CPU. */
+static int nprocs;
+/* Highest record id seen so far; ras_read() only queries rows with a larger
+ * id and callback() raises it as rows arrive. */
+static long int max_id = 0;
+
+/* Path of the database file; overwritten by the DB_Path option. */
+static char path_database[1024] = DEFAULT_DB_PATH;
+
+/* Database handle opened in ras_init() and closed in ras_shutdown(). */
+static sqlite3 *db;
+
+/* Returns true only if `path` refers to a regular file. The check uses
+ * lstat(), so a symbolic link is inspected itself instead of being followed
+ * and is therefore rejected. A warning is logged on every rejection. */
+static bool check_path_correct(const char *path) {
+  struct stat file_info;
+
+  if (lstat(path, &file_info) == -1) {
+    WARNING("Failed stat check for file: %s", path);
+    return false;
+  }
+
+  const bool is_regular = S_ISREG(file_info.st_mode) != 0;
+  if (!is_regular) {
+    WARNING("Not a regular file: %s", path);
+  }
+
+  return is_regular;
+}
+
+/* Adds one to *value. A counter that already holds ULLONG_MAX is left
+ * unchanged and a warning is logged instead of wrapping around. */
+static void safe_incremented_counter(unsigned long long *value) {
+  if (*value < ULLONG_MAX) {
+    ++(*value);
+    return;
+  }
+
+  WARNING("The counter can't be incremented");
+}
+
+/* Parses `text` as a non-negative base-10 integer and stores it in *number.
+ *
+ * Returns true on success. Returns false, after logging a warning, when
+ * `text` is NULL, does not begin with a number, holds a negative number or a
+ * number that does not fit into a long int. Characters that follow the
+ * leading number are ignored, as they were before. */
+static bool convert_to_number(char *text, long int *number) {
+  if (text == NULL) {
+    WARNING("Error when trying to read a numeric value. NULL value");
+    return false;
+  }
+
+  char *end = NULL;
+
+  /* strtol() never clears errno. Without this reset an ERANGE left behind by
+   * an earlier call would make every later, valid number look out of range. */
+  errno = 0;
+  *number = strtol(text, &end, 10);
+
+  /* Nothing was consumed: the text does not start with a number. This also
+   * tells a genuine "0" apart from the 0 that strtol() returns on failure. */
+  if (end == text) {
+    WARNING("Number is not an integer. Data read: %s", text);
+    return false;
+  }
+
+  if (*number < 0) {
+    WARNING("Number can't be negative. Data read: %s", text);
+    return false;
+  }
+
+  if (errno == ERANGE) {
+    WARNING("Number can't be greater than LONG_MAX. Data read: %s", text);
+    return false;
+  }
+
+  return true;
+}
+
+/* Configuration callback. A "DB_Path" key (compared case-insensitively)
+ * replaces the database path; any other key only produces a debug message.
+ * Always returns 0. */
+static int ras_config(const char *key, const char *value) {
+  if (strcasecmp("DB_Path", key) != 0) {
+    DEBUG("DB_Path not provided in config. Using default: %s", DEFAULT_DB_PATH);
+    return 0;
+  }
+
+  sstrncpy(path_database, value, sizeof(path_database));
+  return 0;
+} /* int ras_config */
+
+/* Builds a value list holding the single counter `value` and hands it to
+ * plugin_dispatch_values().
+ *
+ *   dev       - copied into the plugin instance
+ *   type      - copied into the type
+ *   type_inst - copied into the type instance */
+static void ras_submit(const char *dev, const char *type, const char *type_inst,
+                       unsigned long long value) {
+  value_t sample = {.counter = value};
+  value_list_t vl = VALUE_LIST_INIT;
+
+  sstrncpy(vl.plugin, RAS_PLUGIN, sizeof(vl.plugin));
+  sstrncpy(vl.plugin_instance, dev, sizeof(vl.plugin_instance));
+  sstrncpy(vl.type, type, sizeof(vl.type));
+  sstrncpy(vl.type_instance, type_inst, sizeof(vl.type_instance));
+
+  vl.values = &sample;
+  vl.values_len = 1;
+
+  plugin_dispatch_values(&vl);
+} /* void ras_submit */
+
+// Assigning the error to the appropriate counter. e.g. error with error_msg
+// contains "Microcode ROM parity error" and cpu 0, should increment counter
+// ras_microcode_rom_parity_errors for the 0 cpu
+static void classify_entries(int cpu, char *error_msg, char *mcistatus_msg) {
+
+ if (strstr(error_msg, "Unclassified") ||
+ strstr(error_msg, "Internal unclassified")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_unclassified_mce_errors);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "Microcode ROM parity error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_microcode_rom_parity_errors);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "External error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_external_mce_errors);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "FRC error")) {
+ safe_incremented_counter(&ras_metrics_server.per_CPU[cpu].ras_frc_errors);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "Internal parity error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_internal_parity_error);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "SMM Handler Code Access Violation")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu]
+ .ras_smm_handler_code_access_violation_errors);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "Internal Timer error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_internal_timer_errors);
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_base_errors);
+ }
+ if (strstr(error_msg, "BUS") && strstr(error_msg, "Error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_processor_bus_errors);
+ }
+ if (strstr(error_msg, "Memory read error")) {
+ if (strstr(mcistatus_msg, "Uncorrected_error")) {
+ safe_incremented_counter(&ras_metrics_server.per_CPU[cpu]
+ .ras_memory_read_uncorrectable_errors);
+ } else {
+ if (strstr(mcistatus_msg, "Corrected_error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_memory_read_corrected_errors);
+ }
+ }
+ }
+ if (strstr(error_msg, "Memory write error")) {
+ if (strstr(mcistatus_msg, "Uncorrected_error")) {
+ safe_incremented_counter(&ras_metrics_server.per_CPU[cpu]
+ .ras_memory_write_uncorrectable_errors);
+ } else {
+ if (strstr(mcistatus_msg, "Corrected_error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_memory_write_corrected_errors);
+ }
+ }
+ }
+
+ if (((strstr(error_msg, "CACHE Level-0")) ||
+ (strstr(error_msg, "CACHE Level-1"))) &&
+ strstr(error_msg, "Error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_cache_l0_l1_errors);
+ }
+ if (strstr(error_msg, "Instruction TLB") && strstr(error_msg, "Error")) {
+ safe_incremented_counter(
+ &ras_metrics_server.per_CPU[cpu].ras_tlb_instruction_errors);
+ }
+ if (strstr(error_msg, "CACHE Level-2") && strstr(error_msg, "Error")) {
+ safe_incremented_counter(&ras_metrics_server.ras_cache_l2_errors);
+ }
+ if (strstr(error_msg, "UPI:")) {
+ safe_incremented_counter(&ras_metrics_server.ras_upi_errors);
+ }
+ return;
+}
+
+/* sqlite3_exec() row callback used by ras_read().
+ *
+ * The query selects four columns, so argv is expected to hold:
+ *   argv[0] = id, argv[1] = cpu, argv[2] = error_msg, argv[3] = mcistatus_msg
+ *
+ * A valid row is passed on to classify_entries(); a row that cannot be used
+ * is reported with a warning and skipped. max_id is raised whenever the id
+ * could be parsed, so a skipped row is not fetched again. Always returns 0,
+ * which lets sqlite3_exec() continue with the next row. */
+static int callback(void *NotUsed, int argc, char **argv, char **azColName) {
+  long int cpu = 0;
+  long int id = 0;
+
+  (void)NotUsed;
+  (void)azColName;
+
+  if (argc < 4 || argv == NULL) {
+    WARNING("Unexpected number of columns in the query result: %d", argc);
+    return 0;
+  }
+
+  /* NULL column values must not be handed to a "%s" conversion. */
+  const char *id_txt = (argv[0] != NULL) ? argv[0] : "(null)";
+  const char *cpu_txt = (argv[1] != NULL) ? argv[1] : "(null)";
+  const char *error_txt = (argv[2] != NULL) ? argv[2] : "(null)";
+  const char *status_txt = (argv[3] != NULL) ? argv[3] : "(null)";
+
+  const bool id_ok = convert_to_number(argv[0], &id);
+  const bool row_ok = id_ok && convert_to_number(argv[1], &cpu) &&
+                      argv[2] != NULL && argv[3] != NULL;
+
+  if (!row_ok) {
+    WARNING("Can't read data id %s, cpu %s, error_msg %s, mcistatus_msg %s",
+            id_txt, cpu_txt, error_txt, status_txt);
+  } else if (cpu >= nprocs) {
+    /* per_CPU has exactly nprocs elements, so nprocs itself is already out
+     * of bounds. */
+    WARNING("CPU number can't be greater than the total number of CPU. CPU: "
+            "%ld",
+            cpu);
+    WARNING("Can't read data id %s, cpu %s, error_msg %s, mcistatus_msg %s",
+            id_txt, cpu_txt, error_txt, status_txt);
+  } else {
+    classify_entries((int)cpu, argv[2], argv[3]);
+  }
+
+  /* Only trust id when it was actually parsed. */
+  if (id_ok && max_id < id) {
+    max_id = id;
+  }
+
+  return 0;
+}
+
+/* Reports every counter through ras_submit(): first the two per-server
+ * counters with an empty plugin instance, then the counters of each CPU with
+ * the plugin instance "CPU_<n>". Stops early, after logging an error, if a
+ * plugin instance name cannot be formatted. */
+static void ras_submit_all_metrics() {
+  char plugin_inst[PLUGIN_INST_SIZE];
+
+  ras_submit("", RAS_TYPE, "cache_l2", ras_metrics_server.ras_cache_l2_errors);
+  ras_submit("", RAS_TYPE, "upi", ras_metrics_server.ras_upi_errors);
+
+  for (int cpu = 0; cpu < nprocs; cpu++) {
+    const struct ras_metrics_per_CPU *core = &ras_metrics_server.per_CPU[cpu];
+
+    int written = snprintf(plugin_inst, sizeof(plugin_inst), "CPU_%d", cpu);
+    if (written < 0 || (size_t)written >= sizeof(plugin_inst)) {
+      ERROR("Error encountered during plugin's instance name creation");
+      return;
+    }
+
+    /* Type instance / value pairs, in the order they are dispatched. */
+    const struct {
+      const char *type_inst;
+      unsigned long long value;
+    } metrics[] = {
+        {"unclassified_mce", core->ras_unclassified_mce_errors},
+        {"microcode_rom_parity", core->ras_microcode_rom_parity_errors},
+        {"external_mce", core->ras_external_mce_errors},
+        {"frc", core->ras_frc_errors},
+        {"internal_parity", core->ras_internal_parity_error},
+        {"smm_handler_code_access_violation",
+         core->ras_smm_handler_code_access_violation_errors},
+        {"internal_timer", core->ras_internal_timer_errors},
+        {"processor_bus", core->ras_processor_bus_errors},
+        {"processor_base", core->ras_processor_base_errors},
+        {"memory_read_corrected", core->ras_memory_read_corrected_errors},
+        {"memory_write_corrected", core->ras_memory_write_corrected_errors},
+        {"memory_read_uncorrectable",
+         core->ras_memory_read_uncorrectable_errors},
+        {"memory_write_uncorrectable",
+         core->ras_memory_write_uncorrectable_errors},
+        {"cache_l0_l1", core->ras_cache_l0_l1_errors},
+        {"tlb_instruction", core->ras_tlb_instruction_errors},
+    };
+
+    for (size_t k = 0; k < STATIC_ARRAY_SIZE(metrics); k++) {
+      ras_submit(plugin_inst, RAS_TYPE, metrics[k].type_inst,
+                 metrics[k].value);
+    }
+  }
+}
+
+/* Read callback: fetches every mce_record row whose id is greater than
+ * max_id, lets callback() classify the rows, then submits all counters.
+ *
+ * Returns 0 on success and -1 if the query could not be built or executed. */
+static int ras_read(void) {
+  char *err_msg = NULL;
+  char sql_query[SQL_QUERY_BUFFER_SIZE];
+
+  /* max_id is a number owned by this plugin, so formatting it into the
+   * statement cannot inject SQL. */
+  int len = snprintf(sql_query, sizeof(sql_query),
+                     "select id, cpu, error_msg, mcistatus_msg from "
+                     "mce_record where id>%ld",
+                     max_id);
+  if (len < 0 || (size_t)len >= sizeof(sql_query)) {
+    ERROR("Error encountered during SQL query creation");
+    return -1;
+  }
+
+  int rc = sqlite3_exec(db, sql_query, callback, NULL, &err_msg);
+  if (rc != SQLITE_OK) {
+    /* Report the failure at error level so that the reason for a failed
+     * read is logged; fall back to sqlite3_errmsg() when sqlite3_exec() did
+     * not provide a message of its own. */
+    ERROR("SQL error: %s", (err_msg != NULL) ? err_msg : sqlite3_errmsg(db));
+    sqlite3_free(err_msg);
+    return -1;
+  }
+
+  ras_submit_all_metrics();
+
+  return 0;
+} /* int ras_read */
+
+/* Init callback: verifies the database path, opens the database read-only
+ * and allocates one zeroed counter set per CPU.
+ *
+ * Returns 0 on success and -1 on failure. On failure no database handle is
+ * left open. */
+static int ras_init(void) {
+  if (!check_path_correct(path_database)) {
+    ERROR("Incorrect path to Database: %s", path_database);
+    return -1;
+  }
+
+  int rc = sqlite3_open_v2(path_database, &db, SQLITE_OPEN_READONLY, NULL);
+  if (rc != SQLITE_OK) {
+    ERROR("Can't open database: %s", sqlite3_errmsg(db));
+    /* sqlite3_open_v2() usually hands back a handle even when it fails; it
+     * has to be closed to release its resources. */
+    sqlite3_close(db);
+    db = NULL;
+    return -1;
+  }
+  INFO("Database opened successfully");
+
+  nprocs = get_nprocs_conf();
+  ras_metrics_server.per_CPU = (struct ras_metrics_per_CPU *)calloc(
+      nprocs, sizeof(struct ras_metrics_per_CPU));
+  if (ras_metrics_server.per_CPU == NULL) {
+    ERROR("Fail allocated memory");
+    /* Do not keep the database open when initialisation fails. */
+    sqlite3_close(db);
+    db = NULL;
+    return -1;
+  }
+
+  return 0;
+} /* int ras_init */
+
+/* Shutdown callback: closes the database and releases the per-CPU counters.
+ * Both globals are reset to NULL afterwards so that a repeated call does not
+ * close or free them a second time. Always returns 0. */
+static int ras_shutdown(void) {
+  sqlite3_close(db);
+  db = NULL;
+
+  free(ras_metrics_server.per_CPU);
+  ras_metrics_server.per_CPU = NULL;
+
+  return 0;
+}
+
+/* Plugin entry point: registers the config, init, read and shutdown
+ * callbacks of this plugin under the name RAS_PLUGIN. */
+void module_register(void) {
+ plugin_register_config(RAS_PLUGIN, ras_config, config_keys, config_keys_num);
+ plugin_register_init(RAS_PLUGIN, ras_init);
+ plugin_register_read(RAS_PLUGIN, ras_read);
+ plugin_register_shutdown(RAS_PLUGIN, ras_shutdown);
+} /* void module_register */
--- /dev/null
+/**
+ * collectd - src/ras_test.c
+ * MIT License
+ *
+ * Copyright (C) 2020 Intel Corporation. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ * Bartlomiej Kotlowski <bartlomiej.kotlowski@intel.com>
+ **/
+
+#include "ras.c"
+#include "testing.h"
+
+#define NPROCS 2
+
+/* Test fixture: puts ras_metrics_server back into a known state with NPROCS
+ * CPUs, all counters zero and a freshly allocated per-CPU array. */
+void clear_ras_metrics_server() {
+  nprocs = NPROCS;
+
+  /* free(NULL) is a no-op, so no guard is needed for the first call. */
+  free(ras_metrics_server.per_CPU);
+
+  ras_metrics_server.ras_upi_errors = 0;
+  ras_metrics_server.ras_cache_l2_errors = 0;
+  ras_metrics_server.per_CPU =
+      calloc(nprocs, sizeof(*ras_metrics_server.per_CPU));
+}
+
+/* Checks that classify_entries() increments exactly the expected counters for
+ * each kind of message. The counters are reset before every case. */
+DEF_TEST(classify_entries) {
+  int CPU = 0;
+
+  /* Events that also feed the aggregate processor_base counter. */
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Unclassified", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_unclassified_mce_errors,
+                   1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Internal unclassified", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_unclassified_mce_errors,
+                   1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Microcode ROM parity error", "foo");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_microcode_rom_parity_errors, 1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "External error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_external_mce_errors, 1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "FRC error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_frc_errors, 1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Internal parity error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_internal_parity_error,
+                   1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "SMM Handler Code Access Violation", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU]
+                       .ras_smm_handler_code_access_violation_errors,
+                   1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Internal Timer error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_internal_timer_errors,
+                   1);
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_base_errors,
+                   1);
+
+  /* Bus errors need both "BUS" and "Error". */
+  clear_ras_metrics_server();
+  classify_entries(CPU, "BUS Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_bus_errors, 1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_bus_errors, 0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "BUS", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_processor_bus_errors, 0);
+
+  /* Memory errors are split by the mcistatus message. */
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Memory read error", "Uncorrected_error");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_read_uncorrectable_errors, 1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Memory read error", "Corrected_error");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_read_corrected_errors, 1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Memory read error", "foo");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_read_uncorrectable_errors, 0);
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_read_corrected_errors, 0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Memory write error", "Uncorrected_error");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_write_uncorrectable_errors, 1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Memory write error", "Corrected_error");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_write_corrected_errors, 1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Memory write error", "foo");
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_write_uncorrectable_errors, 0);
+  EXPECT_EQ_UINT64(
+      ras_metrics_server.per_CPU[CPU].ras_memory_write_corrected_errors, 0);
+
+  /* Cache errors: Level-0 and Level-1 are counted per CPU, Level-2 per
+   * server, other levels not at all. The Level-0 case used to be a second
+   * copy of the Level-1 case, which left that path untested. */
+  clear_ras_metrics_server();
+  classify_entries(CPU, "CACHE Level-0 Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 1);
+  EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "CACHE Level-1 Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 1);
+  EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "CACHE Level-2 Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 0);
+  EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "CACHE Level-3 Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 0);
+  EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "CACHE Level-0", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_cache_l0_l1_errors, 0);
+  EXPECT_EQ_UINT64(ras_metrics_server.ras_cache_l2_errors, 0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Instruction TLB Error", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_tlb_instruction_errors,
+                   1);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "Instruction TLB", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.per_CPU[CPU].ras_tlb_instruction_errors,
+                   0);
+
+  clear_ras_metrics_server();
+  classify_entries(CPU, "UPI:", "foo");
+  EXPECT_EQ_UINT64(ras_metrics_server.ras_upi_errors, 1);
+
+  /* Reset the pointer so that a later clear_ras_metrics_server() call cannot
+   * free the array a second time. */
+  free(ras_metrics_server.per_CPU);
+  ras_metrics_server.per_CPU = NULL;
+  return 0;
+}
+
+/* Checks that safe_incremented_counter() adds one and stops at ULLONG_MAX. */
+DEF_TEST(safe_incremented_counter) {
+  unsigned long long value;
+
+  value = 0;
+  safe_incremented_counter(&value);
+  EXPECT_EQ_UINT64(1, value);
+
+  /* Widen before adding one: "LONG_MAX + 1" evaluated as a long is a signed
+   * overflow, which is undefined behaviour. */
+  value = LONG_MAX;
+  safe_incremented_counter(&value);
+  EXPECT_EQ_UINT64((unsigned long long)LONG_MAX + 1, value);
+
+  /* A saturated counter must stay where it is. */
+  value = ULLONG_MAX;
+  safe_incremented_counter(&value);
+  EXPECT_EQ_UINT64(ULLONG_MAX, value);
+
+  return 0;
+}
+
+/* Checks convert_to_number() with valid numbers, the limits of long int,
+ * text that is not a number and a negative number. */
+DEF_TEST(convert_to_number) {
+  bool ret;
+  long int number;
+
+  ret = convert_to_number("0", &number);
+  EXPECT_EQ_INT(0, number);
+  EXPECT_EQ_INT(1, ret ? 1 : 0);
+
+  ret = convert_to_number("123", &number);
+  EXPECT_EQ_INT(123, number);
+  EXPECT_EQ_INT(1, ret ? 1 : 0);
+
+  // largest value of a 32 bit int
+  ret = convert_to_number("2147483647", &number);
+  EXPECT_EQ_INT(1, ret ? 1 : 0);
+  EXPECT_EQ_INT(2147483647, number);
+
+  if (sizeof(long int) == 4) {
+    // one more than LONG_MAX of a 32 bit long int
+    ret = convert_to_number("2147483648", &number);
+    EXPECT_EQ_INT(0, ret ? 1 : 0);
+  } else if (sizeof(long int) == 8) {
+    // LONG_MAX of a 64 bit long int
+    ret = convert_to_number("9223372036854775807", &number);
+    EXPECT_EQ_INT(1, ret ? 1 : 0);
+    EXPECT_EQ_INT(1, (number == LONG_MAX) ? 1 : 0);
+    // beyond LONG_MAX of a 64 bit long int
+    ret = convert_to_number("9223372036854775809", &number);
+    EXPECT_EQ_INT(0, ret ? 1 : 0);
+  }
+
+  // text that is not a number
+  ret = convert_to_number("foo", &number);
+  EXPECT_EQ_INT(0, ret ? 1 : 0);
+
+  // negative numbers are rejected
+  ret = convert_to_number("-1", &number);
+  EXPECT_EQ_INT(0, ret ? 1 : 0);
+
+  return 0;
+}
+
+/* Test driver: runs the three test cases defined in this file. */
+int main(void) {
+ RUN_TEST(classify_entries);
+ RUN_TEST(safe_incremented_counter);
+ RUN_TEST(convert_to_number);
+ /* NOTE(review): END_TEST comes from testing.h and presumably produces the
+  * return value of main - confirm. */
+ END_TEST;
+}
pstates_enabled value:GAUGE:0:1
pubsub value:GAUGE:0:U
queue_length value:GAUGE:0:U
+ras_errors value:COUNTER:0:U
records value:GAUGE:0:U
redis_command_cputime value:DERIVE:0:U
requests value:GAUGE:0:U