From: Pablo Llopis Date: Wed, 2 Jan 2019 20:05:17 +0000 (+0100) Subject: Add new slurm plugin X-Git-Tag: collectd-5.11.0~3^2~16 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6139038fdae818ac7f4904743fd4e479ddfaca13;p=thirdparty%2Fcollectd.git Add new slurm plugin --- diff --git a/Makefile.am b/Makefile.am index 85f8da8a7..3620423f8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1697,6 +1697,14 @@ sigrok_la_LDFLAGS = $(PLUGIN_LDFLAGS) sigrok_la_LIBADD = $(LIBSIGROK_LIBS) endif +if BUILD_PLUGIN_SLURM +pkglib_LTLIBRARIES += slurm.la +slurm_la_SOURCES = src/slurm.c +slurm_la_CFLAGS = $(AM_CFLAGS) $(BUILD_WITH_LIBSLURM_CFLAGS) +slurm_la_LDFLAGS = $(PLUGIN_LDFLAGS) +slurm_la_LIBADD = $(BUILD_WITH_LIBSLURM_LIBS) +endif + if BUILD_PLUGIN_SMART if BUILD_WITH_LIBUDEV pkglib_LTLIBRARIES += smart.la diff --git a/configure.ac b/configure.ac index c95422f4e..d97040b89 100644 --- a/configure.ac +++ b/configure.ac @@ -6352,6 +6352,83 @@ AC_DEFUN( ] )# AC_PLUGIN(name, default, info) +# --with-libslurm {{{ +AC_ARG_WITH([libslurm], + [AS_HELP_STRING([--with-libslurm@<:@=PREFIX@:>@], [Path to the libslurm library.])], + [ + if test "x$withval" = "xno"; then + with_libslurm="no" + else if test "x$withval" = "xyes"; then + with_libslurm="use_pkgconfig" + else if test -d "$with_libslurm/lib"; then + AC_MSG_NOTICE([Not checking for libslurm: Manually configured]) + with_libslurm_cflags="-I$withval/include" + with_libslurm_libs="-L$withval/lib -llibslurm" + with_libslurm="yes" + fi; fi; fi + ], + [with_libslurm="use_pkgconfig"] +) + +# configure using pkg-config +if test "x$with_libslurm" = "xuse_pkgconfig"; then + AC_MSG_NOTICE([Checking for libslurm using $PKG_CONFIG]) + $PKG_CONFIG --exists 'slurm' 2>/dev/null + if test $? -ne 0; then + with_libslurm="no (pkg-config doesn't know libslurm)" + fi +fi + +if test "x$with_libslurm" = "xuse_pkgconfig"; then + with_libslurm_cflags="`$PKG_CONFIG --cflags 'slurm'`" + if test $? 
-ne 0; then + with_libslurm="no ($PKG_CONFIG failed)" + fi + + with_libslurm_libs="`$PKG_CONFIG --libs 'slurm'`" + if test $? -ne 0; then + with_libslurm="no ($PKG_CONFIG failed)" + fi +fi + +if test "x$with_libslurm" = "xuse_pkgconfig"; then + with_libslurm="yes" +fi + +if test "x$with_libslurm" = "xyes"; then + SAVE_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $with_libslurm_cflags" + + AC_CHECK_HEADERS([slurm/slurm.h], + [with_libslurm="yes"], + [with_libslurm="no (slurm/slurm.h not found)"] + ) + + CPPFLAGS="$SAVE_CPPFLAGS" +fi + +if test "x$with_libslurm" = "xyes"; then + SAVE_LDFLAGS="$LDFLAGS" + LDFLAGS="$LDFLAGS $with_libslurm_libs" + + AC_CHECK_LIB([slurm], [slurm_load_jobs], + [with_libslurm="yes"], + [with_libslurm="no (symbol slurm_load_jobs not found)"] + ) + + LDFLAGS="$SAVE_LDFLAGS" +fi + +if test "x$with_libslurm" = "xyes"; then + BUILD_WITH_LIBSLURM_CFLAGS="$with_libslurm_cflags" + BUILD_WITH_LIBSLURM_LIBS="$with_libslurm_libs" +fi + +AC_SUBST([BUILD_WITH_LIBSLURM_CFLAGS]) +AC_SUBST([BUILD_WITH_LIBSLURM_LIBS]) +# }}} + + m4_divert_once([HELP_ENABLE], [ collectd features:]) # FIXME: Remove these calls to `AC_COLLECTD' and then remove that macro. @@ -6890,6 +6967,7 @@ AC_PLUGIN([rrdtool], [$with_librrd], [RRDTool output pl AC_PLUGIN([sensors], [$with_libsensors], [lm_sensors statistics]) AC_PLUGIN([serial], [$plugin_serial], [serial port traffic]) AC_PLUGIN([sigrok], [$with_libsigrok], [sigrok acquisition sources]) +AC_PLUGIN([slurm], [$with_libslurm], [SLURM jobs and nodes status]) AC_PLUGIN([smart], [$plugin_smart], [SMART statistics]) AC_PLUGIN([snmp], [$with_libnetsnmp], [SNMP querying plugin]) AC_PLUGIN([snmp_agent], [$with_libnetsnmpagent], [SNMP agent plugin]) @@ -7183,7 +7261,11 @@ AC_MSG_RESULT([ librouteros . . . . . $with_librouteros]) AC_MSG_RESULT([ librrd . . . . . . . $with_librrd]) AC_MSG_RESULT([ libsensors . . . . . $with_libsensors]) AC_MSG_RESULT([ libsigrok . . . . . $with_libsigrok]) +<<<<<<< HEAD AC_MSG_RESULT([ libssl . . . . . 
. . $with_libssl]) +======= +AC_MSG_RESULT([ libslurm . . . . . . $with_libslurm]) +>>>>>>> Add new slurm plugin AC_MSG_RESULT([ libstatgrab . . . . . $with_libstatgrab]) AC_MSG_RESULT([ libtokyotyrant . . . $with_libtokyotyrant]) AC_MSG_RESULT([ libudev . . . . . . . $with_libudev]) @@ -7316,6 +7398,7 @@ AC_MSG_RESULT([ rrdtool . . . . . . . $enable_rrdtool]) AC_MSG_RESULT([ sensors . . . . . . . $enable_sensors]) AC_MSG_RESULT([ serial . . . . . . . $enable_serial]) AC_MSG_RESULT([ sigrok . . . . . . . $enable_sigrok]) +AC_MSG_RESULT([ slurm . . . . . . . . $enable_slurm]) AC_MSG_RESULT([ smart . . . . . . . . $enable_smart]) AC_MSG_RESULT([ snmp . . . . . . . . $enable_snmp]) AC_MSG_RESULT([ snmp_agent . . . . . $enable_snmp_agent]) diff --git a/src/slurm.c b/src/slurm.c new file mode 100644 index 000000000..640820f77 --- /dev/null +++ b/src/slurm.c @@ -0,0 +1,228 @@ +/** + * collectd - src/slurm.c + * Copyright (C) 2018 Pablo Llopis + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; only version 2 of the License is applicable. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Authors: + * Pablo Llopis + **/ + +#define _DEFAULT_SOURCE +#define _BSD_SOURCE + +#include "collectd.h" + +#include "common.h" +#include "plugin.h" + +#include +#include +#include +#include +#include +#include + +#define PLUGIN_NAME "slurm" +#define PART_NAME_SIZE 128 + +typedef struct partition_state_st { + char name[PART_NAME_SIZE]; + /* counts nodes states indexed by enum node_states in slurm.h */ + uint32_t nodes_states[NODE_STATE_END]; + /* counts jobs states indexed by enum job_states in slurm.h */ + uint32_t jobs_states[JOB_END]; +} partition_state_t; + +/* based on enum node_states from slurm.h */ +static const char* node_state_names[] = { + "unknown", + "down", + "idle", + "allocated", + "error", + "mixed", + "future", +}; + +/* based on enum job_states from slurm.h */ +static const char* job_state_names[] = { + "pending", + "running", + "suspended", + "complete", + "cancelled", + "failed", + "timeout", + "node_fail", + "preempted", + "boot_fail", + "deadline", + "oom", +}; + +static partition_state_t *alloc_partition_states(uint32_t num_partitions, partition_info_t *partitions) +{ + partition_state_t *partition_states; + + partition_states = (partition_state_t*)calloc(num_partitions, sizeof(partition_state_t)); + if (!partition_states) { + return NULL; + } + + for (int i=0; iname, "count", job_state_names[i], partition->jobs_states[i]); + } + for (int i=0; iname, "count", node_state_names[i], partition->nodes_states[i]); + } +} + +static int slurm_read(void) { + job_info_msg_t * job_buffer_ptr = NULL; + job_info_t * job_ptr; + partition_info_msg_t *part_buffer_ptr = NULL; + partition_info_t *part_ptr; + partition_state_t *partition_states; + partition_state_t *partition_state; + node_info_msg_t *node_buffer_ptr = NULL; + node_info_t 
*node_ptr; + + if (slurm_load_jobs((time_t)NULL, + &job_buffer_ptr, SHOW_ALL) ) { + ERROR("slurm_load_jobs error"); + return -1; + } + + if (slurm_load_node((time_t)NULL, + &node_buffer_ptr, SHOW_ALL) ) { + slurm_free_job_info_msg(job_buffer_ptr); + ERROR("slurm_load_node error"); + return -1; + } + + if (slurm_load_partitions((time_t)NULL, + &part_buffer_ptr, 0) ) { + slurm_free_job_info_msg(job_buffer_ptr); + slurm_free_node_info_msg(node_buffer_ptr); + ERROR("slurm_load_partitions error"); + return -1; + } + + /* SLURM APIs provide *non-relational* data about nodes, partitions and jobs. + * We allocate a data structure that relates all three together, and the following + * two for loops fill this data structure. The data structure is an array + * of partition_state_t that holds job and node states. */ + uint32_t num_partitions = part_buffer_ptr->record_count; + partition_states = alloc_partition_states(num_partitions, part_buffer_ptr->partition_array); + if (!partition_states) { + slurm_free_job_info_msg(job_buffer_ptr); + slurm_free_node_info_msg(node_buffer_ptr); + slurm_free_partition_info_msg(part_buffer_ptr); + ERROR("alloc_partition_states"); + return -1; + } + + /* fill partition_states array with per-partition job state information */ + for (int i=0; irecord_count; i++) { + job_ptr = &job_buffer_ptr->job_array[i]; + partition_state = find_partition(partition_states, num_partitions, job_ptr->partition); + if (!partition_state) { + ERROR("slurm_read: cannot find partition %s from jobid %d" + " in partition list returned by slurm_load_partitions", + job_ptr->partition, job_ptr->job_id); + continue; + } + + uint8_t job_state = job_ptr->job_state & JOB_STATE_BASE; + partition_state->jobs_states[job_state]++; + } + + /* fill partition_states array with per-partition node state information */ + for (int i=0; irecord_count; i++) { + part_ptr = &part_buffer_ptr->partition_array[i]; + + partition_state = find_partition(partition_states, num_partitions, 
part_ptr->name); + if (!partition_state) { + ERROR("slurm_read: cannot find partition %s" + " in partition list returned by slurm_load_partitions", + part_ptr->name); + continue; + } + + for (int j=0; part_ptr->node_inx; j+=2) { + if (part_ptr->node_inx[j] == -1) + break; + for (int k = part_ptr->node_inx[j]; + k <= part_ptr->node_inx[j+1]; + k++) { + node_ptr = &node_buffer_ptr->node_array[k]; + /* some non-existent nodes (name is NULL) may show up as node_state FUTURE */ + uint8_t node_state = node_ptr->node_state & NODE_STATE_BASE; + partition_state->nodes_states[node_state]++; + } + } + } + + for (int i=0; i