]> git.ipfire.org Git - thirdparty/collectd.git/commitdiff
Add new slurm plugin
author Pablo Llopis <pablo.llopis@cern.ch>
Wed, 2 Jan 2019 20:05:17 +0000 (21:05 +0100)
committer Pablo Llopis <pablo.llopis@cern.ch>
Wed, 29 May 2019 09:50:34 +0000 (11:50 +0200)
Makefile.am
configure.ac
src/slurm.c [new file with mode: 0644]

index 85f8da8a7345cf3d7dc8d54ca156eed07a60bbd0..3620423f8e12c91d3b7e4301fcec4a7d9df1607b 100644 (file)
@@ -1697,6 +1697,14 @@ sigrok_la_LDFLAGS = $(PLUGIN_LDFLAGS)
 sigrok_la_LIBADD = $(LIBSIGROK_LIBS)
 endif
 
+if BUILD_PLUGIN_SLURM
+pkglib_LTLIBRARIES += slurm.la
+slurm_la_SOURCES = src/slurm.c
+slurm_la_CFLAGS = $(AM_CFLAGS) $(BUILD_WITH_LIBSLURM_CFLAGS)
+slurm_la_LDFLAGS = $(PLUGIN_LDFLAGS)
+slurm_la_LIBADD = $(BUILD_WITH_LIBSLURM_LIBS)
+endif
+
 if BUILD_PLUGIN_SMART
 if BUILD_WITH_LIBUDEV
 pkglib_LTLIBRARIES += smart.la
index c95422f4e1b8b93da6cdf856adfafeccc561966e..d97040b89d0089d8ffe1a37543d50083f4a9e41f 100644 (file)
@@ -6352,6 +6352,83 @@ AC_DEFUN(
   ]
 )# AC_PLUGIN(name, default, info)
 
+# --with-libslurm: locate libslurm via pkg-config or an explicit PREFIX {{{
+AC_ARG_WITH([libslurm],
+  [AS_HELP_STRING([--with-libslurm@<:@=PREFIX@:>@], [Path to the libslurm library.])],
+  [
+    if test "x$withval" = "xno"; then
+      with_libslurm="no"
+    elif test "x$withval" = "xyes"; then
+      with_libslurm="use_pkgconfig"
+    elif test -d "$withval/lib"; then
+      AC_MSG_NOTICE([Not checking for libslurm: Manually configured])
+      with_libslurm_cflags="-I$withval/include"
+      with_libslurm_libs="-L$withval/lib -lslurm"
+      with_libslurm="yes"
+    fi
+  ],
+  [with_libslurm="use_pkgconfig"]
+)
+
+# ask pkg-config for the 'slurm' module unless a PREFIX was given or libslurm was disabled
+if test "x$with_libslurm" = "xuse_pkgconfig"; then
+  AC_MSG_NOTICE([Checking for libslurm using $PKG_CONFIG])
+  $PKG_CONFIG --exists 'slurm' 2>/dev/null
+  if test $? -ne 0; then
+    with_libslurm="no (pkg-config doesn't know libslurm)"
+  fi
+fi
+
+if test "x$with_libslurm" = "xuse_pkgconfig"; then
+  with_libslurm_cflags="`$PKG_CONFIG --cflags 'slurm'`"
+  if test $? -ne 0; then
+    with_libslurm="no ($PKG_CONFIG failed)"
+  fi
+
+  with_libslurm_libs="`$PKG_CONFIG --libs 'slurm'`"
+  if test $? -ne 0; then
+    with_libslurm="no ($PKG_CONFIG failed)"
+  fi
+fi
+
+if test "x$with_libslurm" = "xuse_pkgconfig"; then
+  with_libslurm="yes"
+fi
+
+if test "x$with_libslurm" = "xyes"; then
+  SAVE_CPPFLAGS="$CPPFLAGS"
+  CPPFLAGS="$CPPFLAGS $with_libslurm_cflags"
+
+  AC_CHECK_HEADERS([slurm/slurm.h],
+    [with_libslurm="yes"],
+    [with_libslurm="no (slurm/slurm.h not found)"]
+  )
+
+  CPPFLAGS="$SAVE_CPPFLAGS"
+fi
+
+if test "x$with_libslurm" = "xyes"; then
+  SAVE_LDFLAGS="$LDFLAGS"
+  LDFLAGS="$LDFLAGS $with_libslurm_libs"
+
+  AC_CHECK_LIB([slurm], [slurm_load_jobs],
+    [with_libslurm="yes"],
+    [with_libslurm="no (symbol slurm_load_jobs not found)"]
+  )
+
+  LDFLAGS="$SAVE_LDFLAGS"
+fi
+
+if test "x$with_libslurm" = "xyes"; then
+  BUILD_WITH_LIBSLURM_CFLAGS="$with_libslurm_cflags"
+  BUILD_WITH_LIBSLURM_LIBS="$with_libslurm_libs"
+fi
+
+AC_SUBST([BUILD_WITH_LIBSLURM_CFLAGS])
+AC_SUBST([BUILD_WITH_LIBSLURM_LIBS])
+# }}}
+
+
 m4_divert_once([HELP_ENABLE], [
 collectd features:])
 # FIXME: Remove these calls to `AC_COLLECTD' and then remove that macro.
@@ -6890,6 +6967,7 @@ AC_PLUGIN([rrdtool],             [$with_librrd],              [RRDTool output pl
 AC_PLUGIN([sensors],             [$with_libsensors],          [lm_sensors statistics])
 AC_PLUGIN([serial],              [$plugin_serial],            [serial port traffic])
 AC_PLUGIN([sigrok],              [$with_libsigrok],           [sigrok acquisition sources])
+AC_PLUGIN([slurm],               [$with_libslurm],            [SLURM jobs and nodes status])
 AC_PLUGIN([smart],               [$plugin_smart],             [SMART statistics])
 AC_PLUGIN([snmp],                [$with_libnetsnmp],          [SNMP querying plugin])
 AC_PLUGIN([snmp_agent],          [$with_libnetsnmpagent],     [SNMP agent plugin])
@@ -7183,7 +7261,8 @@ AC_MSG_RESULT([    librouteros . . . . . $with_librouteros])
 AC_MSG_RESULT([    librrd  . . . . . . . $with_librrd])
 AC_MSG_RESULT([    libsensors  . . . . . $with_libsensors])
 AC_MSG_RESULT([    libsigrok   . . . . . $with_libsigrok])
+AC_MSG_RESULT([    libslurm  . . . . . . $with_libslurm])
 AC_MSG_RESULT([    libssl  . . . . . . . $with_libssl])
 AC_MSG_RESULT([    libstatgrab . . . . . $with_libstatgrab])
 AC_MSG_RESULT([    libtokyotyrant  . . . $with_libtokyotyrant])
 AC_MSG_RESULT([    libudev . . . . . . . $with_libudev])
@@ -7316,6 +7398,7 @@ AC_MSG_RESULT([    rrdtool . . . . . . . $enable_rrdtool])
 AC_MSG_RESULT([    sensors . . . . . . . $enable_sensors])
 AC_MSG_RESULT([    serial  . . . . . . . $enable_serial])
 AC_MSG_RESULT([    sigrok  . . . . . . . $enable_sigrok])
+AC_MSG_RESULT([    slurm . . . . . . . . $enable_slurm])
 AC_MSG_RESULT([    smart . . . . . . . . $enable_smart])
 AC_MSG_RESULT([    snmp  . . . . . . . . $enable_snmp])
 AC_MSG_RESULT([    snmp_agent  . . . . . $enable_snmp_agent])
diff --git a/src/slurm.c b/src/slurm.c
new file mode 100644 (file)
index 0000000..640820f
--- /dev/null
@@ -0,0 +1,228 @@
+/**
+ * collectd - src/slurm.c
+ * Copyright (C) 2018       Pablo Llopis
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; only version 2 of the License is applicable.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ * Authors:
+ *   Pablo Llopis <pablo.llopis at gmail.com>
+ **/
+
+#define _DEFAULT_SOURCE
+#define _BSD_SOURCE
+
+#include "collectd.h"
+
+#include "common.h"
+#include "plugin.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <slurm/slurm.h>
+#include <slurm/slurm_errno.h>
+#include <sys/types.h>
+#include <string.h>
+
+#define PLUGIN_NAME    "slurm"
+#define PART_NAME_SIZE 128
+
+typedef struct partition_state_st { /* per-partition tally of node and job states */
+  char name[PART_NAME_SIZE]; /* partition name, bounded copy of the name reported by SLURM */
+  /* number of nodes counted per state; the slot is the enum node_states value from slurm.h */
+  uint32_t nodes_states[NODE_STATE_END];
+  /* number of jobs counted per state; the slot is the enum job_states value from slurm.h */
+  uint32_t jobs_states[JOB_END];
+} partition_state_t;
+
+/* type-instance names for node states; entry i must describe value i of enum node_states in slurm.h */
+static const char* node_state_names[] = {
+  "unknown",
+  "down",
+  "idle",
+  "allocated",
+  "error",
+  "mixed",
+  "future",
+};
+
+/* type-instance names for job states; entry i must describe value i of enum job_states in slurm.h */
+static const char* job_state_names[] = {
+  "pending",
+  "running",
+  "suspended",
+  "complete",
+  "cancelled",
+  "failed",
+  "timeout",
+  "node_fail",
+  "preempted",
+  "boot_fail",
+  "deadline",
+  "oom",
+};
+
+/*
+ * Allocate one zero-initialized partition_state_t per partition record and
+ * copy each record's name into it.
+ *
+ * num_partitions: number of entries in `partitions`.
+ * partitions:     partition records whose names are copied.
+ *
+ * Returns the new array, which the caller releases with free(), or NULL if
+ * calloc() fails. All node and job counters start at zero.
+ */
+static partition_state_t *alloc_partition_states(uint32_t num_partitions, partition_info_t *partitions)
+{
+  partition_state_t *partition_states = calloc(num_partitions, sizeof(*partition_states));
+  if (partition_states == NULL)
+    return NULL;
+
+  /* the index has the same type as the count: no signed/unsigned comparison */
+  for (uint32_t i = 0; i < num_partitions; i++) {
+    /* a record without a name keeps the empty string left by calloc() */
+    if (partitions[i].name == NULL)
+      continue;
+    sstrncpy(partition_states[i].name, partitions[i].name, sizeof(partition_states[i].name));
+  }
+
+  return partition_states;
+}
+
+/*
+ * Look up a partition's state record by name.
+ *
+ * partitions:     array of partition state records to search.
+ * num_partitions: number of entries in `partitions`.
+ * name:           partition name to search for; may be NULL.
+ *
+ * Returns a pointer into `partitions`, or NULL when `name` is NULL or no
+ * entry matches. Names are compared over at most PART_NAME_SIZE bytes. If
+ * several entries carry the same name, the one with the highest index is
+ * returned.
+ */
+static partition_state_t *find_partition(partition_state_t *partitions, uint32_t num_partitions, char *name)
+{
+  /* strncmp() on a NULL pointer is undefined behaviour */
+  if (name == NULL)
+    return NULL;
+
+  /* walk backwards so that the first hit is the highest-indexed match */
+  for (uint32_t i = num_partitions; i-- > 0;) {
+    if (strncmp(name, partitions[i].name, PART_NAME_SIZE) == 0)
+      return &partitions[i];
+  }
+
+  return NULL;
+}
+
+/*
+ * Dispatch a single gauge reading to collectd under the PLUGIN_NAME plugin.
+ *
+ * plugin_instance and type_instance are optional: when NULL, the matching
+ * field keeps whatever VALUE_LIST_INIT put there. `type` is always copied.
+ */
+static void slurm_submit(const char *plugin_instance, const char *type,
+                         const char *type_instance, gauge_t value) {
+  value_t reading = {.gauge = value};
+  value_list_t list = VALUE_LIST_INIT;
+
+  list.values = &reading;
+  list.values_len = 1;
+
+  sstrncpy(list.plugin, PLUGIN_NAME, sizeof(list.plugin));
+  if (plugin_instance != NULL) {
+    sstrncpy(list.plugin_instance, plugin_instance, sizeof(list.plugin_instance));
+  }
+
+  sstrncpy(list.type, type, sizeof(list.type));
+  if (type_instance != NULL) {
+    sstrncpy(list.type_instance, type_instance, sizeof(list.type_instance));
+  }
+
+  plugin_dispatch_values(&list);
+}
+
+/*
+ * Submit every job-state and node-state counter of one partition as a
+ * "count" gauge. The partition name is used as the plugin instance and the
+ * state name as the type instance.
+ *
+ * The loops stop at whichever is smaller: the number of states declared by
+ * slurm.h (JOB_END / NODE_STATE_END) or the number of names in the local
+ * tables. That way a libslurm that defines more states than the tables know
+ * about cannot cause a read past the end of a table.
+ */
+static void slurm_submit_partition(partition_state_t *partition) {
+  const size_t job_names_num = sizeof(job_state_names) / sizeof(job_state_names[0]);
+  const size_t node_names_num = sizeof(node_state_names) / sizeof(node_state_names[0]);
+
+  for (size_t i = 0; i < (size_t)JOB_END && i < job_names_num; i++) {
+    slurm_submit(partition->name, "count", job_state_names[i], partition->jobs_states[i]);
+  }
+  for (size_t i = 0; i < (size_t)NODE_STATE_END && i < node_names_num; i++) {
+    slurm_submit(partition->name, "count", node_state_names[i], partition->nodes_states[i]);
+  }
+}
+
+/*
+ * Read callback: load job, node and partition information from SLURM, count
+ * how many jobs and nodes of each partition are in each base state, and
+ * submit the counters (one plugin instance per partition).
+ *
+ * Returns 0 on success and -1 if a SLURM query or the allocation of the
+ * per-partition table fails. Every buffer that was loaded is released on all
+ * paths.
+ */
+static int slurm_read(void) {
+  job_info_msg_t *job_buffer_ptr = NULL;
+  node_info_msg_t *node_buffer_ptr = NULL;
+  partition_info_msg_t *part_buffer_ptr = NULL;
+  partition_state_t *partition_states = NULL;
+  uint32_t num_partitions = 0;
+  int status = -1;
+
+  /* the first argument is a timestamp, hence (time_t)0 rather than NULL */
+  if (slurm_load_jobs((time_t)0, &job_buffer_ptr, SHOW_ALL)) {
+    ERROR("slurm_load_jobs error");
+    return -1;
+  }
+
+  if (slurm_load_node((time_t)0, &node_buffer_ptr, SHOW_ALL)) {
+    ERROR("slurm_load_node error");
+    goto free_jobs;
+  }
+
+  if (slurm_load_partitions((time_t)0, &part_buffer_ptr, 0)) {
+    ERROR("slurm_load_partitions error");
+    goto free_nodes;
+  }
+
+  /* SLURM APIs provide *non-relational* data about nodes, partitions and jobs.
+   * We allocate a data structure that relates all three together, and the
+   * following two loops fill it. The data structure is an array of
+   * partition_state_t that holds job and node state counters. */
+  num_partitions = part_buffer_ptr->record_count;
+  partition_states = alloc_partition_states(num_partitions, part_buffer_ptr->partition_array);
+  if (partition_states == NULL) {
+    ERROR("alloc_partition_states");
+    goto free_partitions;
+  }
+
+  /* fill partition_states array with per-partition job state information */
+  for (uint32_t i = 0; i < job_buffer_ptr->record_count; i++) {
+    job_info_t *job_ptr = &job_buffer_ptr->job_array[i];
+    partition_state_t *partition_state = NULL;
+
+    /* NOTE(review): a job's partition field may name more than one partition
+     * while the job is pending; such jobs would fail this lookup -- confirm
+     * against the SLURM API documentation. */
+    if (job_ptr->partition != NULL)
+      partition_state = find_partition(partition_states, num_partitions, job_ptr->partition);
+    if (partition_state == NULL) {
+      ERROR("slurm_read: cannot find partition %s from jobid %u"
+            " in partition list returned by slurm_load_partitions",
+            (job_ptr->partition != NULL) ? job_ptr->partition : "(null)",
+            (unsigned int)job_ptr->job_id);
+      continue;
+    }
+
+    /* the masked value can exceed the counter array: check before indexing */
+    unsigned int job_state = job_ptr->job_state & JOB_STATE_BASE;
+    if (job_state >= (unsigned int)JOB_END) {
+      ERROR("slurm_read: jobid %u has unexpected base state %u",
+            (unsigned int)job_ptr->job_id, job_state);
+      continue;
+    }
+    partition_state->jobs_states[job_state]++;
+  }
+
+  /* fill partition_states array with per-partition node state information */
+  for (uint32_t i = 0; i < part_buffer_ptr->record_count; i++) {
+    partition_info_t *part_ptr = &part_buffer_ptr->partition_array[i];
+    partition_state_t *partition_state = NULL;
+
+    if (part_ptr->name != NULL)
+      partition_state = find_partition(partition_states, num_partitions, part_ptr->name);
+    if (partition_state == NULL) {
+      ERROR("slurm_read: cannot find partition %s"
+            " in partition list returned by slurm_load_partitions",
+            (part_ptr->name != NULL) ? part_ptr->name : "(null)");
+      continue;
+    }
+
+    if (part_ptr->node_inx == NULL)
+      continue;
+
+    /* node_inx is read as (first, last) index pairs, terminated by -1 */
+    for (int j = 0; part_ptr->node_inx[j] != -1; j += 2) {
+      for (int32_t k = part_ptr->node_inx[j]; k <= part_ptr->node_inx[j + 1]; k++) {
+        /* never index outside the node array that was actually loaded */
+        if (k < 0 || (uint32_t)k >= node_buffer_ptr->record_count)
+          break;
+
+        node_info_t *node_ptr = &node_buffer_ptr->node_array[k];
+        /* some non-existent nodes (name is NULL) may show up as node_state FUTURE */
+        unsigned int node_state = node_ptr->node_state & NODE_STATE_BASE;
+        if (node_state >= (unsigned int)NODE_STATE_END)
+          continue;
+        partition_state->nodes_states[node_state]++;
+      }
+    }
+  }
+
+  for (uint32_t i = 0; i < num_partitions; i++)
+    slurm_submit_partition(&partition_states[i]);
+
+  status = 0;
+  free(partition_states);
+
+  /* release the buffers in reverse order of acquisition */
+free_partitions:
+  slurm_free_partition_info_msg(part_buffer_ptr);
+free_nodes:
+  slurm_free_node_info_msg(node_buffer_ptr);
+free_jobs:
+  slurm_free_job_info_msg(job_buffer_ptr);
+  return status;
+}
+
+/* Plugin entry point: registers slurm_read as the read callback for PLUGIN_NAME. */
+void module_register(void) {
+  plugin_register_read(PLUGIN_NAME, slurm_read);
+}