]> git.ipfire.org Git - thirdparty/collectd.git/commitdiff
Report node states just like sinfo.
author Pablo Llopis <pablo.llopis@cern.ch>
Mon, 1 Apr 2019 16:57:19 +0000 (18:57 +0200)
committer Pablo Llopis <pablo.llopis@cern.ch>
Wed, 29 May 2019 09:50:35 +0000 (11:50 +0200)
Do not decouple node flags from node states, as was the case up to now,
because with separate counters it is not possible to tell which node flag
corresponds to which node state. Instead, the node state name now includes
the node state flags (such as draining, non_responding, etc.).

src/collectd.conf.pod
src/slurm.c
src/types.db

index 5c489ef6f16d80245f89977495ac5e27bd7e1803..1ae0650fec4558d16ee9c54ffb48b4f0455edefd 100644 (file)
@@ -7912,8 +7912,6 @@ slurm.conf.
 Note that this plugin needs the B<Globals> option set to I<true> in order to
 function properly.
 
-=back
-
 =head2 Plugin C<smart>
 
 The C<smart> plugin collects SMART information from physical
index 276baae7ab6781cb8a3bb3c255c7e3751686a12b..eabc779101b3e460d11f2c5ab058299f3671ef03 100644 (file)
 /* this function declaration is missing in slurm.h */
 extern void slurm_free_stats_response_msg(stats_info_response_msg_t *msg);
 
+enum slurm_node_states { /* composite node states returned by
+                          * slurm_node_state(); each value is also the index
+                          * of its name in node_state_names[] and of its
+                          * counter in nodes_states_count[], so the order here
+                          * must stay in sync with node_state_names[] */
+  MAINT_NONRESP,
+  MAINT,
+  REBOOT_NONRESP,
+  REBOOT,
+  DRAINING_MAINT,
+  DRAINING_REBOOT,
+  DRAINING_POWERUP,
+  DRAINING_POWERDOWN,
+  DRAINING_NONRESP,
+  DRAINING,
+  DRAINED_MAINT,
+  DRAINED_REBOOT,
+  DRAINED_POWERUP,
+  DRAINED_POWERDOWN,
+  DRAINED_NONRESP,
+  DRAINED,
+  FAILING_NONRESP,
+  FAILING,
+  FAIL_NONRESP,
+  FAIL,
+  CANCEL_REBOOT,
+  POWER_DOWN,
+  POWER_UP,
+  DOWN_MAINT,
+  DOWN_REBOOT,
+  DOWN_POWERUP,
+  DOWN_POWERDOWN,
+  DOWN_NONRESP,
+  DOWN,
+  ALLOCATED_MAINT,
+  ALLOCATED_REBOOT,
+  ALLOCATED_POWERUP,
+  ALLOCATED_POWERDOWN,
+  ALLOCATED_NONRESP,
+  ALLOCATED_COMP,
+  ALLOCATED,
+  COMPLETING_MAINT,
+  COMPLETING_REBOOT,
+  COMPLETING_POWERUP,
+  COMPLETING_POWERDOWN,
+  COMPLETING_NONRESP,
+  COMPLETING,
+  IDLE_MAINT,
+  IDLE_REBOOT,
+  IDLE_POWERUP,
+  IDLE_POWERDOWN,
+  IDLE_NONRESP,
+  PERFCTRS,
+  RESERVED,
+  IDLE,
+  MIXED_MAINT,
+  MIXED_REBOOT,
+  MIXED_POWERUP,
+  MIXED_POWERDOWN,
+  MIXED_NONRESP,
+  MIXED,
+  FUTURE_MAINT,
+  FUTURE_REBOOT,
+  FUTURE_POWERUP,
+  FUTURE_POWERDOWN,
+  FUTURE_NONRESP,
+  FUTURE,
+  RESUME,
+  UNKNOWN_NONRESP,
+  UNKNOWN,
+  UNKNOWN2 /* fallback of slurm_node_state(); its name entry is "?" */
+};
+/* Names of the composite node states, handed to slurm_submit() together with
+ * each state's node count. The entry at index i names enum slurm_node_states
+ * value i, so both lists must keep the same order and length
+ * (NUM_NODE_STATES is derived from the size of this array). */
+char *node_state_names[] = {"MAINT_NONRESP",
+                            "MAINT",
+                            "REBOOT_NONRESP",
+                            "REBOOT",
+                            "DRAINING_MAINT",
+                            "DRAINING_REBOOT",
+                            "DRAINING_POWERUP",
+                            "DRAINING_POWERDOWN",
+                            "DRAINING_NONRESP",
+                            "DRAINING",
+                            "DRAINED_MAINT",
+                            "DRAINED_REBOOT",
+                            "DRAINED_POWERUP",
+                            "DRAINED_POWERDOWN",
+                            "DRAINED_NONRESP",
+                            "DRAINED",
+                            "FAILING_NONRESP",
+                            "FAILING",
+                            "FAIL_NONRESP",
+                            "FAIL",
+                            "CANCEL_REBOOT",
+                            "POWER_DOWN",
+                            "POWER_UP",
+                            "DOWN_MAINT",
+                            "DOWN_REBOOT",
+                            "DOWN_POWERUP",
+                            "DOWN_POWERDOWN",
+                            "DOWN_NONRESP",
+                            "DOWN",
+                            "ALLOCATED_MAINT",
+                            "ALLOCATED_REBOOT",
+                            "ALLOCATED_POWERUP",
+                            "ALLOCATED_POWERDOWN",
+                            "ALLOCATED_NONRESP",
+                            "ALLOCATED_COMP",
+                            "ALLOCATED",
+                            "COMPLETING_MAINT",
+                            "COMPLETING_REBOOT",
+                            "COMPLETING_POWERUP",
+                            "COMPLETING_POWERDOWN",
+                            "COMPLETING_NONRESP",
+                            "COMPLETING",
+                            "IDLE_MAINT",
+                            "IDLE_REBOOT",
+                            "IDLE_POWERUP",
+                            "IDLE_POWERDOWN",
+                            "IDLE_NONRESP",
+                            "PERFCTRS",
+                            "RESERVED",
+                            "IDLE",
+                            "MIXED_MAINT",
+                            "MIXED_REBOOT",
+                            "MIXED_POWERUP",
+                            "MIXED_POWERDOWN",
+                            "MIXED_NONRESP",
+                            "MIXED",
+                            "FUTURE_MAINT",
+                            "FUTURE_REBOOT",
+                            "FUTURE_POWERUP",
+                            "FUTURE_POWERDOWN",
+                            "FUTURE_NONRESP",
+                            "FUTURE",
+                            "RESUME",
+                            "UNKNOWN_NONRESP",
+                            "UNKNOWN",
+                            "?"};
+
+/* based on src/common/slurm_protocol_defs.c node_state_string function
+ *
+ * Classifies a raw node state word into one value of enum slurm_node_states.
+ * inx is split into a base state (inx & NODE_STATE_BASE) and a set of flag
+ * bits. The checks below run in a fixed priority order and the first match
+ * wins, so every node is counted under exactly one composite state.
+ *
+ * Returns the enum value as uint8_t; UNKNOWN2 is returned when no check
+ * matches. */
+uint8_t slurm_node_state(uint32_t inx) {
+  int base = (inx & NODE_STATE_BASE);
+  bool comp_flag = (inx & NODE_STATE_COMPLETING);
+  bool drain_flag = (inx & NODE_STATE_DRAIN);
+  bool fail_flag = (inx & NODE_STATE_FAIL);
+  bool maint_flag = (inx & NODE_STATE_MAINT);
+  bool net_flag = (inx & NODE_STATE_NET);
+  bool reboot_flag = (inx & NODE_STATE_REBOOT);
+  bool res_flag = (inx & NODE_STATE_RES);
+  bool resume_flag = (inx & NODE_RESUME);
+  bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND);
+  bool power_down_flag = (inx & NODE_STATE_POWER_SAVE);
+  bool power_up_flag = (inx & NODE_STATE_POWER_UP);
+
+  if (maint_flag) {
+    if (drain_flag || (base == NODE_STATE_ALLOCATED) ||
+        (base == NODE_STATE_DOWN) || (base == NODE_STATE_MIXED))
+      ; /* not a plain MAINT state: fall through to the *_MAINT checks below */
+    else if (no_resp_flag)
+      return MAINT_NONRESP;
+    else
+      return MAINT;
+  }
+  if (reboot_flag) {
+    if ((base == NODE_STATE_ALLOCATED) || (base == NODE_STATE_MIXED))
+      ; /* not a plain REBOOT state: fall through to the checks below */
+    else if (no_resp_flag)
+      return REBOOT_NONRESP;
+    else
+      return REBOOT;
+  }
+  if (drain_flag) {
+    if (comp_flag || (base == NODE_STATE_ALLOCATED) ||
+        (base == NODE_STATE_MIXED)) { /* DRAINING_* variants */
+      if (maint_flag)
+        return DRAINING_MAINT;
+      if (reboot_flag)
+        return DRAINING_REBOOT;
+      if (power_up_flag)
+        return DRAINING_POWERUP;
+      if (power_down_flag)
+        return DRAINING_POWERDOWN;
+      if (no_resp_flag)
+        return DRAINING_NONRESP;
+      return DRAINING;
+    } else { /* DRAINED_* variants */
+      if (maint_flag)
+        return DRAINED_MAINT;
+      if (reboot_flag)
+        return DRAINED_REBOOT;
+      if (power_up_flag)
+        return DRAINED_POWERUP;
+      if (power_down_flag)
+        return DRAINED_POWERDOWN;
+      if (no_resp_flag)
+        return DRAINED_NONRESP;
+      return DRAINED;
+    }
+  }
+  if (fail_flag) {
+    if (comp_flag || (base == NODE_STATE_ALLOCATED)) {
+      if (no_resp_flag)
+        return FAILING_NONRESP;
+      return FAILING;
+    } else {
+      if (no_resp_flag)
+        return FAIL_NONRESP;
+      return FAIL;
+    }
+  }
+
+  if (inx == NODE_STATE_CANCEL_REBOOT) /* whole-word equality, not a bit test */
+    return CANCEL_REBOOT;
+  if (inx == NODE_STATE_POWER_SAVE)
+    return POWER_DOWN;
+  if (inx == NODE_STATE_POWER_UP)
+    return POWER_UP;
+  if (base == NODE_STATE_DOWN) {
+    if (maint_flag)
+      return DOWN_MAINT;
+    if (reboot_flag)
+      return DOWN_REBOOT;
+    if (power_up_flag)
+      return DOWN_POWERUP;
+    if (power_down_flag)
+      return DOWN_POWERDOWN;
+    if (no_resp_flag)
+      return DOWN_NONRESP;
+    return DOWN;
+  }
+
+  if (base == NODE_STATE_ALLOCATED) {
+    if (maint_flag)
+      return ALLOCATED_MAINT;
+    if (reboot_flag)
+      return ALLOCATED_REBOOT;
+    if (power_up_flag)
+      return ALLOCATED_POWERUP;
+    if (power_down_flag)
+      return ALLOCATED_POWERDOWN;
+    if (no_resp_flag)
+      return ALLOCATED_NONRESP;
+    if (comp_flag)
+      return ALLOCATED_COMP;
+    return ALLOCATED;
+  }
+  if (comp_flag) { /* base is neither DOWN nor ALLOCATED at this point */
+    if (maint_flag)
+      return COMPLETING_MAINT;
+    if (reboot_flag)
+      return COMPLETING_REBOOT;
+    if (power_up_flag)
+      return COMPLETING_POWERUP;
+    if (power_down_flag)
+      return COMPLETING_POWERDOWN;
+    if (no_resp_flag)
+      return COMPLETING_NONRESP;
+    return COMPLETING;
+  }
+  if (base == NODE_STATE_IDLE) {
+    if (maint_flag)
+      return IDLE_MAINT;
+    if (reboot_flag)
+      return IDLE_REBOOT;
+    if (power_up_flag)
+      return IDLE_POWERUP;
+    if (power_down_flag)
+      return IDLE_POWERDOWN;
+    if (no_resp_flag)
+      return IDLE_NONRESP;
+    if (net_flag)
+      return PERFCTRS;
+    if (res_flag)
+      return RESERVED;
+    return IDLE;
+  }
+  if (base == NODE_STATE_MIXED) {
+    if (maint_flag)
+      return MIXED_MAINT;
+    if (reboot_flag)
+      return MIXED_REBOOT;
+    if (power_up_flag)
+      return MIXED_POWERUP;
+    if (power_down_flag)
+      return MIXED_POWERDOWN;
+    if (no_resp_flag)
+      return MIXED_NONRESP;
+    return MIXED;
+  }
+  if (base == NODE_STATE_FUTURE) {
+    if (maint_flag)
+      return FUTURE_MAINT;
+    if (reboot_flag)
+      return FUTURE_REBOOT;
+    if (power_up_flag)
+      return FUTURE_POWERUP;
+    if (power_down_flag)
+      return FUTURE_POWERDOWN;
+    if (no_resp_flag)
+      return FUTURE_NONRESP;
+    return FUTURE;
+  }
+  if (resume_flag)
+    return RESUME;
+  if (base == NODE_STATE_UNKNOWN) {
+    if (no_resp_flag)
+      return UNKNOWN_NONRESP;
+    return UNKNOWN;
+  }
+  return UNKNOWN2; /* nothing matched; reported under the name "?" */
+}
+/* number of entries in node_state_names[]; the sizeof division yields size_t */
+#define NUM_NODE_STATES (sizeof(node_state_names) / sizeof(node_state_names[0]))
+
 typedef struct partition_state_st {
   char name[PART_NAME_SIZE];
-  /* counts nodes states indexed by enum node_states in slurm.h */
-  uint32_t nodes_states[NODE_STATE_END];
+  uint32_t nodes_states_count[NUM_NODE_STATES]; /* counts nodes per composite
+                                                 * state, indexed by enum
+                                                 * slurm_node_states */
   /* counts jobs states indexed by enum job_states in slurm.h */
-  uint32_t jobs_states[JOB_END];
-  /* other node flags */
-  uint32_t drain;
-  uint32_t completing;
-  uint32_t no_respond;
-  uint32_t power_save;
-  uint32_t fail;
+  uint32_t jobs_states_count[JOB_END];
 } partition_state_t;
 
-/* based on enum node_states from slurm.h */
-static const char *node_state_names[] = {
-    "unknown", "down", "idle", "allocated", "error", "mixed", "future",
-};
-
 /* based on enum job_states from slurm.h */
 static const char *job_state_names[] = {
     "pending", "running",   "suspended", "complete",  "cancelled", "failed",
@@ -115,20 +414,12 @@ static void slurm_submit(const char *plugin_instance, const char *type,
 static void slurm_submit_partition(partition_state_t *partition) {
   for (int i = 0; i < JOB_END; i++) {
     slurm_submit(partition->name, "slurm_job_state", job_state_names[i],
-                 partition->jobs_states[i]);
+                 partition->jobs_states_count[i]);
   }
-  for (int i = 0; i < NODE_STATE_END; i++) {
+  for (int i = 0; i < NUM_NODE_STATES; i++) { /* NOTE(review): int vs size_t */
     slurm_submit(partition->name, "slurm_node_state", node_state_names[i],
-                 partition->nodes_states[i]);
+                 partition->nodes_states_count[i]);
   }
-  slurm_submit(partition->name, "slurm_node_flag", "drain", partition->drain);
-  slurm_submit(partition->name, "slurm_node_flag", "completing",
-               partition->completing);
-  slurm_submit(partition->name, "slurm_node_flag", "no_respond",
-               partition->no_respond);
-  slurm_submit(partition->name, "slurm_node_flag", "power_save",
-               partition->power_save);
-  slurm_submit(partition->name, "slurm_node_flag", "fail", partition->fail);
 }
 
 static void slurm_submit_stats(stats_info_response_msg_t *stats_resp) {
@@ -253,7 +544,7 @@ static int slurm_read(void) {
     }
 
     uint8_t job_state = job_ptr->job_state & JOB_STATE_BASE;
-    partition_state->jobs_states[job_state]++;
+    partition_state->jobs_states_count[job_state]++;
   }
 
   /* fill partition_states array with per-partition node state information */
@@ -276,18 +567,8 @@ static int slurm_read(void) {
         node_ptr = &node_buffer_ptr->node_array[k];
         /* some non-existent nodes (name is NULL) may show up as node_state
          * FUTURE */
-        uint8_t node_state = node_ptr->node_state & NODE_STATE_BASE;
-        partition_state->nodes_states[node_state]++;
-        if (node_ptr->node_state & NODE_STATE_DRAIN)
-          partition_state->drain++;
-        if (node_ptr->node_state & NODE_STATE_COMPLETING)
-          partition_state->completing++;
-        if (node_ptr->node_state & NODE_STATE_NO_RESPOND)
-          partition_state->no_respond++;
-        if (node_ptr->node_state & NODE_STATE_POWER_SAVE)
-          partition_state->power_save++;
-        if (node_ptr->node_state & NODE_STATE_FAIL)
-          partition_state->fail++;
+        uint8_t node_state = slurm_node_state(node_ptr->node_state);
+        partition_state->nodes_states_count[node_state]++;
       }
     }
   }
index 0762671beaec34ac4e8d5a4b01123c5a3f50b0e5..acc0a6429d5177246317954d116502d55a916254 100644 (file)
@@ -233,7 +233,6 @@ signal_power            value:GAUGE:U:0
 signal_quality          value:GAUGE:0:U
 slurm_job_state         value:GAUGE:0:U
 slurm_node_state        value:GAUGE:0:U
-slurm_node_flag         value:GAUGE:0:U
 slurm_stats             value:GAUGE:0:U
 smart_attribute         current:GAUGE:0:255, worst:GAUGE:0:255, threshold:GAUGE:0:255, pretty:GAUGE:0:U
 smart_badsectors        value:GAUGE:0:U