From: Pablo Llopis Date: Mon, 1 Apr 2019 16:57:19 +0000 (+0200) Subject: Report node states just like sinfo. X-Git-Tag: collectd-5.11.0~3^2~5 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=396b639476ca7c4ae3817ec70bec6e1914c78e54;p=thirdparty%2Fcollectd.git Report node states just like sinfo. Do not decouple node flags from node states, as was the case up to now, because doing so makes it impossible to tell which node flag corresponds to which node state. Instead, the node state name now includes the node state flags (such as draining, non_responding, etc.). --- diff --git a/src/collectd.conf.pod b/src/collectd.conf.pod index 5c489ef6f..1ae0650fe 100644 --- a/src/collectd.conf.pod +++ b/src/collectd.conf.pod @@ -7912,8 +7912,6 @@ slurm.conf. Note that this plugin needs the B<Globals> option set to I<true> in order to function properly. -=back - =head2 Plugin C<smart> The C<smart> plugin collects SMART information from physical diff --git a/src/slurm.c b/src/slurm.c index 276baae7a..eabc77910 100644 --- a/src/slurm.c +++ b/src/slurm.c @@ -40,25 +40,324 @@ /* this function declaration is missing in slurm.h */ extern void slurm_free_stats_response_msg(stats_info_response_msg_t *msg); +enum slurm_node_states { + MAINT_NONRESP, + MAINT, + REBOOT_NONRESP, + REBOOT, + DRAINING_MAINT, + DRAINING_REBOOT, + DRAINING_POWERUP, + DRAINING_POWERDOWN, + DRAINING_NONRESP, + DRAINING, + DRAINED_MAINT, + DRAINED_REBOOT, + DRAINED_POWERUP, + DRAINED_POWERDOWN, + DRAINED_NONRESP, + DRAINED, + FAILING_NONRESP, + FAILING, + FAIL_NONRESP, + FAIL, + CANCEL_REBOOT, + POWER_DOWN, + POWER_UP, + DOWN_MAINT, + DOWN_REBOOT, + DOWN_POWERUP, + DOWN_POWERDOWN, + DOWN_NONRESP, + DOWN, + ALLOCATED_MAINT, + ALLOCATED_REBOOT, + ALLOCATED_POWERUP, + ALLOCATED_POWERDOWN, + ALLOCATED_NONRESP, + ALLOCATED_COMP, + ALLOCATED, + COMPLETING_MAINT, + COMPLETING_REBOOT, + COMPLETING_POWERUP, + COMPLETING_POWERDOWN, + COMPLETING_NONRESP, + COMPLETING, + IDLE_MAINT, + IDLE_REBOOT, + IDLE_POWERUP, + IDLE_POWERDOWN, + IDLE_NONRESP, 
+ PERFCTRS, + RESERVED, + IDLE, + MIXED_MAINT, + MIXED_REBOOT, + MIXED_POWERUP, + MIXED_POWERDOWN, + MIXED_NONRESP, + MIXED, + FUTURE_MAINT, + FUTURE_REBOOT, + FUTURE_POWERUP, + FUTURE_POWERDOWN, + FUTURE_NONRESP, + FUTURE, + RESUME, + UNKNOWN_NONRESP, + UNKNOWN, + UNKNOWN2 +}; + +char *node_state_names[] = {"MAINT_NONRESP", + "MAINT", + "REBOOT_NONRESP", + "REBOOT", + "DRAINING_MAINT", + "DRAINING_REBOOT", + "DRAINING_POWERUP", + "DRAINING_POWERDOWN", + "DRAINING_NONRESP", + "DRAINING", + "DRAINED_MAINT", + "DRAINED_REBOOT", + "DRAINED_POWERUP", + "DRAINED_POWERDOWN", + "DRAINED_NONRESP", + "DRAINED", + "FAILING_NONRESP", + "FAILING", + "FAIL_NONRESP", + "FAIL", + "CANCEL_REBOOT", + "POWER_DOWN", + "POWER_UP", + "DOWN_MAINT", + "DOWN_REBOOT", + "DOWN_POWERUP", + "DOWN_POWERDOWN", + "DOWN_NONRESP", + "DOWN", + "ALLOCATED_MAINT", + "ALLOCATED_REBOOT", + "ALLOCATED_POWERUP", + "ALLOCATED_POWERDOWN", + "ALLOCATED_NONRESP", + "ALLOCATED_COMP", + "ALLOCATED", + "COMPLETING_MAINT", + "COMPLETING_REBOOT", + "COMPLETING_POWERUP", + "COMPLETING_POWERDOWN", + "COMPLETING_NONRESP", + "COMPLETING", + "IDLE_MAINT", + "IDLE_REBOOT", + "IDLE_POWERUP", + "IDLE_POWERDOWN", + "IDLE_NONRESP", + "PERFCTRS", + "RESERVED", + "IDLE", + "MIXED_MAINT", + "MIXED_REBOOT", + "MIXED_POWERUP", + "MIXED_POWERDOWN", + "MIXED_NONRESP", + "MIXED", + "FUTURE_MAINT", + "FUTURE_REBOOT", + "FUTURE_POWERUP", + "FUTURE_POWERDOWN", + "FUTURE_NONRESP", + "FUTURE", + "RESUME", + "UNKNOWN_NONRESP", + "UNKNOWN", + "?"}; + +/* based on src/common/slurm_protocol_defs.c node_state_string function */ +uint8_t slurm_node_state(uint32_t inx) { + int base = (inx & NODE_STATE_BASE); + bool comp_flag = (inx & NODE_STATE_COMPLETING); + bool drain_flag = (inx & NODE_STATE_DRAIN); + bool fail_flag = (inx & NODE_STATE_FAIL); + bool maint_flag = (inx & NODE_STATE_MAINT); + bool net_flag = (inx & NODE_STATE_NET); + bool reboot_flag = (inx & NODE_STATE_REBOOT); + bool res_flag = (inx & NODE_STATE_RES); + bool resume_flag = 
(inx & NODE_RESUME); + bool no_resp_flag = (inx & NODE_STATE_NO_RESPOND); + bool power_down_flag = (inx & NODE_STATE_POWER_SAVE); + bool power_up_flag = (inx & NODE_STATE_POWER_UP); + + if (maint_flag) { + if (drain_flag || (base == NODE_STATE_ALLOCATED) || + (base == NODE_STATE_DOWN) || (base == NODE_STATE_MIXED)) + ; + else if (no_resp_flag) + return MAINT_NONRESP; + else + return MAINT; + } + if (reboot_flag) { + if ((base == NODE_STATE_ALLOCATED) || (base == NODE_STATE_MIXED)) + ; + else if (no_resp_flag) + return REBOOT_NONRESP; + else + return REBOOT; + } + if (drain_flag) { + if (comp_flag || (base == NODE_STATE_ALLOCATED) || + (base == NODE_STATE_MIXED)) { + if (maint_flag) + return DRAINING_MAINT; + if (reboot_flag) + return DRAINING_REBOOT; + if (power_up_flag) + return DRAINING_POWERUP; + if (power_down_flag) + return DRAINING_POWERDOWN; + if (no_resp_flag) + return DRAINING_NONRESP; + return DRAINING; + } else { + if (maint_flag) + return DRAINED_MAINT; + if (reboot_flag) + return DRAINED_REBOOT; + if (power_up_flag) + return DRAINED_POWERUP; + if (power_down_flag) + return DRAINED_POWERDOWN; + if (no_resp_flag) + return DRAINED_NONRESP; + return DRAINED; + } + } + if (fail_flag) { + if (comp_flag || (base == NODE_STATE_ALLOCATED)) { + if (no_resp_flag) + return FAILING_NONRESP; + return FAILING; + } else { + if (no_resp_flag) + return FAIL_NONRESP; + return FAIL; + } + } + + if (inx == NODE_STATE_CANCEL_REBOOT) + return CANCEL_REBOOT; + if (inx == NODE_STATE_POWER_SAVE) + return POWER_DOWN; + if (inx == NODE_STATE_POWER_UP) + return POWER_UP; + if (base == NODE_STATE_DOWN) { + if (maint_flag) + return DOWN_MAINT; + if (reboot_flag) + return DOWN_REBOOT; + if (power_up_flag) + return DOWN_POWERUP; + if (power_down_flag) + return DOWN_POWERDOWN; + if (no_resp_flag) + return DOWN_NONRESP; + return DOWN; + } + + if (base == NODE_STATE_ALLOCATED) { + if (maint_flag) + return ALLOCATED_MAINT; + if (reboot_flag) + return ALLOCATED_REBOOT; + if (power_up_flag) 
+ return ALLOCATED_POWERUP; + if (power_down_flag) + return ALLOCATED_POWERDOWN; + if (no_resp_flag) + return ALLOCATED_NONRESP; + if (comp_flag) + return ALLOCATED_COMP; + return ALLOCATED; + } + if (comp_flag) { + if (maint_flag) + return COMPLETING_MAINT; + if (reboot_flag) + return COMPLETING_REBOOT; + if (power_up_flag) + return COMPLETING_POWERUP; + if (power_down_flag) + return COMPLETING_POWERDOWN; + if (no_resp_flag) + return COMPLETING_NONRESP; + return COMPLETING; + } + if (base == NODE_STATE_IDLE) { + if (maint_flag) + return IDLE_MAINT; + if (reboot_flag) + return IDLE_REBOOT; + if (power_up_flag) + return IDLE_POWERUP; + if (power_down_flag) + return IDLE_POWERDOWN; + if (no_resp_flag) + return IDLE_NONRESP; + if (net_flag) + return PERFCTRS; + if (res_flag) + return RESERVED; + return IDLE; + } + if (base == NODE_STATE_MIXED) { + if (maint_flag) + return MIXED_MAINT; + if (reboot_flag) + return MIXED_REBOOT; + if (power_up_flag) + return MIXED_POWERUP; + if (power_down_flag) + return MIXED_POWERDOWN; + if (no_resp_flag) + return MIXED_NONRESP; + return MIXED; + } + if (base == NODE_STATE_FUTURE) { + if (maint_flag) + return FUTURE_MAINT; + if (reboot_flag) + return FUTURE_REBOOT; + if (power_up_flag) + return FUTURE_POWERUP; + if (power_down_flag) + return FUTURE_POWERDOWN; + if (no_resp_flag) + return FUTURE_NONRESP; + return FUTURE; + } + if (resume_flag) + return RESUME; + if (base == NODE_STATE_UNKNOWN) { + if (no_resp_flag) + return UNKNOWN_NONRESP; + return UNKNOWN; + } + return UNKNOWN2; +} + +#define NUM_NODE_STATES (sizeof(node_state_names) / sizeof(node_state_names[0])) + typedef struct partition_state_st { char name[PART_NAME_SIZE]; - /* counts nodes states indexed by enum node_states in slurm.h */ - uint32_t nodes_states[NODE_STATE_END]; + uint32_t nodes_states_count[NUM_NODE_STATES]; /* counts jobs states indexed by enum job_states in slurm.h */ - uint32_t jobs_states[JOB_END]; - /* other node flags */ - uint32_t drain; - uint32_t 
completing; - uint32_t no_respond; - uint32_t power_save; - uint32_t fail; + uint32_t jobs_states_count[JOB_END]; } partition_state_t; -/* based on enum node_states from slurm.h */ -static const char *node_state_names[] = { - "unknown", "down", "idle", "allocated", "error", "mixed", "future", -}; - /* based on enum job_states from slurm.h */ static const char *job_state_names[] = { "pending", "running", "suspended", "complete", "cancelled", "failed", @@ -115,20 +414,12 @@ static void slurm_submit(const char *plugin_instance, const char *type, static void slurm_submit_partition(partition_state_t *partition) { for (int i = 0; i < JOB_END; i++) { slurm_submit(partition->name, "slurm_job_state", job_state_names[i], - partition->jobs_states[i]); + partition->jobs_states_count[i]); } - for (int i = 0; i < NODE_STATE_END; i++) { + for (int i = 0; i < NUM_NODE_STATES; i++) { slurm_submit(partition->name, "slurm_node_state", node_state_names[i], - partition->nodes_states[i]); + partition->nodes_states_count[i]); } - slurm_submit(partition->name, "slurm_node_flag", "drain", partition->drain); - slurm_submit(partition->name, "slurm_node_flag", "completing", - partition->completing); - slurm_submit(partition->name, "slurm_node_flag", "no_respond", - partition->no_respond); - slurm_submit(partition->name, "slurm_node_flag", "power_save", - partition->power_save); - slurm_submit(partition->name, "slurm_node_flag", "fail", partition->fail); } static void slurm_submit_stats(stats_info_response_msg_t *stats_resp) { @@ -253,7 +544,7 @@ static int slurm_read(void) { } uint8_t job_state = job_ptr->job_state & JOB_STATE_BASE; - partition_state->jobs_states[job_state]++; + partition_state->jobs_states_count[job_state]++; } /* fill partition_states array with per-partition node state information */ @@ -276,18 +567,8 @@ static int slurm_read(void) { node_ptr = &node_buffer_ptr->node_array[k]; /* some non-existant nodes (name is NULL) may show up as node_state * FUTURE */ - uint8_t 
node_state = node_ptr->node_state & NODE_STATE_BASE; - partition_state->nodes_states[node_state]++; - if (node_ptr->node_state & NODE_STATE_DRAIN) - partition_state->drain++; - if (node_ptr->node_state & NODE_STATE_COMPLETING) - partition_state->completing++; - if (node_ptr->node_state & NODE_STATE_NO_RESPOND) - partition_state->no_respond++; - if (node_ptr->node_state & NODE_STATE_POWER_SAVE) - partition_state->power_save++; - if (node_ptr->node_state & NODE_STATE_FAIL) - partition_state->fail++; + uint8_t node_state = slurm_node_state(node_ptr->node_state); + partition_state->nodes_states_count[node_state]++; } } } diff --git a/src/types.db b/src/types.db index 0762671be..acc0a6429 100644 --- a/src/types.db +++ b/src/types.db @@ -233,7 +233,6 @@ signal_power value:GAUGE:U:0 signal_quality value:GAUGE:0:U slurm_job_state value:GAUGE:0:U slurm_node_state value:GAUGE:0:U -slurm_node_flag value:GAUGE:0:U slurm_stats value:GAUGE:0:U smart_attribute current:GAUGE:0:255, worst:GAUGE:0:255, threshold:GAUGE:0:255, pretty:GAUGE:0:U smart_badsectors value:GAUGE:0:U