git.ipfire.org Git - thirdparty/samba.git/commitdiff
ctdb-tool: Add UNKNOWN pseudo state
author: Vinit Agnihotri <vagnihotri@ddn.com>
Tue, 26 Apr 2022 07:20:21 +0000 (17:20 +1000)
committer: Amitay Isaacs <amitay@samba.org>
Tue, 28 Jun 2022 09:24:31 +0000 (09:24 +0000)
When a node is starting, CTDB reports remote nodes as unhealthy by
default.  This can be misleading.

To hide this, report an "UNKNOWN" pseudo state when a remote node is
not disconnected and the runstate is less than or equal to
"FIRST_RECOVERY".

Signed-off-by: Vinit Agnihotri <vagnihotri@ddn.com>
Signed-off-by: Martin Schwenke <martin@meltin.net>
Reviewed-by: Amitay Isaacs <amitay@gmail.com>
12 files changed:
ctdb/tests/UNIT/tool/ctdb.nodestatus.001.sh
ctdb/tests/UNIT/tool/ctdb.nodestatus.002.sh
ctdb/tests/UNIT/tool/ctdb.nodestatus.003.sh
ctdb/tests/UNIT/tool/ctdb.nodestatus.004.sh
ctdb/tests/UNIT/tool/ctdb.nodestatus.005.sh
ctdb/tests/UNIT/tool/ctdb.nodestatus.006.sh
ctdb/tests/UNIT/tool/ctdb.status.001.sh
ctdb/tests/UNIT/tool/ctdb.status.002.sh
ctdb/tests/scripts/integration.bash
ctdb/tools/ctdb.c
ctdb/tools/ctdb_lvs
ctdb/tools/ctdb_natgw

index 2217afcc0b983ad33c6b7cd131dbe1bc29a5cc53..3c754e2a838f3cdecdae7cce3646b2b9adbe2777 100755 (executable)
@@ -25,9 +25,9 @@ EOF
 simple_test all
 
 required_result 0 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|0|0|0|0|0|N|
-|1|192.168.20.42|0|0|0|0|0|0|0|N|
-|2|192.168.20.43|0|0|0|0|0|0|0|Y|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|0|0|0|0|0|N|
+|1|192.168.20.42|0|0|0|0|0|0|0|0|N|
+|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
 EOF
 simple_test -X all
index c1706fd98e750dda29db301b4d14e3f8e5e6d9e1..a5981dffa5223801365375b1e98d1065cd8cb208 100755 (executable)
@@ -25,9 +25,9 @@ EOF
 simple_test all
 
 required_result 1 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|0|0|0|0|0|N|
-|1|192.168.20.42|1|0|0|0|0|1|0|N|
-|2|192.168.20.43|0|0|0|0|0|0|0|Y|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|0|0|0|0|0|N|
+|1|192.168.20.42|1|0|0|0|0|0|1|0|N|
+|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
 EOF
 simple_test -X all
index 5912e6501aa071c2cee2500c575f43c5c7c404ac..52c2691876fff3ce3344684770aa69d37086f82c 100755 (executable)
@@ -25,9 +25,9 @@ EOF
 simple_test all
 
 required_result 2 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|0|1|0|0|0|N|
-|1|192.168.20.42|0|0|0|0|0|0|0|N|
-|2|192.168.20.43|0|0|0|0|0|0|0|Y|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|0|1|0|0|0|N|
+|1|192.168.20.42|0|0|0|0|0|0|0|0|N|
+|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
 EOF
 simple_test -X all
index 01ccd5129b41ae1470f3cb78564ba79be8e65f7e..c060fb98b90f82670bb933a3571c6da0105c9a0d 100755 (executable)
@@ -22,7 +22,7 @@ EOF
 simple_test
 
 required_result 0 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|2|192.168.20.43|0|0|0|0|0|0|0|Y|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|2|192.168.20.43|0|0|0|0|0|0|0|0|Y|
 EOF
 simple_test -X
index 0cd24ba9cab663e786b466682e964d46b4646b70..59f6905b0593f1ef3aa765b2ac0d5c64069449b4 100755 (executable)
@@ -22,7 +22,7 @@ EOF
 simple_test 0
 
 required_result 2 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|0|1|0|0|0|N|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|0|1|0|0|0|N|
 EOF
 simple_test -X 0
index ec189fc4690e0e0b7e3aeb54da6b27519d0fabb1..7d744510d53a7019e825dfeae2592b032e07aced 100755 (executable)
@@ -22,8 +22,8 @@ EOF
 simple_test 0
 
 required_result 36 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|1|0|1|1|0|N|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|1|0|1|1|0|N|
 EOF
 simple_test -X 0
 
index 0742bd870a294bc405189c698039b29b69811802..62c1dc7c98ac16ea5cbdf42215fc1e78fdb6b03b 100755 (executable)
@@ -38,9 +38,9 @@ EOF
 simple_test
 
 required_result 0 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|0|0|0|0|0|Y|
-|1|192.168.20.42|0|0|0|0|0|0|0|N|
-|2|192.168.20.43|0|0|0|0|0|0|0|N|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|0|0|0|0|0|Y|
+|1|192.168.20.42|0|0|0|0|0|0|0|0|N|
+|2|192.168.20.43|0|0|0|0|0|0|0|0|N|
 EOF
 simple_test -X
index 259e91438db3b1151686f14415ee0b204490dfd2..0cce4435ee4346a539778b6dbed9cd60cf657f25 100755 (executable)
@@ -38,9 +38,9 @@ EOF
 simple_test
 
 required_result 0 <<EOF
-|Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
-|0|192.168.20.41|0|0|0|1|0|0|0|N|
-|1|192.168.20.42|0|0|0|0|0|0|0|Y|
-|2|192.168.20.43|0|0|0|0|0|0|0|N|
+|Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode|
+|0|192.168.20.41|0|0|0|0|1|0|0|0|N|
+|1|192.168.20.42|0|0|0|0|0|0|0|0|Y|
+|2|192.168.20.43|0|0|0|0|0|0|0|0|N|
 EOF
 simple_test -X
index eb3db1e18495ab7a361d413b03c438d60144c783..dbf9af0b346cb85ca1be363b2e3211f231f8d709 100644 (file)
@@ -446,16 +446,16 @@ node_has_status ()
 
        local bits
        case "$status" in
-       unhealthy)    bits="?|?|?|1|*" ;;
-       healthy)      bits="?|?|?|0|*" ;;
+       unhealthy)    bits="?|?|?|?|1|*" ;;
+       healthy)      bits="?|?|?|?|0|*" ;;
        disconnected) bits="1|*" ;;
        connected)    bits="0|*" ;;
-       banned)       bits="?|1|*" ;;
-       unbanned)     bits="?|0|*" ;;
-       disabled)     bits="?|?|1|*" ;;
-       enabled)      bits="?|?|0|*" ;;
-       stopped)      bits="?|?|?|?|1|*" ;;
-       notstopped)   bits="?|?|?|?|0|*" ;;
+       banned)       bits="?|?|1|*" ;;
+       unbanned)     bits="?|?|0|*" ;;
+       disabled)     bits="?|?|?|1|*" ;;
+       enabled)      bits="?|?|?|0|*" ;;
+       stopped)      bits="?|?|?|?|?|1|*" ;;
+       notstopped)   bits="?|?|?|?|?|0|*" ;;
        *)
                echo "node_has_status: unknown status \"$status\""
                return 1
index b5303289f38c788396c6d6d49d54c45037aa4e67..dd2245ecfa6e95706e3a877f4ae41a4d4b95966b 100644 (file)
@@ -52,6 +52,8 @@
 #define SRVID_CTDB_TOOL    (CTDB_SRVID_TOOL_RANGE | 0x0001000000000000LL)
 #define SRVID_CTDB_PUSHDB  (CTDB_SRVID_TOOL_RANGE | 0x0002000000000000LL)
 
+#define NODE_FLAGS_UNKNOWN 0x00000040
+
 static struct {
        const char *debuglevelstr;
        int timelimit;
@@ -111,6 +113,7 @@ static const char *pretty_print_flags(TALLOC_CTX *mem_ctx, uint32_t flags)
                const char *name;
        } flag_names[] = {
                { NODE_FLAGS_DISCONNECTED,          "DISCONNECTED" },
+               { NODE_FLAGS_UNKNOWN,               "UNKNOWN" },
                { NODE_FLAGS_PERMANENTLY_DISABLED,  "DISABLED" },
                { NODE_FLAGS_BANNED,                "BANNED" },
                { NODE_FLAGS_UNHEALTHY,             "UNHEALTHY" },
@@ -367,6 +370,64 @@ done:
        return true;
 }
 
+/*
+ *  Build a display copy of nodemap_in using talloc_nodemap() on mem_ctx.
+ *  Remote nodes are initialised as UNHEALTHY in the daemon and their true
+ *  status is updated only after they are connected, so a healthy node may
+ *  briefly be shown as unhealthy.  While the runstate fetched for
+ *  ctdb->cmd_pnn is at or below FIRST_RECOVERY, every other node that is
+ *  neither DELETED nor DISCONNECTED is reported as UNKNOWN instead.  Code
+ *  paths using this make no control decisions on unknown/unhealthy state.
+ *  Returns NULL, after a message on stderr, if either step fails.
+ */
+static struct ctdb_node_map *get_nodemap_unknown(
+       TALLOC_CTX *mem_ctx,
+       struct ctdb_context *ctdb,
+       struct ctdb_node_map *nodemap_in)
+{
+       unsigned int i;
+       int ret;
+       enum ctdb_runstate runstate;
+       struct ctdb_node_map *nodemap;
+
+       ret = ctdb_ctrl_get_runstate(mem_ctx,
+                                    ctdb->ev,
+                                    ctdb->client,
+                                    ctdb->cmd_pnn,
+                                    TIMEOUT(),
+                                    &runstate);
+       if (ret != 0) {
+               fprintf(stderr, "Unable to get runstate\n");
+               return NULL;
+       }
+
+       nodemap = talloc_nodemap(mem_ctx, nodemap_in);
+       if (nodemap == NULL) {
+               fprintf(stderr, "Unable to get nodemap\n");
+               return NULL;
+       }
+
+       nodemap->num = nodemap_in->num;
+       for (i=0; i<nodemap->num; i++) {
+               struct ctdb_node_and_flags *node_in = &nodemap_in->node[i];
+               struct ctdb_node_and_flags *node = &nodemap->node[i];
+               /* Start from a verbatim copy of the input entry */
+               *node = *node_in;
+
+               if (node->flags & NODE_FLAGS_DELETED) {
+                       continue;
+               }
+               /* UNKNOWN replaces, rather than augments, the copied flags */
+               if ((runstate <= CTDB_RUNSTATE_FIRST_RECOVERY) &&
+                   !(node->flags & NODE_FLAGS_DISCONNECTED) &&
+                   (node->pnn != ctdb->cmd_pnn)) {
+                       node->flags = NODE_FLAGS_UNKNOWN;
+               }
+       }
+
+       return nodemap;
+}
+
 /* Compare IP address */
 static bool ctdb_same_ip(ctdb_sock_addr *ip1, ctdb_sock_addr *ip2)
 {
@@ -826,11 +887,12 @@ static void print_nodemap_machine(TALLOC_CTX *mem_ctx,
        struct ctdb_node_and_flags *node;
        unsigned int i;
 
-       printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+       printf("%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
               options.sep,
               "Node", options.sep,
               "IP", options.sep,
               "Disconnected", options.sep,
+              "Unknown", options.sep,
               "Banned", options.sep,
               "Disabled", options.sep,
               "Unhealthy", options.sep,
@@ -845,12 +907,13 @@ static void print_nodemap_machine(TALLOC_CTX *mem_ctx,
                        continue;
                }
 
-               printf("%s%u%s%s%s%d%s%d%s%d%s%d%s%d%s%d%s%d%s%c%s\n",
+               printf("%s%u%s%s%s%d%s%d%s%d%s%d%s%d%s%d%s%d%s%d%s%c%s\n",
                       options.sep,
                       node->pnn, options.sep,
                       ctdb_sock_addr_to_string(mem_ctx, &node->addr, false),
                       options.sep,
                       !! (node->flags & NODE_FLAGS_DISCONNECTED), options.sep,
+                      !! (node->flags & NODE_FLAGS_UNKNOWN), options.sep,
                       !! (node->flags & NODE_FLAGS_BANNED), options.sep,
                       !! (node->flags & NODE_FLAGS_PERMANENTLY_DISABLED),
                       options.sep,
@@ -935,6 +998,7 @@ static void print_status(TALLOC_CTX *mem_ctx,
 static int control_status(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
                          int argc, const char **argv)
 {
+       struct ctdb_node_map *nodemap_in;
        struct ctdb_node_map *nodemap;
        struct ctdb_vnn_map *vnnmap;
        int recmode;
@@ -945,7 +1009,12 @@ static int control_status(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
                usage("status");
        }
 
-       nodemap = get_nodemap(ctdb, false);
+       nodemap_in = get_nodemap(ctdb, false);
+       if (nodemap_in == NULL) {
+               return 1;
+       }
+
+       nodemap = get_nodemap_unknown(mem_ctx, ctdb, nodemap_in);
        if (nodemap == NULL) {
                return 1;
        }
@@ -5603,6 +5672,7 @@ static int control_nodestatus(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
                              int argc, const char **argv)
 {
        const char *nodestring = NULL;
+       struct ctdb_node_map *nodemap_in;
        struct ctdb_node_map *nodemap;
        unsigned int i;
        int ret;
@@ -5619,7 +5689,12 @@ static int control_nodestatus(TALLOC_CTX *mem_ctx, struct ctdb_context *ctdb,
                }
        }
 
-       if (! parse_nodestring(mem_ctx, ctdb, nodestring, &nodemap)) {
+       if (! parse_nodestring(mem_ctx, ctdb, nodestring, &nodemap_in)) {
+               return 1;
+       }
+
+       nodemap = get_nodemap_unknown(mem_ctx, ctdb, nodemap_in);
+       if (nodemap == NULL) {
                return 1;
        }
 
index ee521ba2bc10af4e998c0e945f461bc4f6ae9787..d0249b9d4f447f782fcdc68a2a175c894078744f 100755 (executable)
@@ -32,7 +32,7 @@ EOF
 
 nodestatus_X=""
 # Fields are:
-# Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
+# Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
 get_nodestatus_X ()
 {
        # Result is cached in global variable nodestatus_X
@@ -100,11 +100,11 @@ filter_nodes ()
        # them, so the first to succeed will print the nodes.
 
        # First try for a fully active and healthy node, so must not
-       # be DISABLED, UNHEALTHY or INACTIVE (last covers
+       # be UNKNOWN, DISABLED, UNHEALTHY or INACTIVE (last covers
        # DISCONNECTED, BANNED or STOPPED)
        awk -F '|' -v ns="$_ns" '
                BEGIN { ret = 255 }
-               ns ~ "@" $2 "@" && $5 == 0 && $6 == 0 && $8 == 0 {
+               ns ~ "@" $2 "@" && $4 == 0 && $6 == 0 && $7 == 0 && $9 == 0 {
                        print $1, $2 ; ret=0
                }
                END { exit ret }
@@ -115,7 +115,7 @@ EOF
        # DISABLED
        awk -F '|' -v ns="$_ns" '
                BEGIN { ret = 255 }
-               ns ~ "@" $2 "@" && $5 == 0 && $8 == 0 {
+               ns ~ "@" $2 "@" && $6 == 0 && $9 == 0 {
                        print $1, $2 ; ret=0
                }
                END { exit ret }
index b37b7d34032251684fa761391b1375c6290186e1..728cd9c9b32aaa7534cd2a9b434961c56749504d 100755 (executable)
@@ -32,7 +32,7 @@ EOF
 
 nodestatus_X=""
 # Fields are:
-# Node|IP|Disconnected|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
+# Node|IP|Disconnected|Unknown|Banned|Disabled|Unhealthy|Stopped|Inactive|PartiallyOnline|ThisNode
 get_nodestatus_X ()
 {
     # Result is cached in global variable nodestatus_X
@@ -102,12 +102,12 @@ EOF
     # the first to succeed will select the leader node.
 
     # First try for a fully active and healthy node, so must not be
-    # DISABLED, UNHEALTHY or INACTIVE (last covers DISCONNECTED,
+    # UNKNOWN, DISABLED, UNHEALTHY or INACTIVE (last covers DISCONNECTED,
     # BANNED or STOPPED)
     awk -F '|' -v ms="$_ms" \
        'BEGIN { ret = 2 }
         ms ~ "@" $2 "@" &&
-            $5 == 0 && $6 == 0 && $8 == 0 { print $1, $2 ; ret=0 ; exit }
+           $4 == 0 && $6 == 0 && $7 == 0 && $9 == 0 { print $1, $2 ; ret=0 ; exit }
         END { exit ret }' <<EOF ||
 $nodestatus_X
 EOF
@@ -116,7 +116,7 @@ EOF
     awk -F '|' -v ms="$_ms" \
        'BEGIN { ret = 2 }
         ms ~ "@" $2 "@" &&
-            $3 == 0 && $5 == 0 && $7 == 0 { print $1, $2 ; ret=0 ; exit }
+            $3 == 0 && $6 == 0 && $8 == 0 { print $1, $2 ; ret=0 ; exit }
         END { exit ret }' <<EOF ||
 $nodestatus_X
 EOF
@@ -125,7 +125,7 @@ EOF
     awk -F '|' -v ms="$_ms" \
        'BEGIN { ret = 2 }
         ms ~ "@" $2 "@" &&
-            $3 == 0 && $5 == 0 { print $1, $2 ; ret=0 ; exit }
+            $3 == 0 && $6 == 0 { print $1, $2 ; ret=0 ; exit }
         END { exit ret }' <<EOF
 $nodestatus_X
 EOF