From c6bd23ee114d25cdee36b706f66ccacb3c5beee1 Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Sun, 6 May 2007 06:58:01 +1000 Subject: [PATCH] update to the recovery daemon when ctdb_ctrl_ calls are timed out due to nodes arriving or leaving the cluster, it crashes the recovery daemon afterwards with a SEGV but no useful stack backtrace (This used to be ctdb commit cd3abc7349e86555ccd87cd47a1dcc2adad2f46c) --- ctdb/direct/recoverd.c | 54 ++++++++++++++++++++++++++++++++++++------ ctdb/tests/recover.sh | 11 ++++++--- 2 files changed, 55 insertions(+), 10 deletions(-) diff --git a/ctdb/direct/recoverd.c b/ctdb/direct/recoverd.c index 810c999ffc4..1924db69324 100644 --- a/ctdb/direct/recoverd.c +++ b/ctdb/direct/recoverd.c @@ -59,7 +59,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, /* pick a new generation number */ generation = random(); - /* change the vnnmap on this node to use the new generation number but not on any other nodes. this guarantees that if we abort the recovery prematurely @@ -92,7 +91,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, } } - /* get a list of all databases */ ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), vnn, mem_ctx, &dbmap); if (ret != 0) { @@ -100,6 +98,52 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, return -1; } + /* verify that all other nodes have all our databases */ +printf("Verify all other nodes have the same databases as we have\n"); + for (j=0; j<nodemap->num; j++) { + /* we dont need to ourself ourselves */ + if (nodemap->nodes[j].vnn == vnn) { + continue; + } + /* dont check nodes that are unavailable */ + if (!(nodemap->nodes[j].flags&NODE_FLAGS_CONNECTED)) { + continue; + } +printf("checking node %d\n",nodemap->nodes[j].vnn); + + ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, &remote_dbmap); + if (ret != 0) { + printf("Unable to get dbids from node %u\n", vnn); + return -1; + } + + 
/* step through all local databases */ + for (db=0; db<dbmap->num;db++) { + const char *name; + + + for (i=0;i<remote_dbmap->num;i++) { + if (dbmap->dbids[db] == remote_dbmap->dbids[i]) { + break; + } + } + /* the remote node already have this database */ + if (i!=remote_dbmap->num) { + continue; + } + /* ok so we need to create this database */ + ctdb_ctrl_getdbname(ctdb, timeval_current_ofs(1, 0), vnn, dbmap->dbids[db], mem_ctx, &name); + if (ret != 0) { + printf("Unable to get dbname from node %u\n", vnn); + return -1; + } + ctdb_ctrl_createdb(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, name); + if (ret != 0) { + printf("Unable to create remote db:%s\n", name); + return -1; + } + } + } /* verify that we have all database any other node has */ for (j=0; j<nodemap->num; j++) { @@ -153,7 +197,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, } - /* verify that all other nodes have all our databases */ for (j=0; j<nodemap->num; j++) { /* we dont need to ourself ourselves */ @@ -181,7 +224,7 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, } } /* the remote node already have this database */ - if (i!=dbmap->num) { + if (i!=remote_dbmap->num) { continue; } /* ok so we need to create this database */ @@ -198,7 +241,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, } } - /* pull all records from all other nodes across to this node (this merges based on rsn) */ @@ -256,7 +298,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev, } } - /* build a new vnn map */ vnnmap = talloc_zero_size(mem_ctx, offsetof(struct ctdb_vnn_map, map) + 4*num_active); if (vnnmap == NULL) { @@ -327,7 +368,6 @@ again: exit(-1); } - /* we only check for recovery once every second */ timed_out = 0; event_add_timed(ctdb->ev, mem_ctx, timeval_current_ofs(1, 0), timeout_func, ctdb); diff --git a/ctdb/tests/recover.sh b/ctdb/tests/recover.sh index dbc0421fbbc..57e2f422fe2 100755 --- a/ctdb/tests/recover.sh 
+++ b/ctdb/tests/recover.sh @@ -1,13 +1,17 @@ #!/bin/sh killall -q ctdbd +killall -q recoverd echo "Starting 4 ctdb daemons" bin/ctdbd --nlist direct/4nodes.txt bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.2:9001 --socket=/tmp/ctdb.socket.127.0.0.2 bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.3:9001 --socket=/tmp/ctdb.socket.127.0.0.3 bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.4:9001 --socket=/tmp/ctdb.socket.127.0.0.4 +echo "Starting one recovery daemon on node 0" +bin/recoverd --socket=/tmp/ctdb.socket >/dev/null 2>/dev/null & +echo echo "Attaching to some databases" bin/ctdb_control attach test1.tdb || exit 1 bin/ctdb_control attach test2.tdb || exit 1 @@ -74,11 +78,12 @@ CTDBPID=`./bin/ctdb_control getpid 2 | sed -e "s/Pid://"` kill $CTDBPID sleep 1 + echo echo -echo "Recovery the cluster" -echo "====================" -./bin/ctdb_control recover 0 0x220c2a7b +echo "wait 3 seconds to let the recovery daemon do its job" +echo "====================================================" +sleep 3 echo echo -- 2.47.3