update to the recovery daemon

author Ronnie Sahlberg <sahlberg@ronnie>

Sat, 5 May 2007 20:58:01 +0000 (06:58 +1000)

committer Ronnie Sahlberg <sahlberg@ronnie>

Sat, 5 May 2007 20:58:01 +0000 (06:58 +1000)
author Ronnie Sahlberg <sahlberg@ronnie>
Sat, 5 May 2007 20:58:01 +0000 (06:58 +1000)
committer Ronnie Sahlberg <sahlberg@ronnie>
Sat, 5 May 2007 20:58:01 +0000 (06:58 +1000)
diff --git a/ctdb/direct/recoverd.c b/ctdb/direct/recoverd.c

index 810c999ffc42d1e8dec11691ffe7c523cd8d1886..1924db69324e8d71dc48d201f0fbbbd340889f9a 100644 (file)
--- a/ctdb/direct/recoverd.c
+++ b/ctdb/direct/recoverd.c
@@ -59,7 +59,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
         /* pick a new generation number */
         generation = random();
  
-
         /* change the vnnmap on this node to use the new generation 
            number but not on any other nodes.
            this guarantees that if we abort the recovery prematurely
@@ -92,7 +91,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
                 }
         }
  
-
         /* get a list of all databases */
         ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), vnn, mem_ctx, &dbmap);
         if (ret != 0) {
@@ -100,6 +98,52 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
                 return -1;
         }
  
+       /* verify that all other nodes have all our databases */
+printf("Verify all other nodes have the same databases as we have\n");
+       for (j=0; j<nodemap->num; j++) {
+               /* we dont need to ourself ourselves */
+               if (nodemap->nodes[j].vnn == vnn) {
+                       continue;
+               }
+               /* dont check nodes that are unavailable */
+               if (!(nodemap->nodes[j].flags&NODE_FLAGS_CONNECTED)) {
+                       continue;
+               }
+printf("checking node %d\n",nodemap->nodes[j].vnn);
+
+               ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, &remote_dbmap);
+               if (ret != 0) {
+                       printf("Unable to get dbids from node %u\n", vnn);
+                       return -1;
+               }
+
+               /* step through all local databases */
+               for (db=0; db<dbmap->num;db++) {
+                       const char *name;
+
+
+                       for (i=0;i<remote_dbmap->num;i++) {
+                               if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
+                                       break;
+                               }
+                       }
+                       /* the remote node already have this database */
+                       if (i!=remote_dbmap->num) {
+                               continue;
+                       }
+                       /* ok so we need to create this database */
+                       ctdb_ctrl_getdbname(ctdb, timeval_current_ofs(1, 0), vnn, dbmap->dbids[db], mem_ctx, &name);
+                       if (ret != 0) {
+                               printf("Unable to get dbname from node %u\n", vnn);
+                               return -1;
+                       }
+                       ctdb_ctrl_createdb(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, name);
+                       if (ret != 0) {
+                               printf("Unable to create remote db:%s\n", name);
+                               return -1;
+                       }
+               }
+       }
  
         /* verify that we have all database any other node has */
         for (j=0; j<nodemap->num; j++) {
@@ -153,7 +197,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
         }
  
  
-
         /* verify that all other nodes have all our databases */
         for (j=0; j<nodemap->num; j++) {
                 /* we dont need to ourself ourselves */
@@ -181,7 +224,7 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
                                 }
                         }
                         /* the remote node already have this database */
-                       if (i!=dbmap->num) {
+                       if (i!=remote_dbmap->num) {
                                 continue;
                         }
                         /* ok so we need to create this database */
@@ -198,7 +241,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
                 }
         }
  
-
         /* pull all records from all other nodes across to this node
            (this merges based on rsn)
         */
@@ -256,7 +298,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
                 }
         }
  
-
         /* build a new vnn map */
         vnnmap = talloc_zero_size(mem_ctx, offsetof(struct ctdb_vnn_map, map) + 4*num_active);
         if (vnnmap == NULL) {
@@ -327,7 +368,6 @@ again:
                 exit(-1);
         }
  
-
         /* we only check for recovery once every second */
         timed_out = 0;
         event_add_timed(ctdb->ev, mem_ctx, timeval_current_ofs(1, 0), timeout_func, ctdb);
diff --git a/ctdb/tests/recover.sh b/ctdb/tests/recover.sh

index dbc0421fbbc9bb0a6af542c99fd2e527b4de1d82..57e2f422fe2f6e44446021ba036a4549d39c9a3a 100755 (executable)
--- a/ctdb/tests/recover.sh
+++ b/ctdb/tests/recover.sh
@@ -1,13 +1,17 @@
  #!/bin/sh
  
  killall -q ctdbd
+killall -q recoverd
  
  echo "Starting 4 ctdb daemons"
  bin/ctdbd --nlist direct/4nodes.txt
  bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.2:9001 --socket=/tmp/ctdb.socket.127.0.0.2
  bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.3:9001 --socket=/tmp/ctdb.socket.127.0.0.3
  bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.4:9001 --socket=/tmp/ctdb.socket.127.0.0.4
+echo "Starting one recovery daemon on node 0"
+bin/recoverd --socket=/tmp/ctdb.socket >/dev/null 2>/dev/null &
  
+echo
  echo "Attaching to some databases"
  bin/ctdb_control attach test1.tdb || exit 1
  bin/ctdb_control attach test2.tdb || exit 1
@@ -74,11 +78,12 @@ CTDBPID=`./bin/ctdb_control getpid 2 | sed -e "s/Pid://"`
  kill $CTDBPID
  sleep 1
  
+
  echo
  echo
-echo "Recovery the cluster"
-echo "===================="
-./bin/ctdb_control recover 0 0x220c2a7b
+echo "wait 3 seconds to let the recovery daemon do its job"
+echo "===================================================="
+sleep 3
  
  echo
  echo
author	Ronnie Sahlberg <sahlberg@ronnie>
	Sat, 5 May 2007 20:58:01 +0000 (06:58 +1000)
committer	Ronnie Sahlberg <sahlberg@ronnie>
	Sat, 5 May 2007 20:58:01 +0000 (06:58 +1000)
ctdb/direct/recoverd.c		patch \| blob \| blame \| history
ctdb/tests/recover.sh		patch \| blob \| blame \| history