From c6bd23ee114d25cdee36b706f66ccacb3c5beee1 Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg <sahlberg@ronnie>
Date: Sun, 6 May 2007 06:58:01 +1000
Subject: [PATCH] update to the recovery daemon. ctdb_ctrl_ calls are timed out
 due to nodes arriving or leaving the cluster; it crashes the recovery daemon
 afterwards with a SEGV but no useful stack backtrace

(This used to be ctdb commit cd3abc7349e86555ccd87cd47a1dcc2adad2f46c)
---
 ctdb/direct/recoverd.c | 54 ++++++++++++++++++++++++++++++++++++------
 ctdb/tests/recover.sh  | 11 ++++++---
 2 files changed, 55 insertions(+), 10 deletions(-)
diff --git a/ctdb/direct/recoverd.c b/ctdb/direct/recoverd.c
index 810c999ffc4..1924db69324 100644
--- a/ctdb/direct/recoverd.c
+++ b/ctdb/direct/recoverd.c
@@ -59,7 +59,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 	/* pick a new generation number */
 	generation = random();
 
-
 	/* change the vnnmap on this node to use the new generation 
 	   number but not on any other nodes.
 	   this guarantees that if we abort the recovery prematurely
@@ -92,7 +91,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 		}
 	}
 
-
 	/* get a list of all databases */
 	ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), vnn, mem_ctx, &dbmap);
 	if (ret != 0) {
@@ -100,6 +98,52 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 		return -1;
 	}
 
+	/* verify that all other nodes have all our databases */
+printf("Verify all other nodes have the same databases as we have\n");
+	for (j=0; j<nodemap->num; j++) {
+		/* we dont need to ourself ourselves */
+		if (nodemap->nodes[j].vnn == vnn) {
+			continue;
+		}
+		/* dont check nodes that are unavailable */
+		if (!(nodemap->nodes[j].flags&NODE_FLAGS_CONNECTED)) {
+			continue;
+		}
+printf("checking node %d\n",nodemap->nodes[j].vnn);
+
+		ret = ctdb_ctrl_getdbmap(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, &remote_dbmap);
+		if (ret != 0) {
+			printf("Unable to get dbids from node %u\n", vnn);
+			return -1;
+		}
+
+		/* step through all local databases */
+		for (db=0; db<dbmap->num;db++) {
+			const char *name;
+
+
+			for (i=0;i<remote_dbmap->num;i++) {
+				if (dbmap->dbids[db] == remote_dbmap->dbids[i]) {
+					break;
+				}
+			}
+			/* the remote node already have this database */
+			if (i!=remote_dbmap->num) {
+				continue;
+			}
+			/* ok so we need to create this database */
+			ctdb_ctrl_getdbname(ctdb, timeval_current_ofs(1, 0), vnn, dbmap->dbids[db], mem_ctx, &name);
+			if (ret != 0) {
+				printf("Unable to get dbname from node %u\n", vnn);
+				return -1;
+			}
+			ctdb_ctrl_createdb(ctdb, timeval_current_ofs(1, 0), nodemap->nodes[j].vnn, mem_ctx, name);
+			if (ret != 0) {
+				printf("Unable to create remote db:%s\n", name);
+				return -1;
+			}
+		}
+	}
 
 	/* verify that we have all database any other node has */
 	for (j=0; j<nodemap->num; j++) {
@@ -153,7 +197,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 	}
 
 
-
 	/* verify that all other nodes have all our databases */
 	for (j=0; j<nodemap->num; j++) {
 		/* we dont need to ourself ourselves */
@@ -181,7 +224,7 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 				}
 			}
 			/* the remote node already have this database */
-			if (i!=dbmap->num) {
+			if (i!=remote_dbmap->num) {
 				continue;
 			}
 			/* ok so we need to create this database */
@@ -198,7 +241,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 		}
 	}
 
-
 	/* pull all records from all other nodes across to this node
 	   (this merges based on rsn)
 	*/
@@ -256,7 +298,6 @@ static int do_recovery(struct ctdb_context *ctdb, struct event_context *ev,
 		}
 	}
 
-
 	/* build a new vnn map */
 	vnnmap = talloc_zero_size(mem_ctx, offsetof(struct ctdb_vnn_map, map) + 4*num_active);
 	if (vnnmap == NULL) {
@@ -327,7 +368,6 @@ again:
 		exit(-1);
 	}
 
-
 	/* we only check for recovery once every second */
 	timed_out = 0;
 	event_add_timed(ctdb->ev, mem_ctx, timeval_current_ofs(1, 0), timeout_func, ctdb);
diff --git a/ctdb/tests/recover.sh b/ctdb/tests/recover.sh
index dbc0421fbbc..57e2f422fe2 100755
--- a/ctdb/tests/recover.sh
+++ b/ctdb/tests/recover.sh
@@ -1,13 +1,17 @@
 #!/bin/sh
 
 killall -q ctdbd
+killall -q recoverd
 
 echo "Starting 4 ctdb daemons"
 bin/ctdbd --nlist direct/4nodes.txt
 bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.2:9001 --socket=/tmp/ctdb.socket.127.0.0.2
 bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.3:9001 --socket=/tmp/ctdb.socket.127.0.0.3
 bin/ctdbd --nlist direct/4nodes.txt --listen=127.0.0.4:9001 --socket=/tmp/ctdb.socket.127.0.0.4
+echo "Starting one recovery daemon on node 0"
+bin/recoverd --socket=/tmp/ctdb.socket >/dev/null 2>/dev/null &
 
+echo
 echo "Attaching to some databases"
 bin/ctdb_control attach test1.tdb || exit 1
 bin/ctdb_control attach test2.tdb || exit 1
@@ -74,11 +78,12 @@ CTDBPID=`./bin/ctdb_control getpid 2 | sed -e "s/Pid://"`
 kill $CTDBPID
 sleep 1
 
+
 echo
 echo
-echo "Recovery the cluster"
-echo "===================="
-./bin/ctdb_control recover 0 0x220c2a7b
+echo "wait 3 seconds to let the recovery daemon do its job"
+echo "===================================================="
+sleep 3
 
 echo
 echo
-- 
2.47.3