return 0;
}
+/*
+  choose the recovery lock file
+
+  copies 'file' into the ctdb context; returns 0 on success or -1
+  (via CTDB_NO_MEMORY) if the path could not be duplicated
+*/
+int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file)
+{
+	ctdb->recovery_lock_file = talloc_strdup(ctdb, file);
+	/* match the CTDB_NO_MEMORY convention used by the other setters:
+	   a failed talloc_strdup must not leave a silent NULL behind */
+	CTDB_NO_MEMORY(ctdb, ctdb->recovery_lock_file);
+	return 0;
+}
+
/*
choose the logfile location
*/
ctdb->idr = idr_init(ctdb);
ctdb->max_lacount = CTDB_DEFAULT_MAX_LACOUNT;
ctdb->seqnum_frequency = CTDB_DEFAULT_SEQNUM_FREQUENCY;
- ctdb->node_list_fd = -1;
+ ctdb->recovery_lock_fd = -1;
ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
return ctdb;
const char **errormsg)
{
uint32_t recmode = *(uint32_t *)indata.dptr;
+ int ret;
+ struct ctdb_set_recmode_state *state;
+
if (ctdb->freeze_mode != CTDB_FREEZE_FROZEN) {
DEBUG(0,("Attempt to change recovery mode to %u when not frozen\n",
recmode));
(*errormsg) = "Cannot change recovery mode while not frozen";
return -1;
}
- if (recmode == CTDB_RECOVERY_NORMAL &&
- ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
- int ret;
- struct ctdb_set_recmode_state *state =
- talloc(ctdb, struct ctdb_set_recmode_state);
- CTDB_NO_MEMORY(ctdb, state);
- state->c = talloc_steal(state, c);
- state->recmode = recmode;
- /* call the events script to tell all subsystems that we have recovered */
- ret = ctdb_event_script_callback(ctdb, state,
- ctdb_recovered_callback,
- state, "recovered");
- if (ret != 0) {
- return ret;
- }
- *async_reply = true;
+
+ if (recmode != CTDB_RECOVERY_NORMAL ||
+ ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+ ctdb->recovery_mode = recmode;
+ return 0;
}
+
+ /* some special handling when ending recovery mode */
+ state = talloc(ctdb, struct ctdb_set_recmode_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+	/* we should not be able to get the recovery lock on shared storage,
+	   as it should already be held by the recovery master */
+ if (ctdb_recovery_lock(ctdb, false)) {
+ DEBUG(0,("ERROR: node list not locked when recovering!\n"));
+ ctdb_fatal(ctdb, "node list not locked - make sure it is on shared storage");
+ return -1;
+ }
+
+ state->c = talloc_steal(state, c);
+ state->recmode = recmode;
+ /* call the events script to tell all subsystems that we have recovered */
+ ret = ctdb_event_script_callback(ctdb, state,
+ ctdb_recovered_callback,
+ state, "recovered");
+ if (ret != 0) {
+ return ret;
+ }
+ *async_reply = true;
+
return 0;
}
/*
- try and lock the node list file - should only work on the recovery master recovery
- daemon. Anywhere else is a bug
+ try and get the recovery lock in shared storage - should only work
+ on the recovery master recovery daemon. Anywhere else is a bug
*/
-bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep)
+bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
{
struct flock lock;
- if (ctdb->node_list_fd != -1) {
- close(ctdb->node_list_fd);
+ if (ctdb->recovery_lock_fd != -1) {
+ close(ctdb->recovery_lock_fd);
}
- ctdb->node_list_fd = open(ctdb->node_list_file, O_RDWR);
- if (ctdb->node_list_fd == -1) {
+ ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
+ if (ctdb->recovery_lock_fd == -1) {
DEBUG(0,("Unable to open %s - (%s)\n",
- ctdb->node_list_file, strerror(errno)));
+ ctdb->recovery_lock_file, strerror(errno)));
return false;
}
lock.l_len = 1;
lock.l_pid = 0;
- if (fcntl(ctdb->node_list_fd, F_SETLK, &lock) != 0) {
+ if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
return false;
}
if (!keep) {
- close(ctdb->node_list_fd);
- ctdb->node_list_fd = -1;
+ close(ctdb->recovery_lock_fd);
+ ctdb->recovery_lock_fd = -1;
}
return true;
uint32_t generation;
struct ctdb_dbid_map *dbmap;
- if (!ctdb_lock_node_list(ctdb, true)) {
- DEBUG(0,("Unable to lock node list - aborting recovery\n"));
+ if (!ctdb_recovery_lock(ctdb, true)) {
+ DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
return -1;
}
}
/* release the recmaster lock */
- if (ctdb->node_list_fd != -1) {
- close(ctdb->node_list_fd);
- ctdb->node_list_fd = -1;
+ if (ctdb->recovery_lock_fd != -1) {
+ close(ctdb->recovery_lock_fd);
+ ctdb->recovery_lock_fd = -1;
}
/* ok, let that guy become recmaster then */
{ "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL },
{ "self-connect", 0, POPT_ARG_NONE, &options.self_connect, 0, "enable self connect", "boolean" },
{ "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL },
+ { "reclock", 0, POPT_ARG_STRING, &options.recovery_lock_file, 0, "location of recovery lock file", "filename" },
POPT_TABLEEND
};
int opt, ret;
while (extra_argv[extra_argc]) extra_argc++;
}
+ if (!options.recovery_lock_file) {
+		DEBUG(0,("You must specify the location of a recovery lock file with --reclock\n"));
+ exit(1);
+ }
+
block_signal(SIGPIPE);
ev = event_context_init(NULL);
ctdb = ctdb_cmdline_init(ev);
+ ret = ctdb_set_recovery_lock_file(ctdb, options.recovery_lock_file);
+ if (ret == -1) {
+ printf("ctdb_set_recovery_lock_file failed - %s\n", ctdb_errstr(ctdb));
+ exit(1);
+ }
+
if (options.self_connect) {
ctdb_set_flags(ctdb, CTDB_FLAG_SELF_CONNECT);
}
const char *transport;
const char *logfile;
char *node_list_file;
- int node_list_fd;
+ char *recovery_lock_file;
+ int recovery_lock_fd;
uint32_t vnn; /* our own vnn */
uint32_t num_nodes;
uint32_t num_connected;
void set_nonblocking(int fd);
void set_close_on_exec(int fd);
-bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep);
+bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep);
+int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file);
#endif
. /etc/sysconfig/ctdb
fi
-# build up CTDB_OPTIONS variable
+[ -z "$CTDB_RECOVERY_LOCK" ] && {
+ echo "You must configure the location of the CTDB_RECOVERY_LOCK"
+ exit 1
+}
+CTDB_OPTIONS="$CTDB_OPTIONS --reclock=$CTDB_RECOVERY_LOCK"
+
+# build up CTDB_OPTIONS variable from optional parameters
[ -z "$LOGFILE" ] || CTDB_OPTIONS="$CTDB_OPTIONS --logfile=$LOGFILE"
[ -z "$NODES" ] || CTDB_OPTIONS="$CTDB_OPTIONS --nlist=$NODES"
[ -z "$CTDB_SOCKET" ] || CTDB_OPTIONS="$CTDB_OPTIONS --socket=$CTDB_SOCKET"
killall -q ctdbd
echo "Starting 2 ctdb daemons"
-$VALGRIND bin/ctdbd --nlist direct/nodes.txt --event-script=tests/events --logfile=-
-$VALGRIND bin/ctdbd --nlist direct/nodes.txt --event-script=tests/events --logfile=-
+$VALGRIND bin/ctdbd --reclock=rec.lock --nlist direct/nodes.txt --event-script=tests/events --logfile=-
+$VALGRIND bin/ctdbd --reclock=rec.lock --nlist direct/nodes.txt --event-script=tests/events --logfile=- --socket=sock.2
-sleep 2
+while bin/ctdb status | grep RECOVERY > /dev/null; do
+ echo "`date` Waiting for recovery"
+ sleep 1;
+done
echo "Testing ping"
$VALGRIND bin/ctdb ping || exit 1
# Options to ctdbd. This is read by /etc/init.d/ctdb
+# You must specify the location of a lock file that resides on shared
+# storage and is visible to all nodes in the cluster.
+# There is no default.
+# CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
+
+
# the NODES file must be specified or ctdb won't start
# it should contain a list of IPs that ctdb will use
# it must be exactly the same on all cluster nodes