return 0;
}
+/*
+  choose the recovery lock file
+
+  copies 'file' into the ctdb context; returns 0 on success or -1
+  (via CTDB_NO_MEMORY) if the path could not be duplicated
+*/
+int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file)
+{
+	ctdb->recovery_lock_file = talloc_strdup(ctdb, file);
+	/* match the CTDB_NO_MEMORY convention used by the other setters:
+	   a failed talloc_strdup must not leave a silent NULL behind */
+	CTDB_NO_MEMORY(ctdb, ctdb->recovery_lock_file);
+	return 0;
+}
+
/*
choose the logfile location
*/
ctdb->idr = idr_init(ctdb);
ctdb->max_lacount = CTDB_DEFAULT_MAX_LACOUNT;
ctdb->seqnum_frequency = CTDB_DEFAULT_SEQNUM_FREQUENCY;
- ctdb->node_list_fd = -1;
+ ctdb->recovery_lock_fd = -1;
ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE;
return ctdb;
const char **errormsg)
{
uint32_t recmode = *(uint32_t *)indata.dptr;
+ int ret;
+ struct ctdb_set_recmode_state *state;
+
if (ctdb->freeze_mode != CTDB_FREEZE_FROZEN) {
DEBUG(0,("Attempt to change recovery mode to %u when not frozen\n",
recmode));
(*errormsg) = "Cannot change recovery mode while not frozen";
return -1;
}
- if (recmode == CTDB_RECOVERY_NORMAL &&
- ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) {
- int ret;
- struct ctdb_set_recmode_state *state =
- talloc(ctdb, struct ctdb_set_recmode_state);
- CTDB_NO_MEMORY(ctdb, state);
- state->c = talloc_steal(state, c);
- state->recmode = recmode;
- /* call the events script to tell all subsystems that we have recovered */
- ret = ctdb_event_script_callback(ctdb, state,
- ctdb_recovered_callback,
- state, "recovered");
- if (ret != 0) {
- return ret;
- }
- *async_reply = true;
+
+ if (recmode != CTDB_RECOVERY_NORMAL ||
+ ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) {
+ ctdb->recovery_mode = recmode;
+ return 0;
}
+
+ /* some special handling when ending recovery mode */
+ state = talloc(ctdb, struct ctdb_set_recmode_state);
+ CTDB_NO_MEMORY(ctdb, state);
+
+	/* we should not be able to get the recovery lock on shared storage,
+	   as it should already be held by the recovery master */
+ if (ctdb_recovery_lock(ctdb, false)) {
+ DEBUG(0,("ERROR: node list not locked when recovering!\n"));
+ ctdb_fatal(ctdb, "node list not locked - make sure it is on shared storage");
+ return -1;
+ }
+
+ state->c = talloc_steal(state, c);
+ state->recmode = recmode;
+ /* call the events script to tell all subsystems that we have recovered */
+ ret = ctdb_event_script_callback(ctdb, state,
+ ctdb_recovered_callback,
+ state, "recovered");
+ if (ret != 0) {
+ return ret;
+ }
+ *async_reply = true;
+
return 0;
}
/*
- try and lock the node list file - should only work on the recovery master recovery
- daemon. Anywhere else is a bug
+ try and get the recovery lock in shared storage - should only work
+ on the recovery master recovery daemon. Anywhere else is a bug
*/
-bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep)
+bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep)
{
struct flock lock;
- if (ctdb->node_list_fd != -1) {
- close(ctdb->node_list_fd);
+ if (ctdb->recovery_lock_fd != -1) {
+ close(ctdb->recovery_lock_fd);
}
- ctdb->node_list_fd = open(ctdb->node_list_file, O_RDWR);
- if (ctdb->node_list_fd == -1) {
+ ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600);
+ if (ctdb->recovery_lock_fd == -1) {
DEBUG(0,("Unable to open %s - (%s)\n",
- ctdb->node_list_file, strerror(errno)));
+ ctdb->recovery_lock_file, strerror(errno)));
return false;
}
lock.l_len = 1;
lock.l_pid = 0;
- if (fcntl(ctdb->node_list_fd, F_SETLK, &lock) != 0) {
+ if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) {
return false;
}
if (!keep) {
- close(ctdb->node_list_fd);
- ctdb->node_list_fd = -1;
+ close(ctdb->recovery_lock_fd);
+ ctdb->recovery_lock_fd = -1;
}
return true;
uint32_t generation;
struct ctdb_dbid_map *dbmap;
- if (!ctdb_lock_node_list(ctdb, true)) {
- DEBUG(0,("Unable to lock node list - aborting recovery\n"));
+ if (!ctdb_recovery_lock(ctdb, true)) {
+ DEBUG(0,("Unable to get recovery lock - aborting recovery\n"));
return -1;
}
}
/* release the recmaster lock */
- if (ctdb->node_list_fd != -1) {
- close(ctdb->node_list_fd);
- ctdb->node_list_fd = -1;
+ if (ctdb->recovery_lock_fd != -1) {
+ close(ctdb->recovery_lock_fd);
+ ctdb->recovery_lock_fd = -1;
}
/* ok, let that guy become recmaster then */
{ "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL },
{ "self-connect", 0, POPT_ARG_NONE, &options.self_connect, 0, "enable self connect", "boolean" },
{ "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL },
+ { "reclock", 0, POPT_ARG_STRING, &options.recovery_lock_file, 0, "location of recovery lock file", "filename" },
POPT_TABLEEND
};
int opt, ret;
while (extra_argv[extra_argc]) extra_argc++;
}
+ if (!options.recovery_lock_file) {
+		DEBUG(0,("You must specify the location of a recovery lock file with --reclock\n"));
+ exit(1);
+ }
+
block_signal(SIGPIPE);
ev = event_context_init(NULL);
ctdb = ctdb_cmdline_init(ev);
+ ret = ctdb_set_recovery_lock_file(ctdb, options.recovery_lock_file);
+ if (ret == -1) {
+ printf("ctdb_set_recovery_lock_file failed - %s\n", ctdb_errstr(ctdb));
+ exit(1);
+ }
+
if (options.self_connect) {
ctdb_set_flags(ctdb, CTDB_FLAG_SELF_CONNECT);
}
const char *transport;
const char *logfile;
char *node_list_file;
- int node_list_fd;
+ char *recovery_lock_file;
+ int recovery_lock_fd;
uint32_t vnn; /* our own vnn */
uint32_t num_nodes;
uint32_t num_connected;
void set_nonblocking(int fd);
void set_close_on_exec(int fd);
-bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep);
+bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep);
+int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file);
#endif
. /etc/sysconfig/ctdb
fi
-# build up CTDB_OPTIONS variable
+[ -z "$CTDB_RECOVERY_LOCK" ] && {
+ echo "You must configure the location of the CTDB_RECOVERY_LOCK"
+ exit 1
+}
+CTDB_OPTIONS="$CTDB_OPTIONS --reclock=$CTDB_RECOVERY_LOCK"
+
+# build up CTDB_OPTIONS variable from optional parameters
[ -z "$LOGFILE" ] || CTDB_OPTIONS="$CTDB_OPTIONS --logfile=$LOGFILE"
[ -z "$NODES" ] || CTDB_OPTIONS="$CTDB_OPTIONS --nlist=$NODES"
[ -z "$CTDB_SOCKET" ] || CTDB_OPTIONS="$CTDB_OPTIONS --socket=$CTDB_SOCKET"
killall -q ctdbd
echo "Starting 2 ctdb daemons"
-$VALGRIND bin/ctdbd --nlist direct/nodes.txt --event-script=tests/events --logfile=-
-$VALGRIND bin/ctdbd --nlist direct/nodes.txt --event-script=tests/events --logfile=-
+$VALGRIND bin/ctdbd --reclock=rec.lock --nlist direct/nodes.txt --event-script=tests/events --logfile=-
+$VALGRIND bin/ctdbd --reclock=rec.lock --nlist direct/nodes.txt --event-script=tests/events --logfile=- --socket=sock.2
-sleep 2
+while bin/ctdb status | grep RECOVERY > /dev/null; do
+ echo "`date` Waiting for recovery"
+ sleep 1;
+done
echo "Testing ping"
$VALGRIND bin/ctdb ping || exit 1
# Options to ctdbd. This is read by /etc/init.d/ctdb
+# You must specify the location of a lock file that resides on shared
+# storage and is visible to all nodes in the cluster.
+# There is no default.
+# CTDB_RECOVERY_LOCK="/some/place/on/shared/storage"
+
+
# the NODES file must be specified or ctdb won't start
# it should contain a list of IPs that ctdb will use
# it must be exactly the same on all cluster nodes