From: Andrew Tridgell Date: Sat, 2 Jun 2007 01:36:42 +0000 (+1000) Subject: - make specification of a recovery lock file compulsory X-Git-Tag: tevent-0.9.20~348^2~2601 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=ebf12646cf701496ded5c4c4d9dd640fa5a6a740;p=thirdparty%2Fsamba.git - make specification of a recovery lock file compulsory - die if someone other than the recmaster can get the recovery lock (This used to be ctdb commit a827d0d0e430ca8ad5d521367e45097185492869) --- diff --git a/ctdb/common/ctdb.c b/ctdb/common/ctdb.c index 273d40236c7..59db0960996 100644 --- a/ctdb/common/ctdb.c +++ b/ctdb/common/ctdb.c @@ -35,6 +35,15 @@ int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport) return 0; } +/* + choose the recovery lock file +*/ +int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file) +{ + ctdb->recovery_lock_file = talloc_strdup(ctdb, file); + return 0; +} + /* choose the logfile location */ @@ -550,7 +559,7 @@ struct ctdb_context *ctdb_init(struct event_context *ev) ctdb->idr = idr_init(ctdb); ctdb->max_lacount = CTDB_DEFAULT_MAX_LACOUNT; ctdb->seqnum_frequency = CTDB_DEFAULT_SEQNUM_FREQUENCY; - ctdb->node_list_fd = -1; + ctdb->recovery_lock_fd = -1; ctdb->monitoring_mode = CTDB_MONITORING_ACTIVE; return ctdb; diff --git a/ctdb/common/ctdb_recover.c b/ctdb/common/ctdb_recover.c index 6ba3316f243..01f8373f175 100644 --- a/ctdb/common/ctdb_recover.c +++ b/ctdb/common/ctdb_recover.c @@ -464,29 +464,45 @@ int32_t ctdb_control_set_recmode(struct ctdb_context *ctdb, const char **errormsg) { uint32_t recmode = *(uint32_t *)indata.dptr; + int ret; + struct ctdb_set_recmode_state *state; + if (ctdb->freeze_mode != CTDB_FREEZE_FROZEN) { DEBUG(0,("Attempt to change recovery mode to %u when not frozen\n", recmode)); (*errormsg) = "Cannot change recovery mode while not frozen"; return -1; } - if (recmode == CTDB_RECOVERY_NORMAL && - ctdb->recovery_mode == CTDB_RECOVERY_ACTIVE) { - int ret; - struct ctdb_set_recmode_state *state = - talloc(ctdb, struct ctdb_set_recmode_state); - CTDB_NO_MEMORY(ctdb, state); - state->c = talloc_steal(state, c); - state->recmode = recmode; - /* call the events script to tell all subsystems that we have recovered */ - ret = ctdb_event_script_callback(ctdb, state, - ctdb_recovered_callback, - state, "recovered"); - if (ret != 0) { - return ret; - } - *async_reply = true; + + if (recmode != CTDB_RECOVERY_NORMAL || + ctdb->recovery_mode != CTDB_RECOVERY_ACTIVE) { + ctdb->recovery_mode = recmode; + return 0; } + + /* some special handling when ending recovery mode */ + state = talloc(ctdb, struct ctdb_set_recmode_state); + CTDB_NO_MEMORY(ctdb, state); + + /* we should not be able to get the lock on the nodes list, as it should be + held by the recovery master */ + if (ctdb_recovery_lock(ctdb, false)) { + DEBUG(0,("ERROR: node list not locked when recovering!\n")); + ctdb_fatal(ctdb, "node list not locked - make sure it is on shared storage"); + return -1; + } + + state->c = talloc_steal(state, c); + state->recmode = recmode; + /* call the events script to tell all subsystems that we have recovered */ + ret = ctdb_event_script_callback(ctdb, state, + ctdb_recovered_callback, + state, "recovered"); + if (ret != 0) { + return ret; + } + *async_reply = true; + return 0; } @@ -657,20 +673,20 @@ int32_t ctdb_control_delete_low_rsn(struct ctdb_context *ctdb, TDB_DATA indata, /* - try and lock the node list file - should only work on the recovery master recovery - daemon. Anywhere else is a bug + try and get the recovery lock in shared storage - should only work + on the recovery master recovery daemon. Anywhere else is a bug */ -bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep) +bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep) { struct flock lock; - if (ctdb->node_list_fd != -1) { - close(ctdb->node_list_fd); + if (ctdb->recovery_lock_fd != -1) { + close(ctdb->recovery_lock_fd); } - ctdb->node_list_fd = open(ctdb->node_list_file, O_RDWR); - if (ctdb->node_list_fd == -1) { + ctdb->recovery_lock_fd = open(ctdb->recovery_lock_file, O_RDWR|O_CREAT, 0600); + if (ctdb->recovery_lock_fd == -1) { DEBUG(0,("Unable to open %s - (%s)\n", - ctdb->node_list_file, strerror(errno))); + ctdb->recovery_lock_file, strerror(errno))); return false; } @@ -680,13 +696,13 @@ bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep) lock.l_len = 1; lock.l_pid = 0; - if (fcntl(ctdb->node_list_fd, F_SETLK, &lock) != 0) { + if (fcntl(ctdb->recovery_lock_fd, F_SETLK, &lock) != 0) { return false; } if (!keep) { - close(ctdb->node_list_fd); - ctdb->node_list_fd = -1; + close(ctdb->recovery_lock_fd); + ctdb->recovery_lock_fd = -1; } return true; diff --git a/ctdb/common/ctdb_recoverd.c b/ctdb/common/ctdb_recoverd.c index f4a58bf1228..20ad94aec40 100644 --- a/ctdb/common/ctdb_recoverd.c +++ b/ctdb/common/ctdb_recoverd.c @@ -386,8 +386,8 @@ static int do_recovery(struct ctdb_context *ctdb, uint32_t generation; struct ctdb_dbid_map *dbmap; - if (!ctdb_lock_node_list(ctdb, true)) { - DEBUG(0,("Unable to lock node list - aborting recovery\n")); + if (!ctdb_recovery_lock(ctdb, true)) { + DEBUG(0,("Unable to get recovery lock - aborting recovery\n")); return -1; } @@ -614,9 +614,9 @@ static void election_handler(struct ctdb_context *ctdb, uint64_t srvid, } /* release the recmaster lock */ - if (ctdb->node_list_fd != -1) { - close(ctdb->node_list_fd); - ctdb->node_list_fd = -1; + if (ctdb->recovery_lock_fd != -1) { + close(ctdb->recovery_lock_fd); + ctdb->recovery_lock_fd = -1; } /* ok, let that guy become recmaster then */ diff --git a/ctdb/direct/ctdbd.c b/ctdb/direct/ctdbd.c index 8102eef751e..8376d8b8473 100644 --- a/ctdb/direct/ctdbd.c +++ b/ctdb/direct/ctdbd.c @@ -81,6 +81,7 @@ int main(int argc, const char *argv[]) { "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL }, { "self-connect", 0, POPT_ARG_NONE, &options.self_connect, 0, "enable self connect", "boolean" }, { "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL }, + { "reclock", 0, POPT_ARG_STRING, &options.recovery_lock_file, 0, "location of recovery lock file", "filename" }, POPT_TABLEEND }; int opt, ret; @@ -107,12 +108,23 @@ int main(int argc, const char *argv[]) while (extra_argv[extra_argc]) extra_argc++; } + if (!options.recovery_lock_file) { + DEBUG(0,("You must specifiy the location of a recovery lock file with --reclock\n")); + exit(1); + } + block_signal(SIGPIPE); ev = event_context_init(NULL); ctdb = ctdb_cmdline_init(ev); + ret = ctdb_set_recovery_lock_file(ctdb, options.recovery_lock_file); + if (ret == -1) { + printf("ctdb_set_recovery_lock_file failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + if (options.self_connect) { ctdb_set_flags(ctdb, CTDB_FLAG_SELF_CONNECT); } diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index 870cbd7a714..a035af59abd 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -286,7 +286,8 @@ struct ctdb_context { const char *transport; const char *logfile; char *node_list_file; - int node_list_fd; + char *recovery_lock_file; + int recovery_lock_fd; uint32_t vnn; /* our own vnn */ uint32_t num_nodes; uint32_t num_connected; @@ -970,7 +971,8 @@ void ctdb_release_all_ips(struct ctdb_context *ctdb); void set_nonblocking(int fd); void set_close_on_exec(int fd); -bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep); +bool ctdb_recovery_lock(struct ctdb_context *ctdb, bool keep); +int ctdb_set_recovery_lock_file(struct ctdb_context *ctdb, const char *file); #endif diff --git a/ctdb/packaging/ctdb.init b/ctdb/packaging/ctdb.init index 488cbd7618f..519334f6665 100755 --- a/ctdb/packaging/ctdb.init +++ b/ctdb/packaging/ctdb.init @@ -41,7 +41,13 @@ if [ -f /etc/sysconfig/ctdb ]; then . /etc/sysconfig/ctdb fi -# build up CTDB_OPTIONS variable +[ -z "$CTDB_RECOVERY_LOCK" ] && { + echo "You must configure the location of the CTDB_RECOVERY_LOCK" + exit 1 +} +CTDB_OPTIONS="$CTDB_OPTIONS --reclock=$CTDB_RECOVERY_LOCK" + +# build up CTDB_OPTIONS variable from optional parameters [ -z "$LOGFILE" ] || CTDB_OPTIONS="$CTDB_OPTIONS --logfile=$LOGFILE" [ -z "$NODES" ] || CTDB_OPTIONS="$CTDB_OPTIONS --nlist=$NODES" [ -z "$CTDB_SOCKET" ] || CTDB_OPTIONS="$CTDB_OPTIONS --socket=$CTDB_SOCKET" diff --git a/ctdb/tests/ctdbd.sh b/ctdb/tests/ctdbd.sh index 5aedf2e3d59..e2c53b34f9e 100755 --- a/ctdb/tests/ctdbd.sh +++ b/ctdb/tests/ctdbd.sh @@ -3,10 +3,13 @@ killall -q ctdbd echo "Starting 2 ctdb daemons" -$VALGRIND bin/ctdbd --nlist direct/nodes.txt --event-script=tests/events --logfile=- -$VALGRIND bin/ctdbd --nlist direct/nodes.txt --event-script=tests/events --logfile=- +$VALGRIND bin/ctdbd --reclock=rec.lock --nlist direct/nodes.txt --event-script=tests/events --logfile=- +$VALGRIND bin/ctdbd --reclock=rec.lock --nlist direct/nodes.txt --event-script=tests/events --logfile=- --socket=sock.2 -sleep 2 +while bin/ctdb status | grep RECOVERY > /dev/null; do + echo "`date` Waiting for recovery" + sleep 1; +done echo "Testing ping" $VALGRIND bin/ctdb ping || exit 1 diff --git a/ctdb/tools/ctdb.sysconfig b/ctdb/tools/ctdb.sysconfig index 595a4680449..16fcbd98967 100644 --- a/ctdb/tools/ctdb.sysconfig +++ b/ctdb/tools/ctdb.sysconfig @@ -1,5 +1,11 @@ # Options to ctdbd. This is read by /etc/init.d/ctdb +# you must specify the location of a shared lock file across all the +# nodes. This must be on shared storage +# there is no default +# CTDB_RECOVERY_LOCK="/some/place/on/shared/storage" + + # the NODES file must be specified or ctdb won't start # it should contain a list of IPs that ctdb will use # it must be exactly the same on all cluster nodes