From 4f72a202d940e414a3e8359fd3929ebd178b836a Mon Sep 17 00:00:00 2001 From: Andrew Tridgell Date: Sat, 2 Jun 2007 10:03:28 +1000 Subject: [PATCH] - moved cmdline options that are only relevant to ctdbd into ctdbd.c - fixed a valgrind error on failing to send a control - don't mark node dead when already disconnected - moved node list lock code into common code (This used to be ctdb commit bcc0432d0fea7ef223f82ccee81cf35c18144b1b) --- ctdb/common/cmdline.c | 53 ------------------------------------- ctdb/common/ctdb_daemon.c | 9 ++++--- ctdb/common/ctdb_monitor.c | 13 ++++++--- ctdb/common/ctdb_recover.c | 36 +++++++++++++++++++++++++ ctdb/common/ctdb_recoverd.c | 34 ++++++------------------ ctdb/direct/ctdbd.c | 51 ++++++++++++++++++++++++++++++++++- ctdb/include/ctdb_private.h | 4 ++- 7 files changed, 112 insertions(+), 88 deletions(-) diff --git a/ctdb/common/cmdline.c b/ctdb/common/cmdline.c index 8cea85d49b2..3de4387407b 100644 --- a/ctdb/common/cmdline.c +++ b/ctdb/common/cmdline.c @@ -29,21 +29,11 @@ */ static struct { - const char *nlist; - const char *transport; - const char *myaddress; const char *socketname; - int self_connect; - const char *db_dir; int torture; const char *events; } ctdb_cmdline = { - .nlist = ETCDIR "/ctdb/nodes", - .transport = "tcp", - .myaddress = NULL, .socketname = CTDB_PATH, - .self_connect = 0, - .db_dir = VARDIR "/ctdb", .torture = 0, }; @@ -64,13 +54,8 @@ static void ctdb_cmdline_callback(poptContext con, struct poptOption popt_ctdb_cmdline[] = { { NULL, 0, POPT_ARG_CALLBACK, (void *)ctdb_cmdline_callback }, - { "nlist", 0, POPT_ARG_STRING, &ctdb_cmdline.nlist, 0, "node list file", "filename" }, - { "listen", 0, POPT_ARG_STRING, &ctdb_cmdline.myaddress, 0, "address to listen on", "address" }, { "socket", 0, POPT_ARG_STRING, &ctdb_cmdline.socketname, 0, "local socket name", "filename" }, - { "transport", 0, POPT_ARG_STRING, &ctdb_cmdline.transport, 0, "protocol transport", NULL }, - { "self-connect", 0, POPT_ARG_NONE, &ctdb_cmdline.self_connect, 0, "enable self connect", "boolean" }, { "debug", 'd', POPT_ARG_INT, &LogLevel, 0, "debug level"}, - { "dbdir", 0, POPT_ARG_STRING, &ctdb_cmdline.db_dir, 0, "directory for the tdb files", NULL }, { "torture", 0, POPT_ARG_NONE, &ctdb_cmdline.torture, 0, "enable nastiness in library", NULL }, { "events", 0, POPT_ARG_STRING, NULL, OPT_EVENTSYSTEM, "event system", NULL }, { NULL } @@ -85,11 +70,6 @@ struct ctdb_context *ctdb_cmdline_init(struct event_context *ev) struct ctdb_context *ctdb; int ret; - if (ctdb_cmdline.nlist == NULL) { - printf("You must provide a node list with --nlist\n"); - exit(1); - } - /* initialise ctdb */ ctdb = ctdb_init(ev); if (ctdb == NULL) { @@ -97,28 +77,10 @@ struct ctdb_context *ctdb_cmdline_init(struct event_context *ev) exit(1); } - if (ctdb_cmdline.self_connect) { - ctdb_set_flags(ctdb, CTDB_FLAG_SELF_CONNECT); - } if (ctdb_cmdline.torture) { ctdb_set_flags(ctdb, CTDB_FLAG_TORTURE); } - ret = ctdb_set_transport(ctdb, ctdb_cmdline.transport); - if (ret == -1) { - printf("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb)); - exit(1); - } - - /* tell ctdb what address to listen on */ - if (ctdb_cmdline.myaddress) { - ret = ctdb_set_address(ctdb, ctdb_cmdline.myaddress); - if (ret == -1) { - printf("ctdb_set_address failed - %s\n", ctdb_errstr(ctdb)); - exit(1); - } - } - /* tell ctdb the socket address */ ret = ctdb_set_socketname(ctdb, ctdb_cmdline.socketname); if (ret == -1) { @@ -126,21 +88,6 @@ struct ctdb_context *ctdb_cmdline_init(struct event_context *ev) exit(1); } - /* tell ctdb what nodes are available */ - ret = ctdb_set_nlist(ctdb, ctdb_cmdline.nlist); - if (ret == -1) { - printf("ctdb_set_nlist failed - %s\n", ctdb_errstr(ctdb)); - exit(1); - } - - if (ctdb_cmdline.db_dir) { - ret = ctdb_set_tdb_dir(ctdb, ctdb_cmdline.db_dir); - if (ret == -1) { - printf("ctdb_set_tdb_dir failed - %s\n", ctdb_errstr(ctdb)); - exit(1); - } - } - return ctdb; } diff --git a/ctdb/common/ctdb_daemon.c b/ctdb/common/ctdb_daemon.c index ecd095f115f..7a98768177e 100644 --- a/ctdb/common/ctdb_daemon.c +++ b/ctdb/common/ctdb_daemon.c @@ -951,6 +951,7 @@ static void daemon_request_control_from_client(struct ctdb_client *client, TDB_DATA data; int res; struct daemon_control_state *state; + TALLOC_CTX *tmp_ctx = talloc_new(client); if (c->hdr.destnode == CTDB_CURRENT_NODE) { c->hdr.destnode = client->ctdb->vnn; @@ -970,6 +971,10 @@ static void daemon_request_control_from_client(struct ctdb_client *client, } talloc_set_destructor(state, daemon_control_destructor); + + if (c->flags & CTDB_CTRL_FLAG_NOREPLY) { + talloc_steal(tmp_ctx, state); + } data.dptr = &c->data[0]; data.dsize = c->datalen; @@ -983,9 +988,7 @@ static void daemon_request_control_from_client(struct ctdb_client *client, c->hdr.destnode)); } - if (c->flags & CTDB_CTRL_FLAG_NOREPLY) { - talloc_free(state); - } + talloc_free(tmp_ctx); } /* diff --git a/ctdb/common/ctdb_monitor.c b/ctdb/common/ctdb_monitor.c index dd875ba45c3..750a1f5bd63 100644 --- a/ctdb/common/ctdb_monitor.c +++ b/ctdb/common/ctdb_monitor.c @@ -48,12 +48,15 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve continue; } - /* it might have come alive again */ - if (!(node->flags & NODE_FLAGS_CONNECTED) && node->rx_cnt != 0) { - ctdb_node_connected(node); + if (!(node->flags & NODE_FLAGS_CONNECTED)) { + /* it might have come alive again */ + if (node->rx_cnt != 0) { + ctdb_node_connected(node); + } continue; } + if (node->rx_cnt == 0) { node->dead_count++; } else { @@ -63,6 +66,7 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve node->rx_cnt = 0; if (node->dead_count >= CTDB_MONITORING_DEAD_COUNT) { + DEBUG(0,("dead count reached for node %u\n", node->vnn)); ctdb_node_dead(node); ctdb_send_keepalive(ctdb, node->vnn); /* maybe tell the transport layer to kill the @@ -71,7 +75,8 @@ static void ctdb_check_for_dead_nodes(struct event_context *ev, struct timed_eve continue; } - if (node->tx_cnt == 0 && (node->flags & NODE_FLAGS_CONNECTED)) { + if (node->tx_cnt == 0) { + DEBUG(5,("sending keepalive to %u\n", node->vnn)); ctdb_send_keepalive(ctdb, node->vnn); } diff --git a/ctdb/common/ctdb_recover.c b/ctdb/common/ctdb_recover.c index 824b0adf89b..6ba3316f243 100644 --- a/ctdb/common/ctdb_recover.c +++ b/ctdb/common/ctdb_recover.c @@ -655,3 +655,39 @@ int32_t ctdb_control_delete_low_rsn(struct ctdb_context *ctdb, TDB_DATA indata, return 0; } + +/* + try and lock the node list file - should only work on the recovery master recovery + daemon. Anywhere else is a bug + */ +bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep) +{ + struct flock lock; + + if (ctdb->node_list_fd != -1) { + close(ctdb->node_list_fd); + } + ctdb->node_list_fd = open(ctdb->node_list_file, O_RDWR); + if (ctdb->node_list_fd == -1) { + DEBUG(0,("Unable to open %s - (%s)\n", + ctdb->node_list_file, strerror(errno))); + return false; + } + + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 1; + lock.l_pid = 0; + + if (fcntl(ctdb->node_list_fd, F_SETLK, &lock) != 0) { + return false; + } + + if (!keep) { + close(ctdb->node_list_fd); + ctdb->node_list_fd = -1; + } + + return true; +} diff --git a/ctdb/common/ctdb_recoverd.c b/ctdb/common/ctdb_recoverd.c index cacffa90d5e..f4a58bf1228 100644 --- a/ctdb/common/ctdb_recoverd.c +++ b/ctdb/common/ctdb_recoverd.c @@ -375,7 +375,9 @@ static int update_vnnmap_on_all_nodes(struct ctdb_context *ctdb, struct ctdb_nod return 0; } - +/* + we are the recmaster, and recovery is needed - start a recovery run + */ static int do_recovery(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, uint32_t vnn, uint32_t num_active, struct ctdb_node_map *nodemap, struct ctdb_vnn_map *vnnmap) @@ -383,7 +385,11 @@ static int do_recovery(struct ctdb_context *ctdb, int i, j, ret; uint32_t generation; struct ctdb_dbid_map *dbmap; - struct flock lock; + + if (!ctdb_lock_node_list(ctdb, true)) { + DEBUG(0,("Unable to lock node list - aborting recovery\n")); + return -1; + } /* set recovery mode to active on all nodes */ ret = set_recovery_mode(ctdb, nodemap, CTDB_RECOVERY_ACTIVE); @@ -392,30 +398,6 @@ static int do_recovery(struct ctdb_context *ctdb, return -1; } - /* get the recmaster lock */ - if (ctdb->node_list_fd != -1) { - close(ctdb->node_list_fd); - } - - ctdb->node_list_fd = open(ctdb->node_list_file, O_RDWR); - if (ctdb->node_list_fd == -1) { - DEBUG(0,("Unable to open %s - aborting recovery (%s)\n", - ctdb->node_list_file, strerror(errno))); - return -1; - } - - lock.l_type = F_WRLCK; - lock.l_whence = SEEK_SET; - lock.l_start = 0; - lock.l_len = 1; - lock.l_pid = 0; - - if (fcntl(ctdb->node_list_fd, F_SETLK, &lock) != 0) { - DEBUG(0,("Unable to lock %s - aborting recovery (%s)\n", - ctdb->node_list_file, strerror(errno))); - return -1; - } - DEBUG(0, (__location__ " Recovery initiated\n")); /* pick a new generation number */ diff --git a/ctdb/direct/ctdbd.c b/ctdb/direct/ctdbd.c index 367ed8b6ba5..8102eef751e 100644 --- a/ctdb/direct/ctdbd.c +++ b/ctdb/direct/ctdbd.c @@ -39,13 +39,23 @@ static void block_signal(int signum) } static struct { + const char *nlist; + const char *transport; + const char *myaddress; const char *public_address_list; const char *public_interface; const char *event_script; const char *logfile; + const char *recovery_lock_file; + const char *db_dir; + int self_connect; } options = { + .nlist = ETCDIR "/ctdb/nodes", + .transport = "tcp", .event_script = ETCDIR "/ctdb/events", - .logfile = VARDIR "/log/log.ctdb" + .logfile = VARDIR "/log/log.ctdb", + .db_dir = VARDIR "/ctdb", + .self_connect = 0, }; @@ -66,6 +76,11 @@ int main(int argc, const char *argv[]) { "public-interface", 0, POPT_ARG_STRING, &options.public_interface, 0, "public interface", "interface"}, { "event-script", 0, POPT_ARG_STRING, &options.event_script, 0, "event script", "filename" }, { "logfile", 0, POPT_ARG_STRING, &options.logfile, 0, "log file location", "filename" }, + { "nlist", 0, POPT_ARG_STRING, &options.nlist, 0, "node list file", "filename" }, + { "listen", 0, POPT_ARG_STRING, &options.myaddress, 0, "address to listen on", "address" }, + { "transport", 0, POPT_ARG_STRING, &options.transport, 0, "protocol transport", NULL }, + { "self-connect", 0, POPT_ARG_NONE, &options.self_connect, 0, "enable self connect", "boolean" }, + { "dbdir", 0, POPT_ARG_STRING, &options.db_dir, 0, "directory for the tdb files", NULL }, POPT_TABLEEND }; int opt, ret; @@ -98,6 +113,40 @@ int main(int argc, const char *argv[]) ctdb = ctdb_cmdline_init(ev); + if (options.self_connect) { + ctdb_set_flags(ctdb, CTDB_FLAG_SELF_CONNECT); + } + + ret = ctdb_set_transport(ctdb, options.transport); + if (ret == -1) { + printf("ctdb_set_transport failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + + /* tell ctdb what address to listen on */ + if (options.myaddress) { + ret = ctdb_set_address(ctdb, options.myaddress); + if (ret == -1) { + printf("ctdb_set_address failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + } + + /* tell ctdb what nodes are available */ + ret = ctdb_set_nlist(ctdb, options.nlist); + if (ret == -1) { + printf("ctdb_set_nlist failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + + if (options.db_dir) { + ret = ctdb_set_tdb_dir(ctdb, options.db_dir); + if (ret == -1) { + printf("ctdb_set_tdb_dir failed - %s\n", ctdb_errstr(ctdb)); + exit(1); + } + } + ret = ctdb_set_logfile(ctdb, options.logfile); if (ret == -1) { printf("ctdb_set_logfile to %s failed - %s\n", options.logfile, ctdb_errstr(ctdb)); diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index d366c5071d8..870cbd7a714 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -357,7 +357,7 @@ struct ctdb_db_context { #define CTDB_MONITORING_TIMEOUT 2 /* number of monitoring timeouts before a node is considered dead */ -#define CTDB_MONITORING_DEAD_COUNT 2 +#define CTDB_MONITORING_DEAD_COUNT 3 /* number of consecutive calls from the same node before we give them @@ -970,5 +970,7 @@ void ctdb_release_all_ips(struct ctdb_context *ctdb); void set_nonblocking(int fd); void set_close_on_exec(int fd); +bool ctdb_lock_node_list(struct ctdb_context *ctdb, bool keep); + #endif -- 2.47.3