From 7f02e16143a067b0ee40cdbc60006ccad6c3879a Mon Sep 17 00:00:00 2001
From: Ronnie Sahlberg
Date: Mon, 27 Aug 2007 10:31:22 +1000
Subject: [PATCH] add async versions of the freeze node control and freeze all nodes in parallell

(This used to be ctdb commit f34e89f54d9f4380e76eb1b5b2385a4d8500b505)
---
Note: this copy was rebuilt from a whitespace-collapsed extraction. Line
breaks and indentation are reconstructed to match the hunk line counts, and
both loop headers that had been garbled to "jnum" are restored to
"j<nodemap->num". Exact original whitespace and the author's e-mail address
were not recoverable from that copy.

 ctdb/client/ctdb_client.c   |  39 +++++++++++---
 ctdb/include/ctdb.h         |  11 +++-
 ctdb/server/ctdb_recoverd.c | 104 +++++++++++++++++++++++++++++++-----
 3 files changed, 132 insertions(+), 22 deletions(-)

diff --git a/ctdb/client/ctdb_client.c b/ctdb/client/ctdb_client.c
index 0aa0e9b97ed..91351befd49 100644
--- a/ctdb/client/ctdb_client.c
+++ b/ctdb/client/ctdb_client.c
@@ -1807,24 +1807,49 @@ int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t
 
 
 /*
-  freeze a node
+  async freeze send control
  */
-int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+struct ctdb_client_control_state *
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct timeval timeout, uint32_t destnode)
+{
+	return ctdb_control_send(ctdb, destnode, 0,
+			   CTDB_CONTROL_FREEZE, 0, tdb_null,
+			   mem_ctx, NULL, &timeout, NULL);
+}
+
+/*
+  async freeze recv control
+*/
+int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx, struct ctdb_client_control_state *state)
 {
 	int ret;
 	int32_t res;
 
-	ret = ctdb_control(ctdb, destnode, 0,
-			   CTDB_CONTROL_FREEZE, 0, tdb_null,
-			   NULL, NULL, &res, &timeout, NULL);
-	if (ret != 0 || res != 0) {
-		DEBUG(0,(__location__ " ctdb_control freeze failed\n"));
+	ret = ctdb_control_recv(ctdb, state, mem_ctx, NULL, &res, NULL);
+	if ( (ret != 0) || (res != 0) ){
+		DEBUG(0,(__location__ " ctdb_ctrl_freeze_recv failed\n"));
 		return -1;
 	}
 
 	return 0;
 }
 
+/*
+  freeze a node
+ */
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode)
+{
+	TALLOC_CTX *tmp_ctx = talloc_new(ctdb);
+	struct ctdb_client_control_state *state;
+	int ret;
+
+	state = ctdb_ctrl_freeze_send(ctdb, tmp_ctx, timeout, destnode);
+	ret = ctdb_ctrl_freeze_recv(ctdb, tmp_ctx, state);
+	talloc_free(tmp_ctx);
+
+	return ret;
+}
+
 /*
   thaw a node
  */
diff --git a/ctdb/include/ctdb.h b/ctdb/include/ctdb.h
index e46cc68a794..133b667cc93 100644
--- a/ctdb/include/ctdb.h
+++ b/ctdb/include/ctdb.h
@@ -379,7 +379,16 @@ int ctdb_dump_db(struct ctdb_db_context *ctdb_db, FILE *f);
  */
 int ctdb_ctrl_getpid(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t *pid);
 
-int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
+int ctdb_ctrl_freeze(struct ctdb_context *ctdb, struct timeval timeout,
+			uint32_t destnode);
+
+struct ctdb_client_control_state *
+ctdb_ctrl_freeze_send(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+			struct timeval timeout, uint32_t destnode);
+
+int ctdb_ctrl_freeze_recv(struct ctdb_context *ctdb, TALLOC_CTX *mem_ctx,
+			struct ctdb_client_control_state *state);
+
 int ctdb_ctrl_thaw(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
 
 int ctdb_ctrl_getvnn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode);
diff --git a/ctdb/server/ctdb_recoverd.c b/ctdb/server/ctdb_recoverd.c
index 38ecf35bdcc..64bb0cf5193 100644
--- a/ctdb/server/ctdb_recoverd.c
+++ b/ctdb/server/ctdb_recoverd.c
@@ -116,6 +116,87 @@ static void ctdb_ban_node(struct ctdb_recoverd *rec, uint32_t vnn, uint32_t ban_
 	}
 }
 
+enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
+
+
+struct freeze_node_data {
+	uint32_t count;
+	enum monitor_result status;
+};
+
+
+static void freeze_node_callback(struct ctdb_client_control_state *state)
+{
+	struct freeze_node_data *fndata = talloc_get_type(state->async.private, struct freeze_node_data);
+
+
+	/* one more node has responded to our freeze node*/
+	fndata->count--;
+
+	/* if we failed to freeze the node, we must trigger another recovery */
+	if ( (state->state != CTDB_CONTROL_DONE) || (state->status != 0) ) {
+		DEBUG(0, (__location__ " Failed to freeze node:%u. recovery failed\n", state->c->hdr.destnode));
+		fndata->status = MONITOR_RECOVERY_NEEDED;
+	}
+
+	return;
+}
+
+
+
+/* freeze all nodes */
+static enum monitor_result freeze_all_nodes(struct ctdb_context *ctdb, struct ctdb_node_map *nodemap)
+{
+	struct freeze_node_data *fndata;
+	TALLOC_CTX *mem_ctx = talloc_new(ctdb);
+	struct ctdb_client_control_state *state;
+	enum monitor_result status;
+	int j;
+
+	fndata = talloc(mem_ctx, struct freeze_node_data);
+	CTDB_NO_MEMORY_FATAL(ctdb, fndata);
+	fndata->count  = 0;
+	fndata->status = MONITOR_OK;
+
+	/* loop over all active nodes and send an async freeze call to
+	   them*/
+	for (j=0; j<nodemap->num; j++) {
+		if (nodemap->nodes[j].flags & NODE_FLAGS_INACTIVE) {
+			continue;
+		}
+		state = ctdb_ctrl_freeze_send(ctdb, mem_ctx,
+					CONTROL_TIMEOUT(),
+					nodemap->nodes[j].vnn);
+		if (state == NULL) {
+			/* we failed to send the control, treat this as
+			   an error and try again next iteration
+			*/
+			DEBUG(0,("Failed to call ctdb_ctrl_freeze_send during recovery\n"));
+			talloc_free(mem_ctx);
+			return MONITOR_RECOVERY_NEEDED;
+		}
+
+		/* set up the callback functions */
+		state->async.fn = freeze_node_callback;
+		state->async.private = fndata;
+
+		/* one more control to wait for to complete */
+		fndata->count++;
+	}
+
+
+	/* now wait for up to the maximum number of seconds allowed
+	   or until all nodes we expect a response from has replied
+	*/
+	while (fndata->count > 0) {
+		event_loop_once(ctdb->ev);
+	}
+
+	status = fndata->status;
+	talloc_free(mem_ctx);
+	return status;
+}
+
 
 /*
   change recovery mode on all nodes
@@ -124,10 +205,15 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
 {
 	int j, ret;
 
-	/* start the freeze process immediately on all nodes */
-	ctdb_control(ctdb, CTDB_BROADCAST_CONNECTED, 0,
-		     CTDB_CONTROL_FREEZE, CTDB_CTRL_FLAG_NOREPLY, tdb_null,
-		     NULL, NULL, NULL, NULL, NULL);
+	/* freeze all nodes */
+	if (rec_mode == CTDB_RECOVERY_ACTIVE) {
+		ret = freeze_all_nodes(ctdb, nodemap);
+		if (ret != MONITOR_OK) {
+			DEBUG(0, (__location__ " Unable to freeze nodes. Recovery failed.\n"));
+			return -1;
+		}
+	}
+
 
 	/* set recovery mode to active on all nodes */
 	for (j=0; j<nodemap->num; j++) {
@@ -136,14 +222,6 @@ static int set_recovery_mode(struct ctdb_context *ctdb, struct ctdb_node_map *no
 			continue;
 		}
 
-		if (rec_mode == CTDB_RECOVERY_ACTIVE) {
-			ret = ctdb_ctrl_freeze(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn);
-			if (ret != 0) {
-				DEBUG(0, (__location__ " Unable to freeze node %u\n", nodemap->nodes[j].vnn));
-				return -1;
-			}
-		}
-
 		ret = ctdb_ctrl_setrecmode(ctdb, CONTROL_TIMEOUT(), nodemap->nodes[j].vnn, rec_mode);
 		if (ret != 0) {
 			DEBUG(0, (__location__ " Unable to set recmode on node %u\n", nodemap->nodes[j].vnn));
@@ -1148,8 +1226,6 @@ static void monitor_handler(struct ctdb_context *ctdb, uint64_t srvid,
 }
 
 
-enum monitor_result { MONITOR_OK, MONITOR_RECOVERY_NEEDED, MONITOR_ELECTION_NEEDED, MONITOR_FAILED};
-
 
 struct verify_recmode_normal_data {
 	uint32_t count;
-- 
2.47.3