From: Andrew Tridgell Date: Fri, 1 Jun 2007 09:05:41 +0000 (+1000) Subject: make the running of the takeover and release event scripts async, to prevent outages... X-Git-Tag: tevent-0.9.20~348^2~2614 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7db1d04d5cdc548b65b57bdfca03e80a2d5dc5c5;p=thirdparty%2Fsamba.git make the running of the takeover and release event scripts async, to prevent outages due to slow scripts (This used to be ctdb commit 4189be97eee7ab2a50335c860f2fcd9566667d01) --- diff --git a/ctdb/common/ctdb.c b/ctdb/common/ctdb.c index d957e372f98..273d40236c7 100644 --- a/ctdb/common/ctdb.c +++ b/ctdb/common/ctdb.c @@ -41,7 +41,7 @@ int ctdb_set_transport(struct ctdb_context *ctdb, const char *transport) int ctdb_set_logfile(struct ctdb_context *ctdb, const char *logfile) { ctdb->logfile = talloc_strdup(ctdb, logfile); - if (ctdb->logfile != NULL) { + if (ctdb->logfile != NULL && strcmp(logfile, "-") != 0) { int fd; close(1); close(2); diff --git a/ctdb/common/ctdb_control.c b/ctdb/common/ctdb_control.c index ac677ac1c32..319adfc6e1f 100644 --- a/ctdb/common/ctdb_control.c +++ b/ctdb/common/ctdb_control.c @@ -265,11 +265,11 @@ static int32_t ctdb_control_dispatch(struct ctdb_context *ctdb, case CTDB_CONTROL_TAKEOVER_IP: CHECK_CONTROL_DATA_SIZE(sizeof(struct sockaddr)); - return ctdb_control_takeover_ip(ctdb, indata); + return ctdb_control_takeover_ip(ctdb, c, indata, async_reply); case CTDB_CONTROL_RELEASE_IP: CHECK_CONTROL_DATA_SIZE(sizeof(struct sockaddr)); - return ctdb_control_release_ip(ctdb, indata); + return ctdb_control_release_ip(ctdb, c, indata, async_reply); case CTDB_CONTROL_DELETE_LOW_RSN: CHECK_CONTROL_DATA_SIZE(sizeof(struct ctdb_control_delete_low_rsn)); diff --git a/ctdb/common/ctdb_daemon.c b/ctdb/common/ctdb_daemon.c index b98981b9dc5..3309d375e48 100644 --- a/ctdb/common/ctdb_daemon.c +++ b/ctdb/common/ctdb_daemon.c @@ -32,7 +32,7 @@ static void daemon_incoming_packet(void *, struct ctdb_req_header *); /* called when the "startup" event script has finished */ -static void ctdb_start_transport(struct ctdb_context *ctdb, int status) +static void ctdb_start_transport(struct ctdb_context *ctdb, int status, void *p) { if (status != 0) { DEBUG(0,("startup event failed!\n")); @@ -87,7 +87,8 @@ static void ctdb_main_loop(struct ctdb_context *ctdb) CTDB_CTRL_FLAG_NOREPLY, tdb_null, NULL, NULL); - ret = ctdb_event_script_callback(ctdb, ctdb_start_transport, "startup"); + ret = ctdb_event_script_callback(ctdb, ctdb, + ctdb_start_transport, NULL, "startup"); if (ret != 0) { DEBUG(0,("Failed startup event script\n")); return; diff --git a/ctdb/include/ctdb_private.h b/ctdb/include/ctdb_private.h index d3320ac163e..f2087e8e90c 100644 --- a/ctdb/include/ctdb_private.h +++ b/ctdb/include/ctdb_private.h @@ -925,10 +925,16 @@ int ctdb_ctrl_set_rsn_nonempty(struct ctdb_context *ctdb, struct timeval timeout int ctdb_ctrl_delete_low_rsn(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, uint32_t db_id, uint64_t rsn); void ctdb_set_realtime(void); -int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata); +int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, + struct ctdb_req_control *c, + TDB_DATA indata, + bool *async_reply); int ctdb_ctrl_takeover_ip(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *ip); -int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata); +int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, + struct ctdb_req_control *c, + TDB_DATA indata, + bool *async_reply); int ctdb_ctrl_release_ip(struct ctdb_context *ctdb, struct timeval timeout, uint32_t destnode, const char *ip); @@ -951,8 +957,10 @@ int32_t ctdb_control_startup(struct ctdb_context *ctdb, uint32_t vnn); void ctdb_takeover_client_destructor_hook(struct ctdb_client *client); int ctdb_event_script(struct ctdb_context *ctdb, const char *fmt, ...) PRINTF_ATTRIBUTE(2,3); int ctdb_event_script_callback(struct ctdb_context *ctdb, - void (*callback)(struct ctdb_context *, int), - const char *fmt, ...) PRINTF_ATTRIBUTE(3,4); + TALLOC_CTX *mem_ctx, + void (*callback)(struct ctdb_context *, int, void *), + void *private_data, + const char *fmt, ...) PRINTF_ATTRIBUTE(5,6); void ctdb_release_all_ips(struct ctdb_context *ctdb); void set_nonblocking(int fd); diff --git a/ctdb/takeover/ctdb_takeover.c b/ctdb/takeover/ctdb_takeover.c index d5fcfcee64c..af250f570bc 100644 --- a/ctdb/takeover/ctdb_takeover.c +++ b/ctdb/takeover/ctdb_takeover.c @@ -91,52 +91,48 @@ static void ctdb_control_send_arp(struct event_context *ev, struct timed_event * ctdb_control_send_arp, arp); } +struct takeover_callback_state { + struct ctdb_req_control *c; + struct sockaddr_in *sin; +}; /* - take over an ip address + called when takeip event finishes */ -int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata) +static void takeover_ip_callback(struct ctdb_context *ctdb, int status, + void *private_data) { - int ret; - struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr; + struct takeover_callback_state *state = + talloc_get_type(private_data, struct takeover_callback_state); struct ctdb_takeover_arp *arp; - char *ip = inet_ntoa(sin->sin_addr); + char *ip = inet_ntoa(state->sin->sin_addr); struct ctdb_tcp_list *tcp; - if (ctdb_sys_have_ip(ip)) { - return 0; - } - - DEBUG(0,("Takover of IP %s/%u on interface %s\n", - ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits, - ctdb->takeover.interface)); - ret = ctdb_event_script(ctdb, "takeip %s %s %u", - ctdb->takeover.interface, - ip, - ctdb->nodes[ctdb->vnn]->public_netmask_bits); - if (ret != 0) { + if (status != 0) { DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n", ip, ctdb->takeover.interface)); - return -1; + ctdb_request_control_reply(ctdb, state->c, NULL, status, NULL); + talloc_free(state); + return; } if (!ctdb->takeover.last_ctx) { ctdb->takeover.last_ctx = talloc_new(ctdb); - CTDB_NO_MEMORY(ctdb, ctdb->takeover.last_ctx); + if (!ctdb->takeover.last_ctx) goto failed; } arp = talloc_zero(ctdb->takeover.last_ctx, struct ctdb_takeover_arp); - CTDB_NO_MEMORY(ctdb, arp); + if (!arp) goto failed; arp->ctdb = ctdb; - arp->sin = *sin; + arp->sin = *state->sin; /* add all of the known tcp connections for this IP to the list of tcp connections to send tickle acks for */ for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) { - if (sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) { + if (state->sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) { struct ctdb_tcp_list *t2 = talloc(arp, struct ctdb_tcp_list); - CTDB_NO_MEMORY(ctdb, t2); + if (t2 == NULL) goto failed; *t2 = *tcp; DLIST_ADD(arp->tcp_list, t2); } @@ -145,42 +141,78 @@ int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, TDB_DATA indata) event_add_timed(arp->ctdb->ev, arp->ctdb->takeover.last_ctx, timeval_zero(), ctdb_control_send_arp, arp); - return ret; + /* the control succeeded */ + ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL); + talloc_free(state); + return; + +failed: + ctdb_request_control_reply(ctdb, state->c, NULL, -1, NULL); + talloc_free(state); + return; } /* - release an ip address + take over an ip address */ -int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata) +int32_t ctdb_control_takeover_ip(struct ctdb_context *ctdb, + struct ctdb_req_control *c, + TDB_DATA indata, + bool *async_reply) { + int ret; + struct takeover_callback_state *state; struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr; - TDB_DATA data; char *ip = inet_ntoa(sin->sin_addr); - int ret; - struct ctdb_tcp_list *tcp; - if (!ctdb_sys_have_ip(ip)) { + /* if our kernel already has this IP, do nothing */ + if (ctdb_sys_have_ip(ip)) { return 0; } - DEBUG(0,("Release of IP %s/%u on interface %s\n", + state = talloc(ctdb, struct takeover_callback_state); + CTDB_NO_MEMORY(ctdb, state); + + state->c = talloc_steal(ctdb, c); + state->sin = talloc(ctdb, struct sockaddr_in); + CTDB_NO_MEMORY(ctdb, state->sin); + *state->sin = *(struct sockaddr_in *)indata.dptr; + + DEBUG(0,("Takover of IP %s/%u on interface %s\n", ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits, ctdb->takeover.interface)); - /* stop any previous arps */ - talloc_free(ctdb->takeover.last_ctx); - ctdb->takeover.last_ctx = NULL; - - ret = ctdb_event_script(ctdb, "releaseip %s %s %u", - ctdb->takeover.interface, - ip, - ctdb->nodes[ctdb->vnn]->public_netmask_bits); + ret = ctdb_event_script_callback(ctdb, state, takeover_ip_callback, state, + "takeip %s %s %u", + ctdb->takeover.interface, + ip, + ctdb->nodes[ctdb->vnn]->public_netmask_bits); if (ret != 0) { - DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n", + DEBUG(0,(__location__ " Failed to takeover IP %s on interface %s\n", ip, ctdb->takeover.interface)); + talloc_free(state); return -1; } + /* tell ctdb_control.c that we will be replying asynchronously */ + *async_reply = true; + + return 0; +} + + +/* + called when releaseip event finishes + */ +static void release_ip_callback(struct ctdb_context *ctdb, int status, + void *private_data) +{ + struct takeover_callback_state *state = + talloc_get_type(private_data, struct takeover_callback_state); + char *ip = inet_ntoa(state->sin->sin_addr); + TDB_DATA data; + struct ctdb_tcp_list *tcp; + /* send a message to all clients of this node telling them that the cluster has been reconfigured and they should release any sockets on this IP */ @@ -192,7 +224,7 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata) /* tell other nodes about any tcp connections we were holding with this IP */ for (tcp=ctdb->tcp_list;tcp;tcp=tcp->next) { if (tcp->vnn == ctdb->vnn && - sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) { + state->sin->sin_addr.s_addr == tcp->daddr.sin_addr.s_addr) { struct ctdb_control_tcp_vnn t; t.vnn = ctdb->vnn; @@ -208,6 +240,59 @@ int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, TDB_DATA indata) } } + /* the control succeeded */ + ctdb_request_control_reply(ctdb, state->c, NULL, 0, NULL); + talloc_free(state); +} + + +/* + release an ip address + */ +int32_t ctdb_control_release_ip(struct ctdb_context *ctdb, + struct ctdb_req_control *c, + TDB_DATA indata, + bool *async_reply) +{ + int ret; + struct takeover_callback_state *state; + struct sockaddr_in *sin = (struct sockaddr_in *)indata.dptr; + char *ip = inet_ntoa(sin->sin_addr); + + if (!ctdb_sys_have_ip(ip)) { + return 0; + } + + DEBUG(0,("Release of IP %s/%u on interface %s\n", + ip, ctdb->nodes[ctdb->vnn]->public_netmask_bits, + ctdb->takeover.interface)); + + /* stop any previous arps */ + talloc_free(ctdb->takeover.last_ctx); + ctdb->takeover.last_ctx = NULL; + + state = talloc(ctdb, struct takeover_callback_state); + CTDB_NO_MEMORY(ctdb, state); + + state->c = talloc_steal(state, c); + state->sin = talloc(state, struct sockaddr_in); + CTDB_NO_MEMORY(ctdb, state->sin); + *state->sin = *(struct sockaddr_in *)indata.dptr; + + ret = ctdb_event_script_callback(ctdb, state, release_ip_callback, state, + "releaseip %s %s %u", + ctdb->takeover.interface, + ip, + ctdb->nodes[ctdb->vnn]->public_netmask_bits); + if (ret != 0) { + DEBUG(0,(__location__ " Failed to release IP %s on interface %s\n", + ip, ctdb->takeover.interface)); + talloc_free(state); + return -1; + } + + /* tell the control that we will be reply asynchronously */ + *async_reply = true; return 0; } diff --git a/ctdb/takeover/system.c b/ctdb/takeover/system.c index 59016e2c372..cff122f35b0 100644 --- a/ctdb/takeover/system.c +++ b/ctdb/takeover/system.c @@ -312,8 +312,9 @@ int ctdb_event_script(struct ctdb_context *ctdb, const char *fmt, ...) struct ctdb_event_script_state { struct ctdb_context *ctdb; pid_t child; - void (*callback)(struct ctdb_context *, int); + void (*callback)(struct ctdb_context *, int, void *); int fd[2]; + void *private_data; }; /* called when child is finished */ @@ -327,28 +328,41 @@ static void ctdb_event_script_handler(struct event_context *ev, struct fd_event if (status != -1) { status = WEXITSTATUS(status); } - state->callback(state->ctdb, status); + state->callback(state->ctdb, status, state->private_data); + talloc_set_destructor(state, NULL); talloc_free(state); } +/* + destroy a running event script + */ +static int event_script_destructor(struct ctdb_event_script_state *state) +{ + kill(state->child, SIGKILL); + waitpid(state->child, NULL, 0); + return 0; +} /* run the event script in the background, calling the callback when finished */ int ctdb_event_script_callback(struct ctdb_context *ctdb, - void (*callback)(struct ctdb_context *, int), + TALLOC_CTX *mem_ctx, + void (*callback)(struct ctdb_context *, int, void *), + void *private_data, const char *fmt, ...) { struct ctdb_event_script_state *state; va_list ap; int ret; - state = talloc(ctdb, struct ctdb_event_script_state); + state = talloc(mem_ctx, struct ctdb_event_script_state); CTDB_NO_MEMORY(ctdb, state); state->ctdb = ctdb; state->callback = callback; + state->private_data = private_data; ret = pipe(state->fd); if (ret != 0) { @@ -373,6 +387,8 @@ int ctdb_event_script_callback(struct ctdb_context *ctdb, _exit(ret); } + talloc_set_destructor(state, event_script_destructor); + close(state->fd[1]); event_add_fd(ctdb->ev, state, state->fd[0], EVENT_FD_READ|EVENT_FD_AUTOCLOSE,