From: Aurelien DARRAGON Date: Tue, 12 Dec 2023 13:42:34 +0000 (+0100) Subject: MEDIUM: server/dns: clear RMAINT when addr resolves again X-Git-Tag: v3.0-dev1~54 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=334ebfa1a2c7184b8c9b9e2732dec1d61f8a1d92;p=thirdparty%2Fhaproxy.git MEDIUM: server/dns: clear RMAINT when addr resolves again snr_update_srv_status() and srvrq_update_srv_status() will both set or clear the server RMAINT state depending of the result of the current dns resolution. This used to work pretty well in the past, but now that addr:svc_port changes are changed atomically through a dedicated task, the change is performed asynchronously, so this can cause some flapping issues if the server is put out of maintenance while the server's address is still unassigned. To prevent errors, the resolver's code is now only allowed to put the server under maintenance but not to remove it from maintenance: the decision to remove a server from maintenance is performed by the task responsible for updating the server's addr: if the addr resolves again thanks to a valid DNS resolution and the server was previously under RMAINT, then it cleared from RMAINT state. srvrq_update_srv_status() was renamed srvrq_set_srv_down(), since it is only called to put the server in maintenance as a result of a failing SRV entry. snr_update_srv_status() was renamed srv_set_srv_down() and slightly modified so that it only takes care of putting the server under maintenance when needed. The cli command "set server x/y addr" does not need to remove the RMAINT flag anymore. --- diff --git a/include/haproxy/server.h b/include/haproxy/server.h index 2298991536..9b451ce24e 100644 --- a/include/haproxy/server.h +++ b/include/haproxy/server.h @@ -78,8 +78,7 @@ void srv_event_hdl_publish_check(struct server *srv, struct check *check); /* functions related to server name resolution */ int srv_prepare_for_resolution(struct server *srv, const char *hostname); -int srvrq_update_srv_status(struct server *s, int has_no_ip); -int snr_update_srv_status(struct server *s, int has_no_ip); +int srvrq_set_srv_down(struct server *s); int srv_set_fqdn(struct server *srv, const char *fqdn, int resolv_locked); const char *srv_update_fqdn(struct server *server, const char *fqdn, const char *updater, int dns_locked); int snr_resolution_cb(struct resolv_requester *requester, struct dns_counters *counters); diff --git a/src/resolvers.c b/src/resolvers.c index 8f219f1928..e02c8a5612 100644 --- a/src/resolvers.c +++ b/src/resolvers.c @@ -661,7 +661,7 @@ static void resolv_srvrq_cleanup_srv(struct server *srv) { _resolv_unlink_resolution(srv->resolv_requester); HA_SPIN_LOCK(SERVER_LOCK, &srv->lock); - srvrq_update_srv_status(srv, 1); + srvrq_set_srv_down(srv); ha_free(&srv->hostname); ha_free(&srv->hostname_dn); srv->hostname_dn_len = 0; @@ -887,9 +887,6 @@ srv_found: resolv_link_resolution(srv, OBJ_TYPE_SERVER, 1); } - /* Update the server status */ - srvrq_update_srv_status(srv, (srv->addr.ss_family != AF_INET && srv->addr.ss_family != AF_INET6)); - if (!srv->resolv_opts.ignore_weight) { char weight[9]; int ha_weight; diff --git a/src/server.c b/src/server.c index 44a757069f..c3ddcb2a67 100644 --- a/src/server.c +++ b/src/server.c @@ -307,6 +307,22 @@ static struct task *server_atomic_sync(struct task *task, void *context, unsigne data->safe.next.port.svc, data->safe.next.port.map); /* propagate the changes, force connection cleanup */ + if (new_addr.ss_family != AF_UNSPEC && + (srv->next_admin & SRV_ADMF_RMAINT)) { + /* server was previously put under DNS maintenance due + * to DNS error, but addr resolves again, so we must + * put it out of maintenance + */ + srv_clr_admin_flag(srv, SRV_ADMF_RMAINT); + + /* thanks to valid DNS resolution? */ + if (data->safe.updater.dns) { + chunk_reset(&trash); + chunk_printf(&trash, "Server %s/%s administratively READY thanks to valid DNS answer", srv->proxy->id, srv->id); + ha_warning("%s.\n", trash.area); + send_log(srv->proxy, LOG_NOTICE, "%s.\n", trash.area); + } + } srv_cleanup_connections(srv); srv_set_dyncookie(srv); srv_set_addr_desc(srv, 1); @@ -4225,24 +4241,18 @@ const char *srv_update_addr_port(struct server *s, const char *addr, const char } /* - * update server status based on result of SRV resolution + * put the server in maintenance because of failing SRV resolution * returns: - * 0 if server status is updated + * 0 if server was put under maintenance * 1 if server status has not changed * * Must be called with the server lock held. */ -int srvrq_update_srv_status(struct server *s, int has_no_ip) +int srvrq_set_srv_down(struct server *s) { if (!s->srvrq) return 1; - /* since this server has an IP, it can go back in production */ - if (has_no_ip == 0) { - srv_clr_admin_flag(s, SRV_ADMF_RMAINT); - return 1; - } - if (s->next_admin & SRV_ADMF_RMAINT) return 1; @@ -4251,22 +4261,22 @@ int srvrq_update_srv_status(struct server *s, int has_no_ip) } /* - * update server status based on result of name resolution + * put server under maintenance as a result of name resolution * returns: - * 0 if server status is updated + * 0 if server was put under maintenance * 1 if server status has not changed * * Must be called with the server lock held. */ -int snr_update_srv_status(struct server *s, int has_no_ip) +int snr_set_srv_down(struct server *s, int has_no_ip) { struct resolvers *resolvers = s->resolvers; struct resolv_resolution *resolution = (s->resolv_requester ? s->resolv_requester->resolution : NULL); int exp; /* If resolution is NULL we're dealing with SRV records Additional records */ - if (resolution == NULL) - return srvrq_update_srv_status(s, has_no_ip); + if (resolution == NULL && has_no_ip) + return srvrq_set_srv_down(s); switch (resolution->status) { case RSLV_STATUS_NONE: @@ -4275,25 +4285,16 @@ int snr_update_srv_status(struct server *s, int has_no_ip) break; case RSLV_STATUS_VALID: - /* - * resume health checks - * server will be turned back on if health check is safe - */ if (has_no_ip) { + /* + * valid resolution but no usable server address + */ if (s->next_admin & SRV_ADMF_RMAINT) return 1; srv_set_admin_flag(s, SRV_ADMF_RMAINT, SRV_ADM_STCHGC_DNS_NOIP); return 0; } - if (!(s->next_admin & SRV_ADMF_RMAINT)) - return 1; - srv_clr_admin_flag(s, SRV_ADMF_RMAINT); - chunk_printf(&trash, "Server %s/%s administratively READY thanks to valid DNS answer", - s->proxy->id, s->id); - - ha_warning("%s.\n", trash.area); - send_log(s->proxy, LOG_NOTICE, "%s.\n", trash.area); return 0; case RSLV_STATUS_NX: @@ -4447,7 +4448,7 @@ int snr_resolution_cb(struct resolv_requester *requester, struct dns_counters *c srv_update_addr(s, firstip, firstip_sin_family, SERVER_INETADDR_UPDATER_DNS_CACHE); update_status: - if (!snr_update_srv_status(s, has_no_ip) && has_no_ip) + if (!snr_set_srv_down(s, has_no_ip) && has_no_ip) memset(&s->addr, 0, sizeof(s->addr)); return 1; @@ -4456,7 +4457,7 @@ int snr_resolution_cb(struct resolv_requester *requester, struct dns_counters *c counters->app.resolver.invalid++; goto update_status; } - if (!snr_update_srv_status(s, has_no_ip) && has_no_ip) + if (!snr_set_srv_down(s, has_no_ip) && has_no_ip) memset(&s->addr, 0, sizeof(s->addr)); return 0; } @@ -4537,7 +4538,7 @@ int snr_resolution_error_cb(struct resolv_requester *requester, int error_code) return 0; HA_SPIN_LOCK(SERVER_LOCK, &s->lock); - if (!snr_update_srv_status(s, 1)) { + if (!snr_set_srv_down(s, 1)) { memset(&s->addr, 0, sizeof(s->addr)); HA_SPIN_UNLOCK(SERVER_LOCK, &s->lock); resolv_detach_from_resolution_answer_items(requester->resolution, requester); @@ -5073,7 +5074,6 @@ static int cli_parse_set_server(char **args, char *payload, struct appctx *appct warning = srv_update_addr_port(sv, addr, port, SERVER_INETADDR_UPDATER_CLI); if (warning) cli_msg(appctx, LOG_WARNING, warning); - srv_clr_admin_flag(sv, SRV_ADMF_RMAINT); HA_SPIN_UNLOCK(SERVER_LOCK, &sv->lock); } else if (strcmp(args[3], "fqdn") == 0) {