From: Alex Rousskov Date: Thu, 29 Sep 2022 21:03:06 +0000 (+0000) Subject: Report (problems with) DEAD-at-start cache_peers (#1155) X-Git-Tag: SQUID_6_0_1~93 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=92c4f6230c940b7a32a448e9d66b18ad09ceb6e5;p=thirdparty%2Fsquid.git Report (problems with) DEAD-at-start cache_peers (#1155) When a cache_peer is (re)configured, Squid probes its address using a throw-away TCP connection. If and only if that TCP probe succeeds, peer selection algorithms may pick the peer. Until then, the cache_peer is treated as if it was DEAD. This change preserves the logic summarized above but starts _reporting_ the initial probe failure and the fact that it marks the cache_peer DEAD. Without these reports, admins often naturally assume that the cache_peer is alive, especially if traffic can be served without it. --- diff --git a/src/CachePeer.h b/src/CachePeer.h index f6b8deaabd..76d1e48193 100644 --- a/src/CachePeer.h +++ b/src/CachePeer.h @@ -143,7 +143,11 @@ public: char *digest_url = nullptr; #endif - int tcp_up = 0; /* 0 if a connect() fails */ + /// The number of failures sufficient to stop selecting this cache_peer. All + /// cache_peer selection algorithms skip cache_peers with 0 tcp_up values. + /// The initial 0 value prevents unprobed cache_peers from being selected. + int tcp_up = 0; + /// whether to do another TCP probe after current TCP probes bool reprobe = false; diff --git a/src/neighbors.cc b/src/neighbors.cc index 70ae456e60..0bc3a87c9a 100644 --- a/src/neighbors.cc +++ b/src/neighbors.cc @@ -1275,32 +1275,35 @@ peerRefreshDNS(void *data) eventAddIsh("peerRefreshDNS", peerRefreshDNS, nullptr, 3600.0, 1); } -static void -peerConnectFailedSilent(CachePeer * p) +// TODO: Move to CachePeer::noteFailure() or similar. +// TODO: Require callers to detail failures instead of using one (and often +// misleading!) "TCP" label for all of them. +void +peerConnectFailed(CachePeer * const p) { p->stats.last_connect_failure = squid_curtime; + if (p->tcp_up > 0) + --p->tcp_up; - if (!p->tcp_up) { - debugs(15, 2, "TCP connection to " << p->host << "/" << p->http_port << - " dead"); - return; - } + // TODO: Report peer name. Same-addresses peers often have different names. - -- p->tcp_up; + const auto consideredAliveByAdmin = p->stats.logged_state == PEER_ALIVE; + const auto level = consideredAliveByAdmin ? DBG_IMPORTANT : 2; + debugs(15, level, "ERROR: TCP connection to " << p->host << "/" << p->http_port << " failed"); - if (!p->tcp_up) { - debugs(15, DBG_IMPORTANT, "Detected DEAD " << neighborTypeStr(p) << ": " << p->name); - p->stats.logged_state = PEER_DEAD; + if (consideredAliveByAdmin) { + if (!p->tcp_up) { + debugs(15, DBG_IMPORTANT, "Detected DEAD " << neighborTypeStr(p) << ": " << p->name); + p->stats.logged_state = PEER_DEAD; + } else { + debugs(15, 2, "additional failures needed to mark this cache_peer DEAD: " << p->tcp_up); + } + } else { + assert(!p->tcp_up); + debugs(15, 2, "cache_peer " << p->host << "/" << p->http_port << " is still DEAD"); } } -void -peerConnectFailed(CachePeer *p) -{ - debugs(15, DBG_IMPORTANT, "ERROR: TCP connection to " << p->host << "/" << p->http_port << " failed"); - peerConnectFailedSilent(p); -} - void peerConnectSucceded(CachePeer * p) { @@ -1368,7 +1371,7 @@ peerProbeConnectDone(const Comm::ConnectionPointer &conn, Comm::Flag status, int if (status == Comm::OK) { peerConnectSucceded(p); } else { - peerConnectFailedSilent(p); + peerConnectFailed(p); } -- p->testing_now;