From: Vladimír Čunát <vladimir.cunat@nic.cz>
Date: Mon, 7 Mar 2022 17:04:05 +0000 (+0100)
Subject: lib/selection: fix interaction of timeouts with reboots
X-Git-Tag: v5.5.0~2^2~1
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=03d24b4b7e5d473bc988551ebc00d955dae01e2b;p=thirdparty%2Fknot-resolver.git

lib/selection: fix interaction of timeouts with reboots

We use "monotonic" time-stamps for the dead_since field;
that breaks on system reboots, in which case we reset the stats.
(if the server was categorized as dead)

If the server times out afterwards, we'd fail the condition
`cur_state.consecutive_timeouts == old_state.consecutive_timeouts`
so its stats would not update.  Therefore we'd get stuck forever
in a state where the unusable server has high priority (no_rtt_info).

This commit changes a bit more than was necessary to fix this,
including precision of the stats (in some cases).
---

diff --git a/NEWS b/NEWS
index 0d90b3cdf..7686faa78 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,7 @@ Bugfixes
 - doh2: fix CORS by adding `access-control-allow-origin: *` (!1246)
 - net: fix listen by interface - add interface suffix to link-local IPv6 (!1253)
 - daemon/tls: fix resumption for outgoing TLS (e.g. TLS_FORWARD) (!1261)
+- nameserver selection: fix interaction of timeouts with reboots (#722, !1269)
 
 
 Knot Resolver 5.4.4 (2022-01-05)
diff --git a/lib/selection.c b/lib/selection.c
index 5bcc2fda3..c6eb93eff 100644
--- a/lib/selection.c
+++ b/lib/selection.c
@@ -19,7 +19,7 @@
 #define DEFAULT_TIMEOUT 400
 #define MAX_TIMEOUT 10000
 #define EXPLORE_TIMEOUT_COEFFICIENT 2
-#define MAX_BACKOFF 5
+#define MAX_BACKOFF 8
 #define MINIMAL_TIMEOUT_ADDITION 20
 
 /* After TCP_TIMEOUT_THRESHOLD timeouts one transport, we'll switch to TCP. */
@@ -255,7 +255,7 @@ static void invalidate_dead_upstream(struct address_state *state,
 				     unsigned int retry_timeout)
 {
 	struct rtt_state *rs = &state->rtt_state;
-	if (rs->consecutive_timeouts >= KR_NS_TIMEOUT_ROW_DEAD) {
+	if (rs->dead_since) {
 		uint64_t now = kr_now();
 		if (now < rs->dead_since) {
 			// broken continuity of timestamp (reboot, different machine, etc.)
@@ -590,36 +590,50 @@ void update_rtt(struct kr_query *qry, struct address_state *addr_state,
 	}
 }
 
-static void cache_timeout(const struct kr_query *qry, const struct kr_transport *transport,
+/// Update rtt_state (including caching) after a server timed out.
+static void server_timeout(const struct kr_query *qry, const struct kr_transport *transport,
 			  struct address_state *addr_state, struct kr_cache *cache)
 {
-	if (transport->deduplicated) {
-		/* Transport was chosen by a different query, that one will
-		 * cache the result. */
+	// Make sure that the timeout wasn't capped; see kr_transport::timeout_capped
+	if (transport->timeout_capped)
 		return;
-	}
 
-	uint8_t *address = ip_to_bytes(&transport->address, transport->address_len);
+	const uint8_t *address = ip_to_bytes(&transport->address, transport->address_len);
 	if (transport->address_len == sizeof(struct in6_addr))
 		no6_timed_out(qry, address);
 
-	struct rtt_state old_state = addr_state->rtt_state;
-	struct rtt_state cur_state =
-		get_rtt_state(address, transport->address_len, cache);
-
-	/* We could lose some update from some other process by doing this,
-	 * but at least timeout count can't blow up. */
-	if (cur_state.consecutive_timeouts == old_state.consecutive_timeouts) {
-		if (++cur_state.consecutive_timeouts >=
-		    KR_NS_TIMEOUT_ROW_DEAD) {
-			cur_state.dead_since = kr_now();
-		}
-		put_rtt_state(address, transport->address_len, cur_state, cache);
+	struct rtt_state *state = &addr_state->rtt_state;
+	// While we were waiting for timeout, the stats might have changed considerably,
+	// so let's overwrite what we had by fresh cache contents.
+	// This is useful when the address is busy (we query it concurrently).
+	*state = get_rtt_state(address, transport->address_len, cache);
+
+	++state->consecutive_timeouts;
+	// Avoid overflow; we don't utilize very high values anyway (arbitrary limit).
+	state->consecutive_timeouts = MIN(64, state->consecutive_timeouts);
+	if (state->consecutive_timeouts >= KR_NS_TIMEOUT_ROW_DEAD) {
+		// Only mark as dead if we waited long enough,
+		// so that many (concurrent) short attempts can't cause the dead state.
+		if (transport->timeout >= KR_NS_TIMEOUT_MIN_DEAD_TIMEOUT)
+			state->dead_since = kr_now();
+	}
+
+	// If transport was chosen by a different query, that one will cache it.
+	if (!transport->deduplicated) {
+		put_rtt_state(address, transport->address_len, *state, cache);
 	} else {
-		/* `get_rtt_state` opens a cache transaction, we have to end it. */
-		kr_cache_commit(cache);
+		kr_cache_commit(cache); // Avoid any risk of long transaction.
 	}
 }
+// Not everything can be checked in nice ways like static_assert()
+static __attribute__((constructor)) void test_RTT_consts(void)
+{
+	// See KR_NS_TIMEOUT_MIN_DEAD_TIMEOUT above.
+	kr_require(
+		calc_timeout((struct rtt_state){ .consecutive_timeouts = MAX_BACKOFF, })
+		 >= KR_NS_TIMEOUT_MIN_DEAD_TIMEOUT
+	);
+}
 
 void error(struct kr_query *qry, struct address_state *addr_state,
 	   const struct kr_transport *transport,
@@ -644,12 +658,7 @@ void error(struct kr_query *qry, struct address_state *addr_state,
 	case KR_SELECTION_TLS_HANDSHAKE_FAILED:
 	case KR_SELECTION_QUERY_TIMEOUT:
 		qry->server_selection.local_state->timeouts++;
-		/* Make sure that the query was chosen by this query and timeout wasn't capped
-		 * (see kr_transport::timeout_capped for details). */
-		if (!transport->deduplicated && !transport->timeout_capped) {
-			cache_timeout(qry, transport, addr_state,
-				      &qry->request->ctx->cache);
-		}
+		server_timeout(qry, transport, addr_state, &qry->request->ctx->cache);
 		break;
 	case KR_SELECTION_FORMERR:
 		if (qry->flags.NO_EDNS) {
diff --git a/lib/selection.h b/lib/selection.h
index 9e712cc45..e35eb7506 100644
--- a/lib/selection.h
+++ b/lib/selection.h
@@ -12,8 +12,11 @@
 
 #include "lib/cache/api.h"
 
-/* After KR_NS_TIMEOUT_ROW_DEAD consecutive timeouts, we consider the upstream IP dead for KR_NS_TIMEOUT_RETRY_INTERVAL ms */
+/* After KR_NS_TIMEOUT_ROW_DEAD consecutive timeouts
+ * where at least one was over KR_NS_TIMEOUT_MIN_DEAD_TIMEOUT ms,
+ * we consider the upstream IP dead for KR_NS_TIMEOUT_RETRY_INTERVAL ms */
 #define KR_NS_TIMEOUT_ROW_DEAD 4
+#define KR_NS_TIMEOUT_MIN_DEAD_TIMEOUT 800 /* == DEFAULT_TIMEOUT * 2 */
 #define KR_NS_TIMEOUT_RETRY_INTERVAL 1000
 
 /**
diff --git a/lib/selection_forward.c b/lib/selection_forward.c
index baab3abd5..fe33c2373 100644
--- a/lib/selection_forward.c
+++ b/lib/selection_forward.c
@@ -8,6 +8,11 @@
 #define VERBOSE_MSG(qry, ...) kr_log_q((qry), SELECTION, __VA_ARGS__)
 
 #define FORWARDING_TIMEOUT 2000
+/* TODO: well, this is a bit hard; maybe we'd better have a different approach
+ * for estimating DEAD-ness for forwarding.
+ * Even ACKs on connections might be useful here. */
+static_assert(FORWARDING_TIMEOUT >= KR_NS_TIMEOUT_MIN_DEAD_TIMEOUT,
+		"Bad combination of NS selection limits.");
 
 struct forward_local_state {
 	kr_sockaddr_array_t *targets;