From fff9f62a1ec008185868a369207b0d56ffb7dcb3 Mon Sep 17 00:00:00 2001 From: Yorgos Thessalonikefs Date: Tue, 31 Dec 2024 16:28:12 +0100 Subject: [PATCH] Serve expired cache update fixes (#1174) - Fixes a regression bug with serve-expired that appeared in 1.22.0 and would not allow the iterator to update the cache with not-yet-validated entries resulting in increased outgoing traffic. - Treat serve_expired_norec_ttl as a backoff timer for failed updates of expired records. - Try to use expired answers instead of SERVFAIL if serve-expired is enabled even without serve-expired-client-timeout. - Add suggestion to refresh the cached norec_ttl and expired_ttl when a response cannot update the usable expired entry. --- daemon/worker.c | 6 +- services/cache/dns.c | 41 +++++++++- services/mesh.c | 6 +- .../serve_expired_client_timeout_servfail.rpl | 48 +++++++++--- ...serve_expired_client_timeout_val_bogus.rpl | 18 +++-- testdata/serve_expired_ttl_reset.rpl | 22 +++--- testdata/serve_expired_val_bogus.rpl | 75 +++++++++++++++++-- 7 files changed, 170 insertions(+), 46 deletions(-) diff --git a/daemon/worker.c b/daemon/worker.c index 413abdb87..3985d1108 100644 --- a/daemon/worker.c +++ b/daemon/worker.c @@ -1845,10 +1845,10 @@ lookup_cache: * its qname must be that used for cache * lookup. */ if((worker->env.cfg->prefetch && - *worker->env.now >= rep->prefetch_ttl) || + rep->prefetch_ttl <= *worker->env.now) || (worker->env.cfg->serve_expired && - *worker->env.now > rep->ttl)) { - + rep->ttl < *worker->env.now && + !(*worker->env.now < rep->serve_expired_norec_ttl))) { time_t leeway = rep->ttl - *worker->env.now; if(rep->ttl < *worker->env.now) leeway = 0; diff --git a/services/cache/dns.c b/services/cache/dns.c index d9536c0e7..351b3568c 100644 --- a/services/cache/dns.c +++ b/services/cache/dns.c @@ -1056,7 +1056,7 @@ dns_cache_lookup(struct module_env* env, int dns_cache_store(struct module_env* env, struct query_info* msgqinf, - struct reply_info* msgrep, int is_referral, time_t leeway, int pside, + struct reply_info* msgrep, int is_referral, time_t leeway, int pside, struct regional* region, uint32_t flags, time_t qstarttime, int is_valrec) { @@ -1066,7 +1066,7 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf, * useful expired record exists. */ struct msgreply_entry* e = msg_cache_lookup(env, msgqinf->qname, msgqinf->qname_len, msgqinf->qtype, - msgqinf->qclass, flags, 0, 0); + msgqinf->qclass, flags, 0, 1); if(e) { struct reply_info* cached = e->entry.data; if(cached->ttl < *env->now @@ -1081,7 +1081,42 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf, && cached->security != sec_status_bogus && (env->need_to_validate && msgrep->security == sec_status_unchecked) - && !is_valrec) { + /* Exceptions to that rule are: + * o recursions that don't need validation but + * need to update the cache for coherence + * (delegation information while iterating, + * DNSKEY and DS lookups from validator) + * o explicit RRSIG queries that are not + * validated. */ + && !is_valrec + && msgqinf->qtype != LDNS_RR_TYPE_RRSIG) { + if((int)FLAGS_GET_RCODE(msgrep->flags) != + LDNS_RCODE_NOERROR && + (int)FLAGS_GET_RCODE(msgrep->flags) != + LDNS_RCODE_NXDOMAIN) { + /* The current response has an + * erroneous rcode. Adjust norec time + * so that additional lookups are not + * performed for some time. */ + verbose(VERB_ALGO, "set " + "serve-expired-norec-ttl for " + "response in cache"); + cached->serve_expired_norec_ttl = + NORR_TTL + *env->now; + if(env->cfg->serve_expired_ttl_reset && + cached->serve_expired_ttl + < *env->now + + env->cfg->serve_expired_ttl) { + /* Reset serve-expired-ttl for + * valid response in cache. */ + verbose(VERB_ALGO, "reset " + "serve-expired-ttl " + "for response in cache"); + cached->serve_expired_ttl = + *env->now + + env->cfg->serve_expired_ttl; + } + } verbose(VERB_ALGO, "a validated expired entry " "could be overwritten, skip caching " "the new message at this stage"); diff --git a/services/mesh.c b/services/mesh.c index a25094d12..2289277ea 100644 --- a/services/mesh.c +++ b/services/mesh.c @@ -1512,8 +1512,10 @@ void mesh_query_done(struct mesh_state* mstate) } if(mstate->s.return_rcode == LDNS_RCODE_SERVFAIL || (rep && FLAGS_GET_RCODE(rep->flags) == LDNS_RCODE_SERVFAIL)) { - /* we are SERVFAILing; check for expired answer here */ - mesh_serve_expired_callback(mstate); + if(mstate->s.env->cfg->serve_expired) { + /* we are SERVFAILing; check for expired answer here */ + mesh_respond_serve_expired(mstate); + } if((mstate->reply_list || mstate->cb_list) && mstate->s.env->cfg->log_servfail && !mstate->s.env->cfg->val_log_squelch) { diff --git a/testdata/serve_expired_client_timeout_servfail.rpl b/testdata/serve_expired_client_timeout_servfail.rpl index 13d03da6c..3c5b35e17 100644 --- a/testdata/serve_expired_client_timeout_servfail.rpl +++ b/testdata/serve_expired_client_timeout_servfail.rpl @@ -9,7 +9,6 @@ server: ede: yes ede-serve-expired: yes - stub-zone: name: "example.com" stub-addr: 1.2.3.4 @@ -25,8 +24,10 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and a SERVFAIL upstream re ; - check that we get the expired cached answer ; - query again (the answer is available on the upstream server now) ; - check that we get the immediate expired answer back instead -; - (the upstream query does happen after the expired reply and updates the cache) -; - query again (the upstream has no answer) +; - query again (the answer is available on the upstream server now) +; - check that we *still* get the immediate expired answer back instead, recursion is blocked for NORR_TTL(5) +; - wait for NORR_TTL(5) to expire +; - query again ; - check that we get the freshly cached answer ; ns.example.com. @@ -73,7 +74,7 @@ RANGE_BEGIN 30 40 RANGE_END ; ns.example.com. -RANGE_BEGIN 50 60 +RANGE_BEGIN 50 100 ADDRESS 1.2.3.4 ; response to A query ENTRY_BEGIN @@ -148,12 +149,8 @@ ENTRY_BEGIN example.com. IN A ENTRY_END -; Allow for upstream query to resolve. -STEP 51 TRAFFIC - ; Check that we got an immediate stale answer because of the previous failure, -; regardless if upstream has the answer already in this range. The query will -; be resolved after the immediate cached answer and will cache the result. +; regardless if upstream has the answer already in this range. STEP 60 CHECK_ANSWER ENTRY_BEGIN MATCH all ttl ede=3 @@ -171,14 +168,41 @@ ENTRY_END ; Query again STEP 70 QUERY ENTRY_BEGIN - REPLY RD + REPLY RD DO SECTION QUESTION example.com. IN A ENTRY_END -; Check that we got the cached updated answer from the previous step since -; there is no upstream in this range. +; Check that we still get the immediate stale answer because of the previous failure, +; regardless if upstream has the answer already in this range. NORR_TTL(5) blocks us from +; recursion. STEP 80 CHECK_ANSWER +ENTRY_BEGIN + MATCH all ttl ede=3 + REPLY QR RD RA DO NOERROR + SECTION QUESTION + example.com. IN A + SECTION ANSWER + example.com. 123 IN A 5.6.7.8 + SECTION AUTHORITY + example.com. 123 IN NS ns.example.com. + SECTION ADDITIONAL + ns.example.com. 123 IN A 1.2.3.4 +ENTRY_END + +; Let NORR_TTL(5) expire +STEP 81 TIME_PASSES ELAPSE 5 + +; Query again +STEP 90 QUERY +ENTRY_BEGIN + REPLY RD + SECTION QUESTION + example.com. IN A +ENTRY_END + +; Check fresh reply +STEP 100 CHECK_ANSWER ENTRY_BEGIN MATCH all ttl REPLY QR RD RA NOERROR diff --git a/testdata/serve_expired_client_timeout_val_bogus.rpl b/testdata/serve_expired_client_timeout_val_bogus.rpl index 4334168cd..47a6545a4 100644 --- a/testdata/serve_expired_client_timeout_val_bogus.rpl +++ b/testdata/serve_expired_client_timeout_val_bogus.rpl @@ -30,13 +30,13 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and bogus answer ; - wait for the record to expire ; - (upstream now has a bogus response) ; - query again for www.example.com. IN A -; - check that we get the expired valid response instead -; - query once more +; - check that we get the expired valid response instead; recursion is blocked for NORR_TTL(5) because of the failure ; - (upstream has the valid response again) +; - query once more ; - check that we get the immediate expired valid response -; - (the prefetch query updates the cache with the valid response) +; - let NORR_TTL(5) expire ; - query one last time -; - check that we get the immediate valid cache response; upstream does not have an answer at this moment +; - check that we get the immediate valid cache response ; The example.com NS and ns.example.com A record are commented out. ; This to make the test succeed. It then keeps the dnssec valid lookup. @@ -199,7 +199,7 @@ RANGE_END ;; ;; ns.example.com. with valid data again ;; -RANGE_BEGIN 40 60 +RANGE_BEGIN 40 70 ADDRESS 1.2.3.4 ; response to query of interest ENTRY_BEGIN @@ -279,7 +279,7 @@ SECTION QUESTION www.example.com. IN A ENTRY_END -; immediate cached answer because upstream is valid again +; immediate cached answer; although upstream is valid again STEP 50 CHECK_ANSWER ENTRY_BEGIN MATCH all ttl ede=3 @@ -297,7 +297,9 @@ SECTION ADDITIONAL ;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854} ENTRY_END -; upstream query is resolved before this query comes in +STEP 51 TIME_PASSES ELAPSE 5 + +; query one last time STEP 60 QUERY ENTRY_BEGIN REPLY RD DO @@ -305,7 +307,7 @@ SECTION QUESTION www.example.com. IN A ENTRY_END -; prefetch query updated the cache, since there is no upstream response in this range +; this is the fresh valid response STEP 70 CHECK_ANSWER ENTRY_BEGIN MATCH all ttl diff --git a/testdata/serve_expired_ttl_reset.rpl b/testdata/serve_expired_ttl_reset.rpl index faedb1cfc..9f215c96b 100644 --- a/testdata/serve_expired_ttl_reset.rpl +++ b/testdata/serve_expired_ttl_reset.rpl @@ -16,11 +16,11 @@ SCENARIO_BEGIN Serve expired ttl with reset on forwarder with a timeout on upstr ; - Wait for it to expire (+ serve-expired-ttl) ; - Send query again ; - Upstream timeouts -; - Error response from iterator SERVFAIL, resets expired-ttl on cache -; - Check we are getting the SERVFAIL response +; - Error response from iterator SERVFAIL, resets expired-ttl on cache and sets norec_ttl blocking recursion +; - Check we are getting the cached response because it was expired-ttl-reset ; - Query again -; - Check we are getting the expired answer -; - Upstream still timeouts +; - Check we are getting the expired answer; prefetching is blocked by norec_ttl +; - If there was prefetching the test would fail with the pending upstream query STEP 1 QUERY ENTRY_BEGIN @@ -54,7 +54,7 @@ STEP 4 TIME_PASSES ELAPSE 12 STEP 5 QUERY ENTRY_BEGIN -REPLY RD +REPLY RD DO SECTION QUESTION www.example.com. IN A ENTRY_END @@ -67,14 +67,16 @@ STEP 8 TIMEOUT STEP 9 TIMEOUT STEP 10 TIMEOUT -; Returns servfail +; Returns ; but error response from iterator resets the expired ttl STEP 11 CHECK_ANSWER ENTRY_BEGIN -MATCH all ttl -REPLY QR RA RD SERVFAIL +MATCH all ttl ede=3 +REPLY QR RA RD DO NOERROR SECTION QUESTION www.example.com. IN A +SECTION ANSWER +www.example.com. 123 IN A 0.0.0.0 ENTRY_END ; Query again @@ -96,8 +98,4 @@ SECTION ANSWER www.example.com. 123 IN A 0.0.0.0 ENTRY_END -; But the pending query times out! -; Only one because RTT reached the limit. -STEP 16 TIMEOUT - SCENARIO_END diff --git a/testdata/serve_expired_val_bogus.rpl b/testdata/serve_expired_val_bogus.rpl index 6c28aa9a6..54b66fe98 100644 --- a/testdata/serve_expired_val_bogus.rpl +++ b/testdata/serve_expired_val_bogus.rpl @@ -31,12 +31,18 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and bogus answer ; - (upstream now has a bogus response) ; - query again for www.example.com. IN A ; - check that we get the immediate expired valid response -; - (prefetch response is bogus and is not cached) +; - (prefetch response is bogus and is not cached; recursion is blocked for NORR_TTL(5) because of the failure) +; - (upstream has a valid response again) ; - query once more -; - check that we still get the immediate expired valid response and not the fresh bogus one -; - (upstream has a valid response again; prefetch will update the cache) +; - check that we still get the immediate expired valid response (prefetch will not trigger because of NORR_TTL(5)) +; - query and check that cache was not updated +; - let NORR_TTL(5) expire +; - query once more +; - check that we still get the immediate expired valid response +; - (prefetch should be allowed to refresh the record at this point) +; - (upstream does not have the answer anymore) ; - query one last time -; - check that we get an immediate valid cache response +; - check that we get the immediate valid cache response ; The example.com NS and ns.example.com A record are commented out. ; This to make the test succeed. It then keeps the dnssec valid lookup. @@ -273,6 +279,7 @@ SECTION ADDITIONAL ;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854} ENTRY_END +; query with response available on the server STEP 40 QUERY ENTRY_BEGIN REPLY RD DO @@ -280,7 +287,8 @@ SECTION QUESTION www.example.com. IN A ENTRY_END -; this is still the immediate cache response because the previous upstream response was bogus +; this is still the immediate expired cache response because the previous upstream response was bogus +; upstream query did not go out because of the previous failure NORR_TTL(5). STEP 50 CHECK_ANSWER ENTRY_BEGIN MATCH all ttl ede=3 @@ -298,6 +306,7 @@ SECTION ADDITIONAL ;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854} ENTRY_END +; query with response available STEP 60 QUERY ENTRY_BEGIN REPLY RD DO @@ -305,9 +314,63 @@ SECTION QUESTION www.example.com. IN A ENTRY_END -; this is the immediate cache response because the previous upstream response was valid +; this is still the immediate expired cache response because resolution is blocked for NORR_TTL(5) STEP 70 CHECK_ANSWER ENTRY_BEGIN +MATCH all ttl ede=3 +REPLY QR RD RA AD DO NOERROR +SECTION QUESTION +www.example.com. IN A +SECTION ANSWER +www.example.com. 123 IN A 10.20.30.40 +www.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFC99iE9K5y2WNgI0gFvBWaTi9wm6AhUAoUqOpDtG5Zct+Qr9F3mSdnbc6V4= ;{id = 2854} +SECTION AUTHORITY +;example.com. 123 IN NS ns.example.com. +;example.com. 123 IN RRSIG NS 3 2 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCN+qHdJxoI/2tNKwsb08pra/G7aAIUAWA5sDdJTbrXA1/3OaesGBAO3sI= ;{id = 2854} +SECTION ADDITIONAL +;ns.example.com. 123 IN A 1.2.3.4 +;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854} +ENTRY_END + +; expire NORR_TTL(5) +STEP 71 TIME_PASSES ELAPSE 5 + +; query again +STEP 80 QUERY +ENTRY_BEGIN +REPLY RD DO +SECTION QUESTION +www.example.com. IN A +ENTRY_END + +; this is still the immediate expired cache response but prefetching will be allowed to update the cache +STEP 90 CHECK_ANSWER +ENTRY_BEGIN +MATCH all ttl ede=3 +REPLY QR RD RA AD DO NOERROR +SECTION QUESTION +www.example.com. IN A +SECTION ANSWER +www.example.com. 123 IN A 10.20.30.40 +www.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFC99iE9K5y2WNgI0gFvBWaTi9wm6AhUAoUqOpDtG5Zct+Qr9F3mSdnbc6V4= ;{id = 2854} +SECTION AUTHORITY +;example.com. 123 IN NS ns.example.com. +;example.com. 123 IN RRSIG NS 3 2 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCN+qHdJxoI/2tNKwsb08pra/G7aAIUAWA5sDdJTbrXA1/3OaesGBAO3sI= ;{id = 2854} +SECTION ADDITIONAL +;ns.example.com. 123 IN A 1.2.3.4 +;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854} +ENTRY_END + +STEP 100 QUERY +ENTRY_BEGIN +REPLY RD DO +SECTION QUESTION +www.example.com. IN A +ENTRY_END + +; this is the immediate cache response because the previous upstream response was valid +STEP 110 CHECK_ANSWER +ENTRY_BEGIN MATCH all ttl REPLY QR RD RA AD DO NOERROR SECTION QUESTION -- 2.47.2