From: Robert Edmonds Date: Thu, 13 Nov 2025 08:33:05 +0000 (-0500) Subject: Mesh reply counters (#1374) X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=fceb4e85850fd01232f255f3e799e81f462f762b;p=thirdparty%2Funbound.git Mesh reply counters (#1374) * Statistics counter for number of queries dropped by limit on reply addresses Request list entries can be associated with multiple pending "reply addresses". Basically each request list entry keeps its own list of clients that should receive the response once the recursion is finished. This requires keeping allocations around for each client, and there is a global limit on the number of *additional* reply addresses that can be allocated. (Each new request list entry seems to get its own initial reply address which is not counted against the limit.) This commit adds a statistics counter "num_queries_replyaddr_limit" that counts the number of incoming client queries that have been dropped due to the restriction on allocating additional reply addresses. This allows distinguishing these drops from other kinds of drops. * Statistics counter for number of mesh reply entries Request list entries can be associated with multiple pending "reply addresses". Since there is a limit on the number of additional reply addresses that can be allocated which can cause incoming queries to be dropped if exceeded, it would be nice to be able to track this number. This commit basically exports the mesh_area's internal counter `num_reply_addrs` as "threadX.requestlist.current.replies" / "total.requestlist.current.replies". --- diff --git a/daemon/remote.c b/daemon/remote.c index 0d55619c2..862a43cfd 100644 --- a/daemon/remote.c +++ b/daemon/remote.c @@ -801,6 +801,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s) (unsigned long)s->svr.num_queries_cookie_invalid)) return 0; if(!ssl_printf(ssl, "%s.num.queries_discard_timeout"SQ"%lu\n", nm, (unsigned long)s->svr.num_queries_discard_timeout)) return 0; + if(!ssl_printf(ssl, "%s.num.queries_replyaddr_limit"SQ"%lu\n", nm, + (unsigned long)s->svr.num_queries_replyaddr_limit)) return 0; if(!ssl_printf(ssl, "%s.num.queries_wait_limit"SQ"%lu\n", nm, (unsigned long)s->svr.num_queries_wait_limit)) return 0; if(!ssl_printf(ssl, "%s.num.cachehits"SQ"%lu\n", nm, @@ -845,6 +847,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s) (unsigned long)s->mesh_num_states)) return 0; if(!ssl_printf(ssl, "%s.requestlist.current.user"SQ"%lu\n", nm, (unsigned long)s->mesh_num_reply_states)) return 0; + if(!ssl_printf(ssl, "%s.requestlist.current.replies"SQ"%lu\n", nm, + (unsigned long)s->mesh_num_reply_addrs)) return 0; #ifndef S_SPLINT_S sumwait.tv_sec = s->mesh_replies_sum_wait_sec; sumwait.tv_usec = s->mesh_replies_sum_wait_usec; diff --git a/daemon/stats.c b/daemon/stats.c index 41c4656aa..43a9f7092 100644 --- a/daemon/stats.c +++ b/daemon/stats.c @@ -262,6 +262,7 @@ server_stats_compile(struct worker* worker, struct ub_stats_info* s, int reset) s->svr = worker->stats; s->mesh_num_states = (long long)worker->env.mesh->all.count; s->mesh_num_reply_states = (long long)worker->env.mesh->num_reply_states; + s->mesh_num_reply_addrs = (long long)worker->env.mesh->num_reply_addrs; s->mesh_jostled = (long long)worker->env.mesh->stats_jostled; s->mesh_dropped = (long long)worker->env.mesh->stats_dropped; s->mesh_replies_sent = (long long)worker->env.mesh->replies_sent; @@ -284,6 +285,8 @@ server_stats_compile(struct worker* worker, struct ub_stats_info* s, int reset) NUM_BUCKETS_HIST); s->svr.num_queries_discard_timeout += (long long)worker->env.mesh->num_queries_discard_timeout; + s->svr.num_queries_replyaddr_limit += + (long long)worker->env.mesh->num_queries_replyaddr_limit; s->svr.num_queries_wait_limit += (long long)worker->env.mesh->num_queries_wait_limit; s->svr.num_dns_error_reports += @@ -448,6 +451,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a) total->svr.num_queries_cookie_invalid += a->svr.num_queries_cookie_invalid; total->svr.num_queries_discard_timeout += a->svr.num_queries_discard_timeout; + total->svr.num_queries_replyaddr_limit += + a->svr.num_queries_replyaddr_limit; total->svr.num_queries_wait_limit += a->svr.num_queries_wait_limit; total->svr.num_dns_error_reports += a->svr.num_dns_error_reports; total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache; @@ -519,6 +524,7 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a) total->mesh_num_states += a->mesh_num_states; total->mesh_num_reply_states += a->mesh_num_reply_states; + total->mesh_num_reply_addrs += a->mesh_num_reply_addrs; total->mesh_jostled += a->mesh_jostled; total->mesh_dropped += a->mesh_dropped; total->mesh_replies_sent += a->mesh_replies_sent; diff --git a/doc/unbound-control.8.in b/doc/unbound-control.8.in index 782a98e50..433b37354 100644 --- a/doc/unbound-control.8.in +++ b/doc/unbound-control.8.in @@ -880,6 +880,11 @@ number of queries removed due to discard\-timeout by thread .UNINDENT .INDENT 0.0 .TP +.B threadX.num.queries_replyaddr_limit +number of queries removed due to replyaddr limits by thread +.UNINDENT +.INDENT 0.0 +.TP .B threadX.num.queries_wait_limit number of queries removed due to wait\-limit by thread .UNINDENT @@ -994,6 +999,13 @@ Current size of the request list, only the requests from client queries. .UNINDENT .INDENT 0.0 .TP +.B threadX.requestlist.current.replies +Current count of the number of reply entries waiting on request list +entries. Because a request list entry can send results to multiple reply +addresses, this number may be larger than the size of the request list. +.UNINDENT +.INDENT 0.0 +.TP .B threadX.recursion.time.avg Average time it took to answer queries that needed recursive processing. Note that queries that were answered from the cache are not in this average. @@ -1048,6 +1060,11 @@ summed over threads. .UNINDENT .INDENT 0.0 .TP +.B total.num.queries_replyaddr_limit +summed over threads. +.UNINDENT +.INDENT 0.0 +.TP .B total.num.queries_wait_limit summed over threads. .UNINDENT @@ -1138,6 +1155,16 @@ summed over threads. .UNINDENT .INDENT 0.0 .TP +.B total.requestlist.current.user +summed over threads. +.UNINDENT +.INDENT 0.0 +.TP +.B total.requestlist.current.replies +summed over threads. +.UNINDENT +.INDENT 0.0 +.TP .B total.recursion.time.median averaged over threads. .UNINDENT diff --git a/doc/unbound-control.rst b/doc/unbound-control.rst index 71ff6ee37..630f2e160 100644 --- a/doc/unbound-control.rst +++ b/doc/unbound-control.rst @@ -815,6 +815,10 @@ number of statistic counters: number of queries removed due to discard-timeout by thread +@@UAHL@unbound-control.stats@threadX.num.queries_replyaddr_limit@@ + number of queries removed due to replyaddr limits by thread + + @@UAHL@unbound-control.stats@threadX.num.queries_wait_limit@@ number of queries removed due to wait-limit by thread @@ -910,6 +914,12 @@ number of statistic counters: Current size of the request list, only the requests from client queries. +@@UAHL@unbound-control.stats@threadX.requestlist.current.replies@@ + Current count of the number of reply entries waiting on request list + entries. Because a request list entry can send results to multiple reply + addresses, this number may be larger than the size of the request list. + + @@UAHL@unbound-control.stats@threadX.recursion.time.avg@@ Average time it took to answer queries that needed recursive processing. Note that queries that were answered from the cache are not in this average. @@ -955,6 +965,10 @@ number of statistic counters: summed over threads. +@@UAHL@unbound-control.stats@total.num.queries_replyaddr_limit@@ + summed over threads. + + @@UAHL@unbound-control.stats@total.num.queries_wait_limit@@ summed over threads. @@ -1027,6 +1041,14 @@ number of statistic counters: summed over threads. +@@UAHL@unbound-control.stats@total.requestlist.current.user@@ + summed over threads. + + +@@UAHL@unbound-control.stats@total.requestlist.current.replies@@ + summed over threads. + + @@UAHL@unbound-control.stats@total.recursion.time.median@@ averaged over threads. diff --git a/libunbound/unbound.h b/libunbound/unbound.h index c274f80ab..5a31f98e5 100644 --- a/libunbound/unbound.h +++ b/libunbound/unbound.h @@ -853,6 +853,8 @@ struct ub_server_stats { long long qquic; /** number of queries removed due to discard-timeout */ long long num_queries_discard_timeout; + /** number of queries removed due to replyaddr limit */ + long long num_queries_replyaddr_limit; /** number of queries removed due to wait-limit */ long long num_queries_wait_limit; /** number of dns error reports generated */ @@ -872,6 +874,8 @@ struct ub_stats_info { long long mesh_num_states; /** mesh stats: current number of reply (user) states */ long long mesh_num_reply_states; + /** mesh stats: current number of reply entries */ + long long mesh_num_reply_addrs; /** mesh stats: number of reply states overwritten with a new one */ long long mesh_jostled; /** mesh stats: number of incoming queries dropped */ diff --git a/services/mesh.c b/services/mesh.c index ca622e9c9..4a947766d 100644 --- a/services/mesh.c +++ b/services/mesh.c @@ -231,6 +231,7 @@ mesh_create(struct module_stack* stack, struct module_env* env) mesh->ans_expired = 0; mesh->ans_cachedb = 0; mesh->num_queries_discard_timeout = 0; + mesh->num_queries_replyaddr_limit = 0; mesh->num_queries_wait_limit = 0; mesh->num_dns_error_reports = 0; mesh->max_reply_states = env->cfg->num_queries_per_thread; @@ -474,7 +475,7 @@ void mesh_new_client(struct mesh_area* mesh, struct query_info* qinfo, verbose(VERB_ALGO, "Too many requests queued. " "dropping incoming query."); comm_point_drop_reply(rep); - mesh->stats_dropped++; + mesh->num_queries_replyaddr_limit++; return; } } @@ -2295,6 +2296,7 @@ mesh_stats_clear(struct mesh_area* mesh) memset(&mesh->rpz_action[0], 0, sizeof(size_t)*UB_STATS_RPZ_ACTION_NUM); mesh->ans_nodata = 0; mesh->num_queries_discard_timeout = 0; + mesh->num_queries_replyaddr_limit = 0; mesh->num_queries_wait_limit = 0; mesh->num_dns_error_reports = 0; } diff --git a/services/mesh.h b/services/mesh.h index 53a05b443..d2fac9d3c 100644 --- a/services/mesh.h +++ b/services/mesh.h @@ -141,6 +141,8 @@ struct mesh_area { size_t rpz_action[UB_STATS_RPZ_ACTION_NUM]; /** stats, number of queries removed due to discard-timeout */ size_t num_queries_discard_timeout; + /** stats, number of queries removed due to replyaddr limit */ + size_t num_queries_replyaddr_limit; /** stats, number of queries removed due to wait-limit */ size_t num_queries_wait_limit; /** stats, number of dns error reports generated */ diff --git a/smallapp/unbound-control.c b/smallapp/unbound-control.c index 696750c19..bb1d5237e 100644 --- a/smallapp/unbound-control.c +++ b/smallapp/unbound-control.c @@ -236,6 +236,8 @@ static void pr_stats(const char* nm, struct ub_stats_info* s) s->svr.num_queries_cookie_invalid); PR_UL_NM("num.queries_discard_timeout", s->svr.num_queries_discard_timeout); + PR_UL_NM("num.queries_replyaddr_limit", + s->svr.num_queries_replyaddr_limit); PR_UL_NM("num.queries_wait_limit", s->svr.num_queries_wait_limit); PR_UL_NM("num.cachehits", s->svr.num_queries - s->svr.num_queries_missed_cache); @@ -263,6 +265,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s) PR_UL_NM("requestlist.exceeded", s->mesh_dropped); PR_UL_NM("requestlist.current.all", s->mesh_num_states); PR_UL_NM("requestlist.current.user", s->mesh_num_reply_states); + PR_UL_NM("requestlist.current.replies", s->mesh_num_reply_addrs); #ifndef S_SPLINT_S sumwait.tv_sec = s->mesh_replies_sum_wait_sec; sumwait.tv_usec = s->mesh_replies_sum_wait_usec;