stats: add query max wait time metric

author Vadim Fedorenko <vadfed@meta.com>

Fri, 21 Apr 2023 16:23:21 +0000 (09:23 -0700)

committer Vadim Fedorenko <vadfed@meta.com>

Wed, 26 Apr 2023 10:27:25 +0000 (03:27 -0700)
author Vadim Fedorenko <vadfed@meta.com>
Fri, 21 Apr 2023 16:23:21 +0000 (09:23 -0700)
committer Vadim Fedorenko <vadfed@meta.com>
Wed, 26 Apr 2023 10:27:25 +0000 (03:27 -0700)
diff --git a/daemon/remote.c b/daemon/remote.c

index d8a3a6cafc1e02651436a3229770f87c525d62bc..03daa935ee1809cf7f34cb972ab0a42bb64feda5 100644 (file)
--- a/daemon/remote.c
+++ b/daemon/remote.c
@@ -681,6 +681,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
                 (unsigned long)s->svr.num_queries_prefetch)) return 0;
         if(!ssl_printf(ssl, "%s.num.queries_timed_out"SQ"%lu\n", nm,
                 (unsigned long)s->svr.num_queries_timed_out)) return 0;
+       if(!ssl_printf(ssl, "%s.query.queue_time_us.max"SQ"%lu\n", nm,
+               (unsigned long)s->svr.max_query_time_us)) return 0;
         if(!ssl_printf(ssl, "%s.num.expired"SQ"%lu\n", nm,
                 (unsigned long)s->svr.ans_expired)) return 0;
         if(!ssl_printf(ssl, "%s.num.recursivereplies"SQ"%lu\n", nm,
diff --git a/daemon/stats.c b/daemon/stats.c

index 38f4597b3a9a81b5a590f5524e636124e95efbf3..cef7de8272109631f9c0c1bbbab92efa6a060f77 100644 (file)
--- a/daemon/stats.c
+++ b/daemon/stats.c
@@ -433,6 +433,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
         total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache;
         total->svr.num_queries_prefetch += a->svr.num_queries_prefetch;
         total->svr.num_queries_timed_out += a->svr.num_queries_timed_out;
+       if (total->svr.max_query_time_us < a->svr.max_query_time_us)
+               total->svr.max_query_time_us = a->svr.max_query_time_us;
         total->svr.sum_query_list_size += a->svr.sum_query_list_size;
         total->svr.ans_expired += a->svr.ans_expired;
  #ifdef USE_DNSCRYPT
diff --git a/daemon/worker.c b/daemon/worker.c

index 63b8678e31e7f570fe87cfb013d04c63a4af358c..5c373b79b09161ad2519df1e2d32df4eae51089c 100644 (file)
--- a/daemon/worker.c
+++ b/daemon/worker.c
@@ -1281,6 +1281,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
         int is_expired_answer = 0;
         int is_secure_answer = 0;
         int rpz_passthru = 0;
+       long long wait_queue_time = 0;
         /* We might have to chase a CNAME chain internally, in which case
          * we'll have up to two replies and combine them to build a complete
          * answer.  These variables control this case. */
@@ -1289,6 +1290,7 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
         struct query_info* lookup_qinfo = &qinfo;
         struct query_info qinfo_tmp; /* placeholder for lookup_qinfo */
         struct respip_client_info* cinfo = NULL, cinfo_tmp;
+       struct timeval wait_time;
         memset(&qinfo, 0, sizeof(qinfo));
  
         if((error != NETEVENT_NOERROR && error != NETEVENT_DONE)|| !repinfo) {
@@ -1297,9 +1299,13 @@ worker_handle_request(struct comm_point* c, void* arg, int error,
                 return 0;
         }
  
-       if (worker->env.cfg->sock_queue_timeout && timeval_isset(c->recv_tv)) {
-         c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout;
-               if (timeval_smaller(c->recv_tv, worker->env.now_tv)) {
+       if (worker->env.cfg->sock_queue_timeout && timeval_isset(&c->recv_tv)) {
+               timeval_subtract(&wait_time, worker->env.now_tv, &c->recv_tv);
+               wait_queue_time = wait_time.tv_sec * 1000000 +  wait_time.tv_usec;
+               if (worker->stats.max_query_time_us < wait_queue_time)
+                       worker->stats.max_query_time_us = wait_queue_time;
+               c->recv_tv.tv_sec += worker->env.cfg->sock_queue_timeout;
+               if (timeval_smaller(&c->recv_tv, worker->env.now_tv)) {
                 /* count and drop queries that were sitting in the socket queue too long */
                         worker->stats.num_queries_timed_out++;
                         return 0;
diff --git a/libunbound/unbound.h b/libunbound/unbound.h

index f83a38f8ffd28c3a026b3197c8612de36da50e9b..8a97b16feaee1a06c1c6ba9eb905f0669aae84e2 100644 (file)
--- a/libunbound/unbound.h
+++ b/libunbound/unbound.h
@@ -701,7 +701,8 @@ struct ub_server_stats {
         long long num_queries_prefetch;
         /** number of queries which are too late to process */
         long long num_queries_timed_out;
-
+       /** the longest wait time in the queue */
+       long long max_query_time_us;
         /**
          * Sum of the querylistsize of the worker for
          * every query that missed cache. To calculate average.
diff --git a/smallapp/unbound-control.c b/smallapp/unbound-control.c

index d5f124b49bc003cf15724b659908a0a3824202d8..bbc09659fb049dc7b6cf5afba33cef338e3bb608 100644 (file)
--- a/smallapp/unbound-control.c
+++ b/smallapp/unbound-control.c
@@ -209,6 +209,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
         PR_UL_NM("num.cachemiss", s->svr.num_queries_missed_cache);
         PR_UL_NM("num.prefetch", s->svr.num_queries_prefetch);
         PR_UL_NM("num.queries_timed_out", s->svr.num_queries_timed_out);
+       PR_UL_NM("query.queue_time_us.max", s->svr.max_query_time_us);
         PR_UL_NM("num.expired", s->svr.ans_expired);
         PR_UL_NM("num.recursivereplies", s->mesh_replies_sent);
  #ifdef USE_DNSCRYPT
author	Vadim Fedorenko <vadfed@meta.com>
	Fri, 21 Apr 2023 16:23:21 +0000 (09:23 -0700)
committer	Vadim Fedorenko <vadfed@meta.com>
	Wed, 26 Apr 2023 10:27:25 +0000 (03:27 -0700)
daemon/remote.c		patch \| blob \| blame \| history
daemon/stats.c		patch \| blob \| blame \| history
daemon/worker.c		patch \| blob \| blame \| history
libunbound/unbound.h		patch \| blob \| blame \| history
smallapp/unbound-control.c		patch \| blob \| blame \| history